Example #1
    def __init__(self, data_loader, epochs, save_epoch, model_path, numTransform, numRef):
        self.data_loader = data_loader
        self.epochs = epochs
        self.model_path = model_path
        self.save_epoch = save_epoch
        self.numTransform = numTransform
        self.numRef = numRef

        self.G = Generator(numTransform, numRef)
        self.D = Discriminator(numTransform, numRef)
        self.G_optim = optim.SGD(self.G.parameters(), lr=1e-3, momentum=0.9)
        self.D_optim = optim.SGD(self.D.parameters(), lr=1e-3, momentum=0.9)

        # Note: the original fragment references self.gpu_mode without setting it;
        # assume GPU mode follows CUDA availability (PyTorch).
        self.gpu_mode = torch.cuda.is_available()
        if self.gpu_mode:
            self.G.cuda()
            self.D.cuda()
            self.BCE_loss = nn.BCELoss().cuda()
            self.L1_Loss = nn.L1Loss().cuda()
        else:
            self.BCE_loss = nn.BCELoss()
            self.L1_Loss = nn.L1Loss()

        self.save_path = model_path + '/model_%d.weights'
        logdir = model_path + "/tmp"
        logger = LogWriter(logdir, sync_cycle=10000)

        with logger.mode("train"):
            self.log_D_real_loss = logger.scalar("D/real_loss")
            self.log_D_fake_loss = logger.scalar("D/fake_loss")
            self.log_D_total_loss = logger.scalar("D/total_loss")
            self.log_G_D_loss = logger.scalar("G/D_Loss")
            self.log_G_L1_loss = logger.scalar("G/L1_Loss")
            self.log_G_total_loss = logger.scalar("G/total_Loss")

        with logger.mode("test"):
            self.log_test_loss = logger.scalar("test/loss")
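The constructor above only creates the scalar writers; values are pushed to them later with add_record during training. A minimal, self-contained sketch of that pattern (the loss values below are placeholders; only the VisualDL 1.x calls and tags come from the snippet):

from visualdl import LogWriter

logger = LogWriter("./tmp", sync_cycle=10000)
with logger.mode("train"):
    log_D_total_loss = logger.scalar("D/total_loss")
    log_G_total_loss = logger.scalar("G/total_Loss")

for step in range(100):
    d_loss = 1.0 / (step + 1)   # placeholder discriminator loss
    g_loss = 2.0 / (step + 1)   # placeholder generator loss
    log_D_total_loss.add_record(step, d_loss)
    log_G_total_loss.add_record(step, g_loss)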
Example #2
class MyLog():
    '''
    Adapter class for using VisualDL with PaddleHub on AI Studio.
    Usage:
    # Create a LogWriter object
    log_writer = MyLog(mode="role2")
    seq_label_task._tb_writer = log_writer
    '''
    def __init__(self, mode="train", logDir="../log"):
        self.mode = mode
        self.varDic = {}
        self.log_writer = LogWriter(logDir, sync_cycle=10)

    def add_scalar(self, tag, scalar_value, global_step):
        if tag not in self.varDic:
            with self.log_writer.mode(self.mode) as writer:
                self.varDic[tag] = writer.scalar(tag)
        self.varDic[tag].add_record(global_step, scalar_value)
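The adapter exposes the add_scalar(tag, value, step) interface that PaddleHub's _tb_writer expects and lazily creates one VisualDL scalar per tag. A minimal standalone usage sketch (tags and values are made up for illustration):

log_writer = MyLog(mode="train", logDir="./log")
for step in range(5):
    log_writer.add_scalar("loss", 1.0 / (step + 1), step)
    log_writer.add_scalar("accuracy", step / 5.0, step)
# Inside PaddleHub it would be attached as shown in the docstring:
# seq_label_task._tb_writer = log_writer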
Example #3
def main():
    env = os.environ
    FLAGS.dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env
    if FLAGS.dist:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        import random
        local_seed = (99 + trainer_id)
        random.seed(local_seed)
        np.random.seed(local_seed)

    cfg = load_config(FLAGS.config)
    if 'architecture' in cfg:
        main_arch = cfg.architecture
    else:
        raise ValueError("'architecture' not specified in config file.")

    merge_config(FLAGS.opt)

    if 'log_iter' not in cfg:
        cfg.log_iter = 20

    # check whether use_gpu=True is set with a CPU-only PaddlePaddle build
    check_gpu(cfg.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()
    if not FLAGS.dist or trainer_id == 0:
        print_total_cfg(cfg)

    if cfg.use_gpu:
        devices_num = fluid.core.get_cuda_device_count()
    else:
        devices_num = int(os.environ.get('CPU_NUM', 1))

    if 'FLAGS_selected_gpus' in env:
        device_id = int(env['FLAGS_selected_gpus'])
    else:
        device_id = 0
    place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    scheduler = cfg.LearningRate['schedulers'][0]
    if isinstance(scheduler,
                  CosineDecayWithWarmup) and scheduler.max_iters is None:
        scheduler.max_iters = cfg.max_iters
    lr_builder = create('LearningRate')
    optim_builder = create('OptimizerBuilder')

    # build program
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = create(main_arch)
            if FLAGS.fp16:
                assert (getattr(model.backbone, 'norm_type', None)
                        != 'affine_channel'), \
                    '--fp16 currently does not support affine channel, ' \
                    ' please modify backbone settings to use batch norm'

            with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx:
                inputs_def = cfg['TrainReader']['inputs_def']
                feed_vars, train_loader = model.build_inputs(**inputs_def)
                train_fetches = model.train(feed_vars)
                loss = train_fetches['loss']
                if FLAGS.fp16:
                    loss *= ctx.get_loss_scale_var()
                lr = lr_builder()
                optimizer = optim_builder(lr)
                optimizer.minimize(loss)
                if FLAGS.fp16:
                    loss /= ctx.get_loss_scale_var()

    # parse train fetches
    train_keys, train_values, _ = parse_fetches(train_fetches)
    train_values.append(lr)

    if FLAGS.eval:
        eval_prog = fluid.Program()
        with fluid.program_guard(eval_prog, startup_prog):
            with fluid.unique_name.guard():
                model = create(main_arch)
                inputs_def = cfg['EvalReader']['inputs_def']
                feed_vars, eval_loader = model.build_inputs(**inputs_def)
                fetches = model.eval(feed_vars)
        eval_prog = eval_prog.clone(True)

        eval_reader = create_reader(cfg.EvalReader)
        eval_loader.set_sample_list_generator(eval_reader, place)

        # parse eval fetches
        extra_keys = []
        if cfg.metric == 'COCO':
            extra_keys = ['im_info', 'im_id', 'im_shape']
        if cfg.metric == 'VOC':
            extra_keys = ['gt_bbox', 'gt_class', 'is_difficult']
        if cfg.metric == 'WIDERFACE':
            extra_keys = ['im_id', 'im_shape', 'gt_bbox']
        eval_keys, eval_values, eval_cls = parse_fetches(
            fetches, eval_prog, extra_keys)

    # compile program for multi-devices
    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_optimizer_ops = False
    # only enable sync_bn with multiple GPU devices
    sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn'
    build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \
        and cfg.use_gpu

    exec_strategy = fluid.ExecutionStrategy()
    # iteration number when CompiledProgram tries to drop local execution scopes.
    # Set it to be 1 to save memory usages, so that unused variables in
    # local execution scopes can be deleted after each iteration.
    exec_strategy.num_iteration_per_drop_scope = 1
    if FLAGS.dist:
        dist_utils.prepare_for_multi_process(exe, build_strategy, startup_prog,
                                             train_prog)
        exec_strategy.num_threads = 1

    exe.run(startup_prog)
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    if FLAGS.eval:
        compiled_eval_prog = fluid.compiler.CompiledProgram(eval_prog)

    fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel'

    ignore_params = cfg.finetune_exclude_pretrained_params \
                 if 'finetune_exclude_pretrained_params' in cfg else []

    start_iter = 0
    if FLAGS.resume_checkpoint:
        checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint)
        start_iter = checkpoint.global_step()
    elif cfg.pretrain_weights and fuse_bn and not ignore_params:
        checkpoint.load_and_fusebn(exe, train_prog, cfg.pretrain_weights)
    elif cfg.pretrain_weights:
        checkpoint.load_params(exe,
                               train_prog,
                               cfg.pretrain_weights,
                               ignore_params=ignore_params)

    train_reader = create_reader(cfg.TrainReader,
                                 (cfg.max_iters - start_iter) * devices_num,
                                 cfg)
    train_loader.set_sample_list_generator(train_reader, place)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False
    if hasattr(model, 'is_bbox_normalized') and \
            callable(model.is_bbox_normalized):
        is_bbox_normalized = model.is_bbox_normalized()

    # if map_type is not set, use the default 11point (only used in VOC eval)
    map_type = cfg.map_type if 'map_type' in cfg else '11point'

    train_stats = TrainingStats(cfg.log_smooth_window, train_keys)
    train_loader.start()
    start_time = time.time()
    end_time = time.time()

    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)
    time_stat = deque(maxlen=cfg.log_smooth_window)
    best_box_ap_list = [0.0, 0]  #[map, iter]

    # use tb-paddle to log data
    if FLAGS.use_tb:
        from tb_paddle import SummaryWriter
        tb_writer = SummaryWriter(FLAGS.tb_log_dir)
        tb_loss_step = 0
        tb_mAP_step = 0

    if FLAGS.use_vdl:
        from visualdl import LogWriter
        vdl_writer = LogWriter(FLAGS.vdl_log_dir, sync_cycle=5)

        with vdl_writer.mode("train"):
            scalars = [
                vdl_writer.scalar(loss_name) for loss_name in train_keys
            ]
            mAP_scalar = vdl_writer.scalar("mAP")
        vdl_loss_step = 0
        vdl_mAP_step = 0

    for it in range(start_iter, cfg.max_iters):
        start_time = end_time
        end_time = time.time()
        time_stat.append(end_time - start_time)
        time_cost = np.mean(time_stat)
        eta_sec = (cfg.max_iters - it) * time_cost
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        outs = exe.run(compiled_train_prog, fetch_list=train_values)
        stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])}

        # use tb-paddle to log loss
        if FLAGS.use_tb:
            if it % cfg.log_iter == 0:
                for loss_name, loss_value in stats.items():
                    tb_writer.add_scalar(loss_name, loss_value, tb_loss_step)
                tb_loss_step += 1

        if FLAGS.use_vdl:
            if it % cfg.log_iter == 0:
                for loss_name, scalar in zip(train_keys, scalars):
                    loss_value = stats[loss_name]
                    scalar.add_record(vdl_loss_step, loss_value)
                vdl_loss_step += 1

        train_stats.update(stats)
        logs = train_stats.log()
        if it % cfg.log_iter == 0 and (not FLAGS.dist or trainer_id == 0):
            strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format(
                it, np.mean(outs[-1]), logs, time_cost, eta)
            logger.info(strs)


        if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1) \
           and (not FLAGS.dist or trainer_id == 0):
            save_name = str(it) if it != cfg.max_iters - 1 else "model_final"
            checkpoint.save(exe, train_prog, os.path.join(save_dir, save_name))

            if FLAGS.eval:
                # evaluation
                results = eval_run(exe, compiled_eval_prog, eval_loader,
                                   eval_keys, eval_values, eval_cls)
                resolution = None
                if 'mask' in results[0]:
                    resolution = model.mask_head.resolution
                box_ap_stats = eval_results(results, cfg.metric,
                                            cfg.num_classes, resolution,
                                            is_bbox_normalized,
                                            FLAGS.output_eval, map_type,
                                            cfg['EvalReader']['dataset'])

                # use tb_paddle to log mAP
                if FLAGS.use_tb:
                    tb_writer.add_scalar("mAP", box_ap_stats[0], tb_mAP_step)
                    tb_mAP_step += 1

                if FLAGS.use_vdl:
                    mAP_scalar.add_record(vdl_mAP_step, box_ap_stats[0])
                    vdl_mAP_step += 1

                if box_ap_stats[0] > best_box_ap_list[0]:
                    best_box_ap_list[0] = box_ap_stats[0]
                    best_box_ap_list[1] = it
                    checkpoint.save(exe, train_prog,
                                    os.path.join(save_dir, "best_model"))
                logger.info("Best test box ap: {}, in iter: {}".format(
                    best_box_ap_list[0], best_box_ap_list[1]))

    train_loader.reset()
Example #4
    def train_loop(self,
                   num_epochs,
                   train_reader,
                   train_batch_size,
                   eval_reader=None,
                   save_interval_epochs=1,
                   log_interval_steps=10,
                   save_dir='output',
                   use_vdl=False):
        if not osp.isdir(save_dir):
            if osp.exists(save_dir):
                os.remove(save_dir)
            os.makedirs(save_dir)
        if use_vdl:
            from visualdl import LogWriter
            vdl_logdir = osp.join(save_dir, 'vdl_log')
        # Add the arrange operation to the transforms
        self.arrange_transforms(transforms=train_reader.transforms,
                                mode='train')
        # Build the train_data_loader
        self.build_train_data_loader(reader=train_reader,
                                     batch_size=train_batch_size)

        if eval_reader is not None:
            self.eval_transforms = eval_reader.transforms
            self.test_transforms = copy.deepcopy(eval_reader.transforms)

        # Get the learning rate, which changes during training
        lr = self.optimizer._learning_rate
        if isinstance(lr, fluid.framework.Variable):
            self.train_outputs['lr'] = lr

        # Run training on multiple GPUs
        if self.parallel_train_prog is None:
            build_strategy = fluid.compiler.BuildStrategy()
            build_strategy.fuse_all_optimizer_ops = False
            if __init__.env_info['place'] != 'cpu' and len(self.places) > 1:
                build_strategy.sync_batch_norm = self.sync_bn
            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.num_iteration_per_drop_scope = 1
            self.parallel_train_prog = fluid.CompiledProgram(
                self.train_prog).with_data_parallel(
                    loss_name=self.train_outputs['loss'].name,
                    build_strategy=build_strategy,
                    exec_strategy=exec_strategy)

        total_num_steps = math.floor(train_reader.num_samples /
                                     train_batch_size)
        num_steps = 0
        time_stat = list()

        if use_vdl:
            # VisualDL component
            log_writer = LogWriter(vdl_logdir, sync_cycle=20)
            train_step_component = OrderedDict()
            eval_component = OrderedDict()

        best_accuracy_key = ""
        best_accuracy = -1.0
        best_model_epoch = 1
        for i in range(num_epochs):
            records = list()
            step_start_time = time.time()
            for step, data in enumerate(self.train_data_loader()):
                outputs = self.exe.run(self.parallel_train_prog,
                                       feed=data,
                                       fetch_list=list(
                                           self.train_outputs.values()))
                outputs_avg = np.mean(np.array(outputs), axis=1)
                records.append(outputs_avg)

                # Estimate the remaining training time
                current_time = time.time()
                step_cost_time = current_time - step_start_time
                step_start_time = current_time
                if len(time_stat) < 20:
                    time_stat.append(step_cost_time)
                else:
                    time_stat[num_steps % 20] = step_cost_time
                eta = ((num_epochs - i) * total_num_steps - step -
                       1) * np.mean(time_stat)
                eta_h = math.floor(eta / 3600)
                eta_m = math.floor((eta - eta_h * 3600) / 60)
                eta_s = int(eta - eta_h * 3600 - eta_m * 60)
                eta_str = "{}:{}:{}".format(eta_h, eta_m, eta_s)

                # Print loss information every log_interval_steps
                num_steps += 1
                if num_steps % log_interval_steps == 0:
                    step_metrics = OrderedDict(
                        zip(list(self.train_outputs.keys()), outputs_avg))

                    if use_vdl:
                        for k, v in step_metrics.items():
                            if k not in train_step_component.keys():
                                with log_writer.mode('Each_Step_while_Training'
                                                     ) as step_logger:
                                    train_step_component[
                                        k] = step_logger.scalar(
                                            'Training: {}'.format(k))
                            train_step_component[k].add_record(num_steps, v)

                    logging.info(
                        "[TRAIN] Epoch={}/{}, Step={}/{}, {}, eta={}".format(
                            i + 1, num_epochs, step + 1, total_num_steps,
                            dict2str(step_metrics), eta_str))
            train_metrics = OrderedDict(
                zip(list(self.train_outputs.keys()), np.mean(records, axis=0)))
            logging.info('[TRAIN] Epoch {} finished, {} .'.format(
                i + 1, dict2str(train_metrics)))

            # Every save_interval_epochs, evaluate on the validation set and save the model
            if (i + 1) % save_interval_epochs == 0 or i == num_epochs - 1:
                current_save_dir = osp.join(save_dir, "epoch_{}".format(i + 1))
                if not osp.isdir(current_save_dir):
                    os.makedirs(current_save_dir)
                if eval_reader is not None:
                    # Detection currently supports single-card evaluation only; the evaluation batch size is the training batch size divided by the number of GPUs.
                    eval_batch_size = train_batch_size
                    self.eval_metrics, self.eval_details = self.evaluate(
                        eval_reader=eval_reader,
                        batch_size=eval_batch_size,
                        verbose=True,
                        epoch_id=i + 1,
                        return_details=True)
                    logging.info('[EVAL] Finished, Epoch={}, {} .'.format(
                        i + 1, dict2str(self.eval_metrics)))
                    # Save the best model
                    best_accuracy_key = list(self.eval_metrics.keys())[0]
                    current_accuracy = self.eval_metrics[best_accuracy_key]
                    if current_accuracy > best_accuracy:
                        best_accuracy = current_accuracy
                        best_model_epoch = i + 1
                        best_model_dir = osp.join(save_dir, "best_model")
                        self.save_model(save_dir=best_model_dir)
                    if use_vdl:
                        for k, v in self.eval_metrics.items():
                            if isinstance(v, list):
                                continue
                            if isinstance(v, np.ndarray):
                                if v.size > 1:
                                    continue
                            if k not in eval_component:
                                with log_writer.mode('Each_Epoch_on_Eval_Data'
                                                     ) as eval_logger:
                                    eval_component[k] = eval_logger.scalar(
                                        'Evaluation: {}'.format(k))
                            eval_component[k].add_record(i + 1, v)
                self.save_model(save_dir=current_save_dir)
                logging.info(
                    'The best model evaluated on eval_reader so far is epoch_{}, {}={}'
                    .format(best_model_epoch, best_accuracy_key,
                            best_accuracy))
Example #5
import mxnet as mx  # needed for mx.test_utils below (import not shown in the original excerpt)
from visualdl import LogWriter

# Download MNIST data
mnist = mx.test_utils.get_mnist()
batch_size = 100

# Provide a folder to store data for log, model, image, etc. VisualDL's visualization will be
# based on this folder.
logdir = "./tmp"

# Initialize a logger instance. 'sync_cycle' flushes the in-memory records to disk
# every 10 write operations.
logger = LogWriter(logdir, sync_cycle=10)

# mark the components with 'train' label.
with logger.mode("train"):
    # scalar0 is used to record scalar metrics while MXNet is training. We will record accuracy.
    # In the visualization, we can see the accuracy is increasing as more training steps happen.
    scalar0 = logger.scalar("scalars/scalar0")
    image0 = logger.image("images/image0", 1)
    histogram0 = logger.histogram("histogram/histogram0", num_buckets=100)

# Record training steps
cnt_step = 0


# MXNet provides many callback interfaces. Here we define our own callback method; it is called
# after every batch.
# https://mxnet.incubator.apache.org/api/python/callback/callback.html
def add_scalar():
    def _callback(param):
        # The original excerpt is truncated here; minimal completion (assumption):
        # record the training accuracy reported by MXNet's eval_metric at every step.
        global cnt_step
        for name, value in param.eval_metric.get_name_value():
            if name == 'accuracy':
                scalar0.add_record(cnt_step, value)
        cnt_step += 1

    return _callback
Example #6
def train():
    img = fluid.layers.data(name="img", shape=[1, 28, 28], dtype="float32")
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    avg_cost, acc = lenet_5(img, label)

    # get the mnist dataset
    train_reader = paddle.batch(paddle.dataset.mnist.train(), batch_size=64)

    # define the optimizer
    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    optimizer.minimize(avg_cost)

    # running on cpu
    place = fluid.CPUPlace()
    feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
    exe = fluid.Executor(place)

    log_writter = LogWriter("./vdl_log", sync_cycle=10)
    with log_writter.mode("train") as logger:
        scalar_loss = logger.scalar(tag="loss")
        scalar_accuracy = logger.scalar(tag="accuracy")
        num_samples = 10
        image_input = logger.image(tag="input", num_samples=num_samples)
        histogram = logger.histogram(tag="histogram", num_buckets=50)

    # init all param
    exe.run(fluid.default_startup_program())
    step = 0
    sample_num = 0
    epochs = 5
    param_name = fluid.default_startup_program().global_block().all_parameters(
    )[0].name

    # start to train
    for i in range(epochs):
        for batch in train_reader():
            cost, accuracy, input, param = exe.run(
                feed=feeder.feed(batch),
                fetch_list=[avg_cost.name, acc.name, img.name, param_name])
            step += 1

            # record the loss and accuracy
            scalar_loss.add_record(step, cost)
            scalar_accuracy.add_record(step, accuracy)

            if sample_num % num_samples == 0:
                image_input.start_sampling()

            idx = image_input.is_sample_taken()

            if idx != -1:
                # the first image in the batch data
                image_data = input[0]
                # the image shape recorded in VDL is H * W * C
                image_data = image_data.reshape([28, 28, 1])
                image_input.set_sample(idx, image_data.shape,
                                       100 * image_data.flatten())
                sample_num += 1
                if sample_num % num_samples == 0:
                    image_input.finish_sampling()
                    sample_num = 0

            # record the parameter trend
            histogram.add_record(step, param.flatten())
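The image component above follows a sampling protocol: start_sampling() opens a sampling window, is_sample_taken() returns a reservoir slot index (or -1 if this step is skipped), set_sample() stores one H*W*C image, and finish_sampling() closes the window. A standalone sketch of the same protocol with synthetic data (the directory name and window size are arbitrary; the calls mirror the snippet above):

import numpy as np
from visualdl import LogWriter

log_writer = LogWriter("./vdl_log_image_demo", sync_cycle=10)
with log_writer.mode("train") as logger:
    image_input = logger.image(tag="demo_input", num_samples=4)

for step in range(16):
    if step % 4 == 0:
        image_input.start_sampling()      # open a sampling window every 4 steps
    idx = image_input.is_sample_taken()   # -1 means this step was not sampled
    if idx != -1:
        image_data = np.random.random([28, 28, 1]) * 255  # synthetic H*W*C image
        image_input.set_sample(idx, image_data.shape, image_data.flatten())
    if step % 4 == 3:
        image_input.finish_sampling()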
Example #7
import paddle
import paddle.dataset.cifar as cifar
import paddle.fluid as fluid
import mobilenet_v2
from visualdl import LogWriter

# Create the VisualDL log writer.
# It records the data of each training run and stores it under the log/ directory.
# Create the writer
log_writer = LogWriter(dir='log/', sync_cycle=10)

# Create the scalar writers for training and for testing
with log_writer.mode('train') as writer:
    train_cost_writer = writer.scalar('cost')
    train_acc_writer = writer.scalar('accuracy')
    histogram = writer.histogram('histogram', num_buckets=50)

with log_writer.mode('test') as writer:
    test_cost_writer = writer.scalar('cost')
    test_acc_writer = writer.scalar('accuracy')

# Define the input layers, build the MobileNet V2 classifier,
# clone the inference program, and define the optimizer.
# Define the input layers
image = fluid.layers.data(name='image', shape=[3, 32, 32], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')

# Build the classifier
model = mobilenet_v2.net(image, 10)

# Get the loss and accuracy functions
class SolverWrapper:
    def __init__(self, solver_prototxt, log_dir, pretrained_model=None):
        self.solver = caffe.SGDSolver(solver_prototxt)
        if pretrained_model is not None:
            print('Loading pretrained model weights from {:s}'.format(pretrained_model))
            self.solver.net.copy_from(pretrained_model)
        
        self.solver_param = caffe_pb2.SolverParameter()
        with open(solver_prototxt, 'rt') as f:
            pb2.text_format.Merge(f.read(), self.solver_param)
        self.cur_epoch = 0
        self.test_interval = 500  # used instead of self.solver_param.test_interval
        #self.test_interval = 2000  # used instead of self.solver_param.test_interval
        self.logw = LogWriter(log_dir, sync_cycle=100)
        with self.logw.mode('train') as logger:
            self.sc_train_acc = logger.scalar("Accuracy")
            self.sc_train_lr = logger.scalar("learning_rate")
        with self.logw.mode('val') as logger:
            self.sc_val_acc = logger.scalar("Accuracy")
            self.sc_val_lr = logger.scalar("learning_rate")
        

    def train_model(self):
        """执行训练的整个流程,穿插了validation"""
        cur_iter = 0
        test_batch_size, num_classes = self.solver.test_nets[0].blobs['prob'].shape
        num_test_images_tot = test_batch_size * self.solver_param.test_iter[0]
        lr_policy = self.solver_param.lr_policy
        memo_t = 25   # 2 * 25(each epoch is 25)
        while cur_iter < self.solver_param.max_iter:
            #self.solver.step(self.test_interval)
            for i in range(self.test_interval):
                self.solver.step(1)
                cur_iter += 1

                #loss = self.solver.net.blobs['loss'].data
                if (cur_iter==1 or cur_iter % memo_t==0):
                    acc = float(self.solver.net.blobs['accuracy'].data)
                    step = cur_iter
                    lr = self.get_lr(lr_policy, cur_iter)
                    #self.sc_train_loss.add_record(step, loss)
                    self.sc_train_acc.add_record(step, acc)
                    self.sc_train_lr.add_record(step, lr)
                    self.eval_on_val(num_classes, num_test_images_tot, test_batch_size)
            #self.eval_on_val(num_classes, num_test_images_tot, test_batch_size)
        
    def eval_on_val(self, num_classes, num_test_images_tot, test_batch_size):
        """在整个验证集上执行inference和evaluation"""
        self.solver.test_nets[0].share_with(self.solver.net)
        self.cur_epoch += 1
        scores = np.zeros((num_classes, num_test_images_tot), dtype=np.float32)
        gt_labels = np.zeros((1, num_test_images_tot), dtype=np.float32).squeeze()
        for t in range(self.solver_param.test_iter[0]):
            output = self.solver.test_nets[0].forward()
            probs = output['prob']
            labels = self.solver.test_nets[0].blobs['label'].data

            gt_labels[t*test_batch_size:(t+1)*test_batch_size] = labels.T.astype(np.float32)
            scores[:,t*test_batch_size:(t+1)*test_batch_size] = probs.T
        # TODO: handle the case where the last batch has fewer samples than num_test_images_per_batch
        
        ap, acc = perfeval.cls_eval(scores, gt_labels)
        print('====================================================================\n')
        print('\tDo validation after the {:d}-th training epoch\n'.format(self.cur_epoch))
        print('>>>>', end='\t')  # marker to make it easy to parse these numbers out of the logs
        for i in range(num_classes):
            print('AP[{:d}]={:.4f}'.format(i, ap[i]), end=', ')
        mAP = np.average(ap)
        print('mAP={:.4f}, Accuracy={:.4f}'.format(mAP, acc))
        print('\n====================================================================\n')
        step = self.solver.iter
        lr_policy = self.solver_param.lr_policy
        lr = self.get_lr(lr_policy, step)
        self.sc_val_acc.add_record(step, acc)
        self.sc_val_lr.add_record(step, lr)


    def get_lr(self, lr_policy, cur_iter):
        if lr_policy=="fixed":
            rate = self.solver_param.base_lr
        elif lr_policy=="step":
            cur_step = cur_iter / self.solver_param.stepsize
            rate = self.solver_param.base_lr * math.pow(self.solver_param.gamma, cur_step)
        elif lr_policy=="exp":
            rate = self.solver_param.base_lr * math.pow(self.solver_param.gamma, cur_iter)
        elif lr_policy=="triangular":
            cycle = cur_iter / (2*self.solver_param.stepsize)
            x = float(cur_iter - (2*cycle+1)*self.solver_param.stepsize)
            x = x / self.solver_param.stepsize
            rate = self.solver_param.base_lr + (self.solver_param.max_lr  - self.solver_param.base_lr)*max(0, 1-abs(x))
        return rate
Example #9
class StorageTest(unittest.TestCase):
    def setUp(self):
        self.dir = "./tmp/storage_test"
        self.writer = LogWriter(self.dir, sync_cycle=1).as_mode("train")

    def test_scalar(self):
        print('test write')
        scalar = self.writer.scalar("model/scalar/min")
        # scalar.set_caption("model/scalar/min")
        for i in range(10):
            scalar.add_record(i, float(i))

        print('test read')
        self.reader = LogReader(self.dir)
        with self.reader.mode("train") as reader:
            scalar = reader.scalar("model/scalar/min")
            self.assertEqual(scalar.caption(), "train")
            records = scalar.records()
            ids = scalar.ids()
            self.assertTrue(
                np.equal(records, [float(i) for i in range(10 - 1)]).all())
            self.assertTrue(np.equal(ids, [float(i) for i in range(10)]).all())
            print('records', records)
            print('ids', ids)

    def test_image(self):
        tag = "layer1/layer2/image0"
        image_writer = self.writer.image(tag, 10, 1)
        num_passes = 10
        num_samples = 100
        shape = [10, 10, 3]

        for pass_ in range(num_passes):
            image_writer.start_sampling()
            for ins in range(num_samples):
                data = np.random.random(shape) * 256
                data = np.ndarray.flatten(data)
                image_writer.add_sample(shape, list(data))
            image_writer.finish_sampling()

        self.reader = LogReader(self.dir)
        with self.reader.mode("train") as reader:
            image_reader = reader.image(tag)
            self.assertEqual(image_reader.caption(), tag)
            self.assertEqual(image_reader.num_records(), num_passes)

            image_record = image_reader.record(0, 1)
            self.assertTrue(np.equal(image_record.shape(), shape).all())
            data = image_record.data()
            self.assertEqual(len(data), np.prod(shape))

            image_tags = reader.tags("image")
            self.assertTrue(image_tags)
            self.assertEqual(len(image_tags), 1)

    def test_check_image(self):
        '''
        check whether the storage will keep image data consistent
        '''
        print('check image')
        tag = "layer1/check/image1"
        image_writer = self.writer.image(tag, 10)

        image = Image.open("./dog.jpg")
        shape = [image.size[1], image.size[0], 3]
        origin_data = np.array(image.getdata()).flatten()

        self.reader = LogReader(self.dir)
        with self.reader.mode("train") as reader:

            image_writer.start_sampling()
            image_writer.add_sample(shape, list(origin_data))
            image_writer.finish_sampling()

            # read and check whether the original image will be displayed
            image_reader = reader.image(tag)
            image_record = image_reader.record(0, 0)
            data = image_record.data()
            shape = image_record.shape()

            PIL_image_shape = (shape[0] * shape[1], shape[2])
            data = np.array(data, dtype='uint8').reshape(PIL_image_shape)
            print('origin', origin_data.flatten())
            print('data', data.flatten())
            image = Image.fromarray(data.reshape(shape))
            # manually checked the image and found nothing wrong with the image storage.
            # image.show()

    def test_with_syntax(self):
        with self.writer.mode("train") as writer:
            scalar = writer.scalar("model/scalar/average")
            for i in range(10):
                scalar.add_record(i, float(i))

        self.reader = LogReader(self.dir)
        with self.reader.mode("train") as reader:
            scalar = reader.scalar("model/scalar/average")
            self.assertEqual(scalar.caption(), "train")

    def test_modes(self):
        store = LogWriter(self.dir, sync_cycle=1)

        scalars = []

        for i in range(10):
            with store.mode("mode-%d" % i) as writer:
                scalar = writer.scalar("add/scalar0")
                scalars.append(scalar)

        for scalar in scalars[:-1]:
            for i in range(10):
                scalar.add_record(i, float(i))
Example #10
def train():
    log_writter = LogWriter('./vdl_log', sync_cycle=10)

    with log_writter.mode("train") as logger:
        log_g_loss = logger.scalar(tag="g_loss")
        log_d_loss = logger.scalar(tag="d_loss")

    place = fluid.CUDAPlace(1)
    with fluid.dygraph.guard(place):

        random_vector_data = np.random.standard_normal(
            (num_examples_to_generate, noise_dim)).astype('float32')
        random_vector_for_generation = to_variable(random_vector_data)

        mnist_dcgan = dcgan('mnist_dcgan')

        discriminator_optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
        generator_optimizer = fluid.optimizer.Adam(learning_rate=1e-4)

        train_data = paddle.dataset.mnist.train()

        for epoch in range(num_epochs):

            train_reader = paddle.batch(paddle.reader.shuffle(
                train_data, buf_size=buffer_size),
                                        batch_size=batch_size,
                                        drop_last=True)

            print("Epoch id: ", epoch)

            total_loss_gen = []
            total_loss_disc = []

            for batch_id, data in enumerate(train_reader()):

                noise_data = np.random.standard_normal(
                    (batch_size, noise_dim)).astype('float32')
                noise = to_variable(noise_data)

                img_data = np.array([x[0].reshape(1, 28, 28)
                                     for x in data]).astype('float32')
                img = to_variable(img_data)

                gen_loss, generated_images = mnist_dcgan(
                    noise, img, None, True)
                gen_loss = fluid.layers.reduce_mean(gen_loss)

                gen_loss.backward()
                vars_G = []
                for parm in mnist_dcgan.parameters():
                    if parm.name[:31] == 'mnist_dcgan/dcgan_0/generator_0':
                        vars_G.append(parm)
                generator_optimizer.minimize(gen_loss, parameter_list=vars_G)
                mnist_dcgan.clear_gradients()

                disc_loss = mnist_dcgan(noise, img, generated_images, False)
                disc_loss = fluid.layers.reduce_mean(disc_loss)

                disc_loss.backward()
                vars_D = []
                for parm in mnist_dcgan.parameters():
                    if parm.name[:35] == 'mnist_dcgan/dcgan_0/discriminator_0':
                        vars_D.append(parm)
                discriminator_optimizer.minimize(disc_loss,
                                                 parameter_list=vars_D)
                mnist_dcgan.clear_gradients()

                total_loss_gen.append(gen_loss.numpy()[0])
                total_loss_disc.append(disc_loss.numpy()[0])

            if epoch % 10 == 0:
                generate_and_save_images(epoch, mnist_dcgan,
                                         random_vector_for_generation)

            print("Generator loss: ",
                  np.mean(np.array(total_loss_gen).astype('float32')))
            print("Discriminator loss: ",
                  np.mean(np.array(total_loss_disc).astype('float32')))

            log_g_loss.add_record(
                epoch, np.mean(np.array(total_loss_gen).astype('float32')))
            log_d_loss.add_record(
                epoch, np.mean(np.array(total_loss_disc).astype('float32')))
Example #11
import random
from visualdl import LogWriter
import ca

logdir = './temp'
logger = LogWriter(logdir, sync_cycle=10)
with logger.mode('train'):
    scalar0 = logger.scalar('scalar0')
for step in range(0, 1000):
    scalar0.add_record(step, random.random())
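Once the scalar has been written, the same log directory can be read back programmatically with LogReader (the same API exercised in the unit tests of Example #9 above), or served with the VisualDL web UI pointed at ./temp. A minimal read-back sketch:

from visualdl import LogReader

reader = LogReader('./temp')
with reader.mode('train') as r:
    scalar_reader = r.scalar('scalar0')
    print(scalar_reader.ids()[:5])      # the first few recorded steps
    print(scalar_reader.records()[:5])  # the corresponding values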
Example #12
def train():
    img = fluid.layers.data(name="img", shape=[1, 28, 28], dtype="float32")
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    avg_cost, acc, prediction = lenet_5(img, label)

    # get the mnist dataset
    train_reader = paddle.batch(paddle.dataset.mnist.train(), batch_size=64)
    test_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=64)

    test_program = fluid.default_main_program().clone(for_test=True)
    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    optimizer.minimize(avg_cost)

    # running on cpu
    place = fluid.CPUPlace()
    #place = fluid.CUDAPlace(0)
    feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
    exe = fluid.Executor(place)

    # init all param
    exe.run(fluid.default_startup_program())
    step = 0
    sample_num = 0
    epochs = 6

    log_writter = LogWriter("./vdl_log", sync_cycle=100000)
    with log_writter.mode("train") as logger:
        trn_scalar_loss = logger.scalar("loss")
        trn_scalar_acc = logger.scalar("acc")
    with log_writter.mode('test') as logger:
        tst_scalar_loss = logger.scalar("loss")
        tst_scalar_acc = logger.scalar("acc")

    # start to train
    off = 0
    for i in range(epochs):
        train_acc, train_cost = [], []
        for step, batch in enumerate(train_reader()):
            res_cost, res_acc = exe.run(fluid.default_main_program(),
                                        feed=feeder.feed(batch),
                                        fetch_list=[avg_cost.name, acc.name])
            train_cost.append(res_cost)
            train_acc.append(res_acc)

            if (step % 50 == 0 and step != 0) or (step == 0 and i == 0):
                # record the loss and accuracy
                st = step + off
                mloss = np.mean(np.array(train_cost))
                macc = np.mean(np.array(train_acc))
                trn_scalar_loss.add_record(st, mloss)
                trn_scalar_acc.add_record(st, macc)
                train_acc, train_cost = [], []
                print("Epoc:{}, Iter:{}, loss:{}, acc{}".format(
                    i, step, mloss, macc))

                test_acc, test_cost = [], []
                for data in test_reader():
                    res_cost, res_acc = exe.run(
                        test_program,
                        feed=feeder.feed(data),
                        fetch_list=[avg_cost.name, acc.name])
                    test_cost.append(res_cost)
                    test_acc.append(res_acc)
                mloss = np.mean(np.array(test_cost))
                macc = np.mean(np.array(test_acc))
                tst_scalar_loss.add_record(st, mloss)
                tst_scalar_acc.add_record(st, macc)
                test_acc, test_cost = [], []
                print("Test Epoc:{}, loss:{}, acc{}".format(i, mloss, macc))
        off = off + step
    fluid.io.save_persistables(exe, "mnist_model")
    fluid.io.save_inference_model("mnist_save_model", ['img'], [prediction],
                                  exe,
                                  model_filename='model',
                                  params_filename='params')
Example #13
    print("setting".center(50, "="))
    print("lr = {}, rc = {}, epochs = {}, batch_size = {}".format(args.lr, args.rc, args.epochs,
                                                                  args.batch_size))
    print("Experiment ID: {}".format(args.exp_id).center(50, "="))
    print("training in GPU: {}".format(args.gpu_id).center(50, "="))
    d_name = args.d_name
    
    # get data
    g, label, train_idx, valid_idx, test_idx, evaluator = get_graph_data(
                                                            d_name=d_name, 
                                                            mini_data=eval(args.mini_data))
    
    
    # create log writer
    log_writer = LogWriter(args.log_path, sync_cycle=10)
    with log_writer.mode("train") as logger:
        log_train_loss_epoch = logger.scalar("loss")
        log_train_rocauc_epoch = logger.scalar("rocauc")
    with log_writer.mode("valid") as logger:
        log_valid_loss_epoch = logger.scalar("loss")
        log_valid_rocauc_epoch = logger.scalar("rocauc")
    log_text = log_writer.text("text")
    log_time = log_writer.scalar("time")
    log_test_loss = log_writer.scalar("test_loss")
    log_test_rocauc = log_writer.scalar("test_rocauc")

    
    # training
    samples = [25, 10] # 2-hop sample size
    batch_size = args.batch_size
    sample_workers = 1
Example #14
def main():
    # Configuration
    cfg = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    if 'architecture' in cfg:
        main_arch = cfg.architecture
    else:
        raise ValueError("'architecture' not specified in config file.")
    check_gpu(cfg.use_gpu)
    check_version()

    # Executor
    place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    # Model
    lr_builder = create('LearningRate')
    optim_builder = create('OptimizerBuilder')
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = create(main_arch)
            inputs_def = cfg.TrainReader['inputs_def']
            feed_vars, train_loader = model.build_inputs(**inputs_def)
            train_fetches = model.train(feed_vars)
            loss = train_fetches['loss']
            lr = lr_builder()
            optimizer = optim_builder(lr)
            optimizer.minimize(loss)
    train_keys, train_values, _ = parse_fetches(train_fetches)
    train_values.append(lr)
    if FLAGS.eval:
        eval_prog = fluid.Program()
        with fluid.program_guard(eval_prog, startup_prog):
            with fluid.unique_name.guard():
                model = create(main_arch)
                inputs_def = cfg.EvalReader['inputs_def']
                feed_vars, eval_loader = model.build_inputs(**inputs_def)
                fetches = model.eval(feed_vars)
        eval_prog = eval_prog.clone(True)
        extra_keys = ['gt_bbox', 'gt_class', 'is_difficult']
        eval_keys, eval_values, _ = parse_fetches(fetches, eval_prog, extra_keys)
        eval_reader = create_reader(cfg.EvalReader)
        eval_loader.set_sample_list_generator(eval_reader, place)

    ##### Run ####
    exe.run(startup_prog)

    ## Resume / finetune
    ignore_params = cfg.finetune_exclude_pretrained_params \
                 if 'finetune_exclude_pretrained_params' in cfg else []
    start_iter = 0
    if FLAGS.resume_checkpoint:
        checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint)
        start_iter = checkpoint.global_step() + 1
    elif cfg.pretrain_weights:
        checkpoint.load_params(
            exe, train_prog, cfg.pretrain_weights, ignore_params=ignore_params)

    ## Data iterator
    train_reader = create_reader(cfg.TrainReader, cfg.max_iters - start_iter, cfg)
    train_loader.set_sample_list_generator(train_reader, place)

    ## Training loop
    train_loader.start()

    # Progress tracking
    train_stats = TrainingStats(cfg.log_smooth_window, train_keys)
    start_time = time.time()
    end_time = time.time()
    time_stat = deque(maxlen=cfg.log_smooth_window)
    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)
    best_box_ap_list = [0.0, 0]
    if FLAGS.use_vdl:
        log_writter = LogWriter(FLAGS.vdl_log_dir, sync_cycle=5)
        with log_writter.mode("train") as vdl_logger:
            train_scalar_loss = vdl_logger.scalar(tag="loss")
        with log_writter.mode("val") as vdl_logger:
            val_scalar_map = vdl_logger.scalar(tag="map")

    for it in range(start_iter, cfg.max_iters):
        # Run the program
        outs = exe.run(train_prog, fetch_list=train_values)
        stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])}
        
        # Logging and visualization
        start_time = end_time
        end_time = time.time()
        time_stat.append(end_time - start_time)
        time_cost = np.mean(time_stat)
        eta_sec = (cfg.max_iters - it) * time_cost
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        train_stats.update(stats)
        logs = train_stats.log()
        if it % cfg.log_iter == 0:
            # log
            strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format(
                it, np.mean(outs[-1]), logs, time_cost, eta)
            logger.info(strs)
            # vdl
            if FLAGS.use_vdl:
                train_scalar_loss.add_record(it//cfg.log_iter, stats['loss'])

        # Model saving and evaluation
        if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1):

            # Save the model
            save_name = str(it) if it != cfg.max_iters - 1 else "final"
            checkpoint.save(exe, train_prog, os.path.join(save_dir, save_name))

            ## Model evaluation
            if FLAGS.eval:
                current_step = it//cfg.snapshot_iter if it % cfg.snapshot_iter == 0 \
                                    else it//cfg.snapshot_iter+1
                ## Evaluation on the training set

                ## Evaluation on the validation set
                results = eval_run(exe, eval_prog, eval_loader,
                                   eval_keys, eval_values)
                box_ap_stats = eval_results(results, cfg.num_classes)
                logger.info("eval box op: {}, in iter: {}".format(
                    box_ap_stats, it))
                if FLAGS.use_vdl:
                    val_scalar_map.add_record(current_step, box_ap_stats)

                ## Save the best model
                if box_ap_stats > best_box_ap_list[0]:
                    best_box_ap_list[0] = box_ap_stats
                    best_box_ap_list[1] = it
                    checkpoint.save(exe, train_prog, os.path.join(save_dir, "best_model"))

                # Logging
                logger.info("Best eval box ap: {}, in iter: {}".format(
                    best_box_ap_list[0], best_box_ap_list[1]))


    train_loader.reset()
Example #15
# coding=utf-8
from visualdl import LogWriter

# Create a LogWriter object
log_writter = LogWriter("./log", sync_cycle=10)

# Create a text component with mode "train" and tag "test"
with log_writter.mode("train") as logger:
    vdl_text_comp = logger.text(tag="test")

# Add data with the add_record() function
for i in range(1, 6):
    vdl_text_comp.add_record(i, "这是第 %d 个 Step 的数据。" % i)
    vdl_text_comp.add_record(i, "This is data %d ." % i)
Example #16
class Trainer(object):
    @classmethod
    def add_cmdline_argument(cls, parser):
        """ Add the cmdline arguments of trainer. """
        group = parser.add_argument_group("Trainer")

        group.add_argument(
            '--infer_network',
            type=str,
            default='ResNet32',
            help=
            "Set inference network. Default is ResNet32. [ResNet10, ResNet32, ResNet110, VGG]"
        )
        group.add_argument(
            '--dataset',
            type=str,
            default='cifar-10',
            help='The dataset name. Default is cifar-10. [cifar-10, cifar-100]'
        )
        group.add_argument('--num_epochs',
                           type=int,
                           default=1,
                           help='Number of epoch. Default is 1.')
        group.add_argument('--batch_size',
                           type=int,
                           default=128,
                           help="Batch size. Default is 128.")
        group.add_argument(
            '-c',
            '--enable_ce',
            action='store_true',
            help='If set, run the task with continuous evaluation logs.')
        group.add_argument(
            '--logger',
            type=str,
            default='',
            help='Path to log data generated in deep learning tasks.')
        group.add_argument(
            '--cpu_num',
            type=int,
            default=1,
            help='Specify the number of the logic core. Default is 1.')
        group.add_argument(
            '--cuda_devices',
            type=list,
            default=1,
            help='Specify the number of the CUDA devices. Default is 1.')
        group.add_argument(
            '-m',
            '--multi_card',
            action='store_true',
            help=
            'In the mode of multi graphics card training, all graphics cards will be occupied. '
            +
            'If --use_cuda is false, the model will be run on CPU. In this situation, multiple threads '
            +
            'are used to run the model, and the number of threads is equal to the number of logic cores. '
            +
            'You can configure --cpu_num to change the number of threads that are being used.'
        )

        return group

    def __init__(self, hparams):
        # Use data distributed
        self.infer_network = hparams.infer_network
        self.dataset = hparams.dataset
        self.num_epochs = hparams.num_epochs
        self.batch_size = hparams.batch_size
        self.enable_ce = hparams.enable_ce
        self.logger = hparams.logger
        self.cpu_num = hparams.cpu_num
        self.cuda_devices = hparams.cuda_devices
        self.multi_card = hparams.multi_card
        self.num_class = 10  # default is cifar-10

        if self.logger:
            from visualdl import LogWriter
            self.log_writer = LogWriter(self.logger, sync_cycle=20)
            # Create two ScalarWriter instances, whose mode is set to be "train"
            with self.log_writer.mode("train") as logger:
                self.train_cost = logger.scalar("cost")
                self.train_acc = logger.scalar("acc")

            # Create a ScalarWriter instance, whose mode is set to be "test"
            with self.log_writer.mode("test") as logger:
                self.test_loss = logger.scalar("loss")
                self.test_acc = logger.scalar("acc")

        if self.dataset == "cifar-100":
            self.num_class = 100

        #if not os.path.exists(self.save_dir):
        #    os.makedirs(self.save_dir)

    def inference_network(self):
        # The image is 32 * 32 with RGB representation.
        data_shape = [None, 3, 32, 32]
        images = fluid.data(name='pixel', shape=data_shape, dtype='float32')

        if self.infer_network == 'ResNet20':
            predict = resnet_cifar10(images, 20, self.num_class)
        elif self.infer_network == 'ResNet32':
            predict = resnet_cifar10(images, 32, self.num_class)
        elif self.infer_network == 'ResNet110':
            predict = resnet_cifar10(images, 110, self.num_class)
        elif self.infer_network == 'VGG':
            predict = vgg_bn_drop(images, self.num_class)
        else:
            logging.error(
                'The given inference network is not supported! Choose one of: ResNet20, ResNet32, ResNet110, VGG.'
            )
            sys.exit(1)
        return predict

    def train_network(self, predict):
        label = fluid.data(name='label', shape=[None, 1], dtype='int64')
        cost = fluid.layers.cross_entropy(input=predict, label=label)
        avg_cost = fluid.layers.mean(cost)
        accuracy = fluid.layers.accuracy(input=predict, label=label)
        return [avg_cost, accuracy]

    def optimizer_program(self):
        return fluid.optimizer.Adam(learning_rate=0.001)

    def train(self, use_cuda, params_dirname):
        train_start = datetime.utcnow()

        if use_cuda:
            # NOTE: for multi process mode: one process per GPU device.
            # For example: CUDA_VISIBLE_DEVICES="0,1,2,3".
            # os.environ['CUDA_VISIBLE_DEVICES'] = self.cuda_devices
            # print("CUDA_VISIBLE_DEVICES:" + str(os.getenv("CUDA_VISIBLE_DEVICES")))
            pass
        else:
            # NOTE: If you use CPU to run the program, you need
            # to specify the CPU_NUM, otherwise, fluid will use
            # all the number of the logic core as the CPU_NUM,
            # in that case, the batch size of the input should be
            # greater than CPU_NUM, if not, the process will be
            # failed by an exception.
            if not use_cuda:
                os.environ['CPU_NUM'] = str(self.cpu_num)
                print("CPU_NUM:" + str(os.getenv("CPU_NUM")))

        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

        if self.enable_ce:
            train_reader = paddle.batch(paddle.dataset.cifar.train10(),
                                        batch_size=self.batch_size)
            test_reader = paddle.batch(paddle.dataset.cifar.test10(),
                                       batch_size=self.batch_size)
        else:
            test_reader = paddle.batch(paddle.dataset.cifar.test10(),
                                       batch_size=self.batch_size)
            train_reader = paddle.batch(paddle.reader.shuffle(
                paddle.dataset.cifar.train10(), buf_size=128 * 100),
                                        batch_size=self.batch_size)

        feed_order = ['pixel', 'label']

        main_program = fluid.default_main_program()
        start_program = fluid.default_startup_program()

        if self.enable_ce:
            main_program.random_seed = 90
            start_program.random_seed = 90

        predict = self.inference_network()
        avg_cost, acc = self.train_network(predict)

        # Test program
        test_program = main_program.clone(for_test=True)
        optimizer = self.optimizer_program()
        optimizer.minimize(avg_cost)

        exe = fluid.Executor(place)

        EPOCH_NUM = self.num_epochs

        # For training test cost
        def train_test(program, reader):
            count = 0
            feed_var_list = [
                program.global_block().var(var_name) for var_name in feed_order
            ]
            feeder_test = fluid.DataFeeder(feed_list=feed_var_list,
                                           place=place)
            test_exe = fluid.Executor(place)
            accumulated = len([avg_cost, acc]) * [0]
            for tid, test_data in enumerate(reader()):
                if self.multi_card:
                    compiled_prog = fluid.compiler.CompiledProgram(
                        main_program)
                    avg_cost_np = test_exe.run(
                        program=program,
                        feed=feeder_test.feed(test_data),
                        fetch_list=[avg_cost, acc])
                else:
                    avg_cost_np = test_exe.run(
                        program=program,
                        feed=feeder_test.feed(test_data),
                        fetch_list=[avg_cost, acc])
                accumulated = [
                    x[0] + x[1][0] for x in zip(accumulated, avg_cost_np)
                ]
                count += 1
            return [x / count for x in accumulated]

        # main train loop.
        def train_loop():
            feed_var_list_loop = [
                main_program.global_block().var(var_name)
                for var_name in feed_order
            ]
            feeder = fluid.DataFeeder(feed_list=feed_var_list_loop,
                                      place=place)
            exe.run(start_program)

            # 1. MP mode, batch size for current process should be self.batch_size / GPUs
            # 2. SP/PG mode, batch size for each process should be original self.batch_size
            #if os.getenv("FLAGS_selected_gpus"):
            #    steps_per_pass = images / (
            #        self.batch_size / get_device_num()) / num_trainers
            #else:
            #    steps_per_pass = images / self.batch_size / num_trainers

            print('Train started at {}'.format(
                train_start.strftime('%Y-%m-%d %H:%M:%S.%f')))
            step = 0
            # Compile the program once up front when running on multiple cards.
            if self.multi_card:
                compiled_prog = fluid.compiler.CompiledProgram(main_program)
            for pass_id in range(EPOCH_NUM):
                for step_id, data_train in enumerate(train_reader()):
                    if self.multi_card:
                        avg_loss_value = exe.run(compiled_prog,
                                                 feed=feeder.feed(data_train),
                                                 fetch_list=[avg_cost, acc])
                    else:
                        avg_loss_value = exe.run(main_program,
                                                 feed=feeder.feed(data_train),
                                                 fetch_list=[avg_cost, acc])
                    if step_id % 100 == 0:
                        if self.logger != '':
                            self.train_cost.add_record(pass_id,
                                                       avg_loss_value[0])
                            self.train_acc.add_record(pass_id,
                                                      avg_loss_value[1])
                        print("\nPass %d, Batch %d, Cost %f, Acc %f" %
                              (pass_id, step_id, avg_loss_value[0],
                               avg_loss_value[1]))
                    else:
                        sys.stdout.write('.')
                        sys.stdout.flush()
                    step += 1
                    #if step >= steps_per_pass:
                    #    break

                avg_cost_test, accuracy_test = train_test(test_program,
                                                          reader=test_reader)
                train_end = datetime.utcnow()
                elapsed_time = train_end - train_start
                if self.logger != '':
                    self.test_loss.add_record(pass_id, avg_cost_test)
                    self.test_acc.add_record(pass_id, accuracy_test)
                print('\nTest with Pass {0}, Loss {1:2.2}, Acc {2:2.2}'.format(
                    pass_id, avg_cost_test, accuracy_test))

                if params_dirname is not None:
                    fluid.io.save_inference_model(params_dirname, ["pixel"],
                                                  [predict], exe)

                if pass_id == EPOCH_NUM - 1:
                    print('Train ended at {}'.format(
                        train_end.strftime('%Y-%m-%d %H:%M:%S.%f')))
                    print(
                        'Elapsed time for training is {}'.format(elapsed_time))

                if self.enable_ce and pass_id == EPOCH_NUM - 1:
                    print("kpis\ttrain_cost\t%f" % avg_loss_value[0])
                    print("kpis\ttrain_acc\t%f" % avg_loss_value[1])
                    print("kpis\ttest_cost\t%f" % avg_cost_test)
                    print("kpis\ttest_acc\t%f" % accuracy_test)

        train_loop()
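
The example above wraps main_program in a plain CompiledProgram whenever multi_card is set. As a point of comparison, a minimal sketch of the more common multi-card pattern in the Paddle 1.x fluid API (CompiledProgram combined with with_data_parallel, as the later examples do) is shown below; the toy network and its variable names are illustrative only and are not taken from the example above.

# Sketch: compile once with with_data_parallel for multi-card execution.
# In multi-process mode the per-process batch size is the global batch size
# divided by the number of devices, as the commented block above notes.
import paddle.fluid as fluid

x = fluid.layers.data(name="x", shape=[4], dtype="float32")  # hypothetical input
y = fluid.layers.fc(input=x, size=1)
loss = fluid.layers.reduce_mean(y)
fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())

compiled_prog = fluid.compiler.CompiledProgram(
    fluid.default_main_program()).with_data_parallel(loss_name=loss.name)
# exe.run(compiled_prog, feed={"x": ...}, fetch_list=[loss.name])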
class Trainer(object):
    @classmethod
    def add_cmdline_argument(cls, parser):
        """ Add the cmdline arguments of trainer. """
        group = parser.add_argument_group("Trainer")
        group.add_argument(
            "--use_data_distributed",
            type=str2bool,
            default=False,
            help="Whether to use data distributed for parallel training.")
        group.add_argument(
            "--valid_metric_name",
            type=str,
            default="-loss",
            help=
            "The validation metric determining which checkpoint is the best.")
        group.add_argument("--num_epochs",
                           type=int,
                           default=10,
                           help="Total number of training epochs to perform.")
        group.add_argument(
            "--save_dir",
            type=str,
            required=True,
            help="The output directory where the model will be saved.")
        group.add_argument(
            "--batch_size",
            type=int,
            default=8,
            help="Total batch size for training/evaluation/inference.")
        group.add_argument(
            "--log_steps",
            type=int,
            default=100,
            help="The number of training steps to output current metrics "
            "on past training dataset.")
        group.add_argument(
            "--valid_steps",
            type=int,
            default=2000,
            help="The number of training steps to perform a evaluation "
            "on validation datasets.")
        group.add_argument(
            "--save_checkpoint",
            type=str2bool,
            default=True,
            help="Whether to save one checkpoints for each training epoch.")
        group.add_argument(
            "--save_summary",
            type=str2bool,
            default=False,
            help="Whether to save metrics summary for visualDL module.")
        DataLoader.add_cmdline_argument(group)
        return group

    def __init__(self, model, to_tensor, hparams, logger=None):
        # Use data distributed
        if hparams.use_data_distributed:
            strategy = parallel.prepare_context()
            if strategy is not None:
                parallel_model = parallel.DataParallel(model, strategy)
                model.before_backward_fn = parallel_model.scale_loss
                model.after_backward_fn = parallel_model.apply_collective_grads
                model = parallel_model

        self.model = model
        self.to_tensor = to_tensor

        self.is_decreased_valid_metric = hparams.valid_metric_name[0] == "-"
        self.valid_metric_name = hparams.valid_metric_name[1:]
        self.num_epochs = hparams.num_epochs
        self.save_dir = hparams.save_dir
        self.log_steps = hparams.log_steps
        self.valid_steps = hparams.valid_steps
        self.save_checkpoint = hparams.save_checkpoint
        self.save_summary = hparams.save_summary

        if not os.path.exists(self.save_dir):
            os.makedirs(self.save_dir)

        self.logger = logger or get_logger(
            os.path.join(self.save_dir, "trainer.log"), "trainer")

        if self.save_summary:
            from visualdl import LogWriter
            self.summary_logger = LogWriter(os.path.join(
                self.save_dir, "summary"),
                                            sync_cycle=10000)
            self.train_summary = {}
            self.valid_summary = {}

        self.batch_metrics_tracker = MetricsTracker()
        self.token_metrics_tracker = MetricsTracker()

        self.best_valid_metric = float(
            "inf" if self.is_decreased_valid_metric else "-inf")
        self.epoch = 0
        self.batch_num = 0

    def train_epoch(self,
                    train_iter,
                    valid_iter,
                    infer_iter=None,
                    infer_parse_dict=None):
        """
        Train an epoch.

        @param : train_iter
        @type : DataLoader

        @param : valid_iter
        @type : DataLoader

        @param : infer_iter
        @type : DataLoader

        @param : infer_parse_dict
        @type : dict of function
        """
        self.epoch += 1
        num_batches = len(train_iter)
        self.batch_metrics_tracker.clear()
        self.token_metrics_tracker.clear()
        times = []
        for batch_id, (batch, batch_size) in enumerate(train_iter, 1):
            batch = type(batch)(map(lambda kv: (kv[0], self.to_tensor(kv[1])),
                                    batch.items()))
            batch["epoch"] = self.epoch
            batch["num_steps"] = self.batch_num

            # Do a training iteration
            start_time = time.time()
            metrics = self.model(batch, is_training=True)
            token_num = metrics.pop("token_num", None)
            elapsed = time.time() - start_time
            times.append(elapsed)

            batch_metrics = {
                k: v
                for k, v in metrics.items() if "token" not in k
            }
            token_metrics = {k: v for k, v in metrics.items() if "token" in k}
            self.batch_metrics_tracker.update(batch_metrics, batch_size)
            self.token_metrics_tracker.update(token_metrics, token_num)
            self.batch_num += 1

            if self.log_steps and batch_id % self.log_steps == 0:
                batch_metrics_message = self.batch_metrics_tracker.value()
                token_metrics_message = self.token_metrics_tracker.value()
                message_prefix = f"[Train][{self.epoch}][{batch_id}/{num_batches}]"
                avg_time = f"AVG_Time-{sum(times[-self.log_steps:]) / self.log_steps:.3f}"
                message = "   ".join([
                    message_prefix, batch_metrics_message,
                    token_metrics_message, avg_time
                ])
                self.logger.info(message)

            if self.save_summary:
                with self.summary_logger.mode("train"):
                    for k, v in self.batch_metrics_tracker.items():
                        if k not in self.train_summary:
                            self.train_summary[k] = self.summary_logger.scalar(
                                k)
                        scalar = self.train_summary[k]
                        scalar.add_record(self.batch_num, v)
                    for k, v in self.token_metrics_tracker.items():
                        if k not in self.train_summary:
                            self.train_summary[k] = self.summary_logger.scalar(
                                k)
                        scalar = self.train_summary[k]
                        scalar.add_record(self.batch_num, v)

            if self.valid_steps and valid_iter is not None and \
                    batch_id % self.valid_steps == 0:
                self.evaluate(valid_iter)

        if valid_iter is not None:
            self.evaluate(valid_iter)

        if infer_iter is not None and infer_parse_dict is not None:
            self.infer(infer_iter, infer_parse_dict)

        return

    def infer(self, data_iter, parse_dict, num_batches=None):
        """
        Inference interface.

        @param : data_iter
        @type : DataLoader

        @param : parse_dict
        @type : dict of function

        @param : num_batches : the number of batches to infer
        @type : int/None
        """
        self.logger.info("Generation starts ...")
        infer_save_file = os.path.join(self.save_dir,
                                       f"infer_{self.epoch}.result.json")

        # Inference
        infer_results = []
        batch_cnt = 0
        begin_time = time.time()
        for batch, batch_size in tqdm(data_iter, total=num_batches):
            batch = type(batch)(map(lambda kv: (kv[0], self.to_tensor(kv[1])),
                                    batch.items()))

            result = self.model.infer(inputs=batch)
            batch_result = {}

            def to_list(batch):
                """ Parse list. """
                return batch.tolist()

            # parse
            for k in result:
                if k in parse_dict:
                    parse_fn = parse_dict[k]
                else:
                    parse_fn = to_list
                if result[k] is not None:
                    batch_result[k] = parse_fn(result[k])

            for vs in zip(*batch_result.values()):
                infer_result = {}
                for k, v in zip(batch_result.keys(), vs):
                    infer_result[k] = v
                infer_results.append(infer_result)

            batch_cnt += 1
            if batch_cnt == num_batches:
                break

        self.logger.info(f"Saved inference results to {infer_save_file}")
        with open(infer_save_file, "w") as fp:
            json.dump(infer_results, fp, indent=2)
        infer_metrics_tracker = evaluate_generation_result(infer_results)
        metrics_message = infer_metrics_tracker.summary()
        message_prefix = f"[Infer][{self.epoch}]"
        time_cost = f"TIME-{time.time() - begin_time:.3f}"
        message = "   ".join([message_prefix, metrics_message, time_cost])
        self.logger.info(message)
        return

    def evaluate(self, data_iter, need_save=True):
        """
        Evaluation interface

        @param : data_iter
        @type : DataLoader

        @param : need_save
        @type : bool
        """
        if isinstance(self.model, parallel.DataParallel):
            need_save = need_save and parallel.Env().local_rank == 0

        # Evaluation
        begin_time = time.time()
        batch_metrics_tracker = MetricsTracker()
        token_metrics_tracker = MetricsTracker()
        for batch, batch_size in data_iter:
            batch = type(batch)(map(lambda kv: (kv[0], self.to_tensor(kv[1])),
                                    batch.items()))
            metrics = self.model(batch, is_training=False)
            token_num = int(metrics.pop("token_num"))
            batch_metrics = {
                k: v
                for k, v in metrics.items() if "token" not in k
            }
            token_metrics = {k: v for k, v in metrics.items() if "token" in k}
            batch_metrics_tracker.update(batch_metrics, batch_size)
            token_metrics_tracker.update(token_metrics, token_num)
        batch_metrics_message = batch_metrics_tracker.summary()
        token_metrics_message = token_metrics_tracker.summary()
        message_prefix = f"[Valid][{self.epoch}]"
        time_cost = f"TIME-{time.time() - begin_time:.3f}"
        message = "   ".join([
            message_prefix, batch_metrics_message, token_metrics_message,
            time_cost
        ])
        self.logger.info(message)

        if need_save:
            # Check valid metric
            cur_valid_metric = batch_metrics_tracker.get(
                self.valid_metric_name)
            if self.is_decreased_valid_metric:
                is_best = cur_valid_metric < self.best_valid_metric
            else:
                is_best = cur_valid_metric > self.best_valid_metric
            if is_best:
                # Save current best model
                self.best_valid_metric = cur_valid_metric
                best_model_path = os.path.join(self.save_dir, "best.model")
                save(self.model, best_model_path)
                self.logger.info(
                    f"Saved best model to '{best_model_path}' with new best valid metric "
                    f"{self.valid_metric_name.upper()}-{self.best_valid_metric:.3f}"
                )

            # Save checkpoint
            if self.save_checkpoint:
                model_file = os.path.join(self.save_dir,
                                          f"epoch_{self.epoch}.model")
                save(self.model, model_file)

            if self.save_summary:
                with self.summary_logger.mode("valid"):
                    # Record the metrics gathered by the local validation
                    # trackers, not the training trackers on self.
                    for k, v in batch_metrics_tracker.items():
                        if k not in self.valid_summary:
                            self.valid_summary[k] = self.summary_logger.scalar(
                                k)
                        scalar = self.valid_summary[k]
                        scalar.add_record(self.batch_num, v)
                    for k, v in token_metrics_tracker.items():
                        if k not in self.valid_summary:
                            self.valid_summary[k] = self.summary_logger.scalar(
                                k)
                        scalar = self.valid_summary[k]
                        scalar.add_record(self.batch_num, v)

        return
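
The Trainer above marks "lower is better" metrics by prefixing --valid_metric_name with "-" (for example "-loss"), which __init__ and evaluate() then unpack. A small standalone illustration of that convention follows; the helper names are hypothetical and not part of the Trainer class.

def parse_valid_metric(name):
    """Split a metric name such as '-loss' into (metric, is_decreasing)."""
    decreasing = name.startswith("-")
    return (name[1:] if decreasing else name), decreasing

def is_better(current, best, decreasing):
    """Return True when `current` improves on `best` under this convention."""
    return current < best if decreasing else current > best

# parse_valid_metric("-loss") -> ("loss", True): a lower loss is better.
# parse_valid_metric("bleu")  -> ("bleu", False): a higher score is better.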
class SolverWrapper:
    def __init__(self, solver_prototxt, log_dir, pretrained_model=None):
        self.solver = caffe.SGDSolver(solver_prototxt)
        if pretrained_model is not None:
            print('Loading pretrained model weights from {:s}'.format(pretrained_model))
            self.solver.net.copy_from(pretrained_model)
        
        self.solver_param = caffe_pb2.SolverParameter()
        with open(solver_prototxt, 'rt') as f:
            pb2.text_format.Merge(f.read(), self.solver_param)
        self.cur_epoch = 0
        self.test_interval = 30  # used in place of self.solver_param.test_interval
        self.logw = LogWriter(log_dir, sync_cycle=10)
        with self.logw.mode('train') as logger:
            self.sc_train_loss = logger.scalar("loss")
            self.sc_train_acc = logger.scalar("Accuracy")
        with self.logw.mode('val') as logger:
            self.sc_val_acc = logger.scalar("Accuracy(acc)")
            self.sc_val_auc = logger.scalar("Area Under Roc Curve(auc)")
            self.sc_val_ap = logger.scalar("Average Precision(ap)")
            self.sc_val_se = logger.scalar("Sensitivity(se)")
            self.sc_val_sp = logger.scalar("Specificity(sp)")

    def train_model(self):
        """执行训练的整个流程,穿插了validation"""
        cur_iter = 0
        test_batch_size, num_classes = self.solver.test_nets[0].blobs['prob'].shape
        num_test_images_tot = test_batch_size * self.solver_param.test_iter[0]
        while cur_iter < self.solver_param.max_iter:
            #self.solver.step(self.test_interval)
            for i in range(self.test_interval):
                self.solver.step(1)
                loss = self.solver.net.blobs['loss'].data
                acc = self.solver.net.blobs['accuracy'].data
                step = self.solver.iter
                self.sc_train_loss.add_record(step, loss)
                self.sc_train_acc.add_record(step, acc)
            
            self.eval_on_val(num_classes, num_test_images_tot, test_batch_size)
            cur_iter += self.test_interval
        
    def eval_on_val(self, num_classes, num_test_images_tot, test_batch_size):
        """在整个验证集上执行inference和evaluation"""
        self.solver.test_nets[0].share_with(self.solver.net)
        self.cur_epoch += 1
        scores = np.zeros((num_classes, num_test_images_tot), dtype=float)
        gt_labels = np.zeros((1, num_test_images_tot), dtype=float).squeeze()
        for t in range(self.solver_param.test_iter[0]):
            output = self.solver.test_nets[0].forward()
            probs = output['prob']
            labels = self.solver.test_nets[0].blobs['label'].data

            gt_labels[t*test_batch_size:(t+1)*test_batch_size] = labels.T.astype(float)
            scores[:,t*test_batch_size:(t+1)*test_batch_size] = probs.T
        # TODO: handle the case where the last batch has fewer samples than
        #       num_test_images_per_batch (see the sketch after this class).
        
        acc, auc, ap, se, sp = perfeval.isic_cls_eval(scores, gt_labels)
        print('====================================================================\n')
        print('\tDo validation after the {:d}-th training epoch\n'.format(self.cur_epoch))
        print('>>>>', end='\t')  # marker line to make it easy to parse the metrics out of the log
        print('acc={:.3f}, auc={:.3f}, ap={:.3f}, se={:.3f}, sp={:.3f}\n'.format(acc, auc, ap, se, sp))
        print('\n====================================================================\n')
        step = self.solver.iter
        self.sc_val_acc.add_record(step, acc)
        self.sc_val_auc.add_record(step, auc)
        self.sc_val_ap.add_record(step, ap)
        self.sc_val_se.add_record(step, se)
        self.sc_val_sp.add_record(step, sp)
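
The TODO in eval_on_val notes that the last test batch may hold fewer samples than test_batch_size. A minimal standalone sketch of one way to copy a possibly partial batch into the preallocated arrays is shown below; fill_batch is a hypothetical helper, not part of SolverWrapper.

import numpy as np

def fill_batch(scores, gt_labels, probs, labels, t, batch_size):
    """Copy one (possibly partial) batch of results into the preallocated arrays."""
    n = labels.shape[0]                     # actual number of samples in this batch
    start = t * batch_size
    gt_labels[start:start + n] = labels.reshape(-1).astype(float)
    scores[:, start:start + n] = probs[:n].T
    return start + n                        # total number of samples filled so far

The returned count could then be used to trim the arrays, e.g. scores[:, :filled] and gt_labels[:filled], before calling perfeval.isic_cls_eval.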
Esempio n. 19
0
def train(args):
    """OCR training"""

    if args.model == "crnn_ctc":
        train_net = ctc_train_net
        get_feeder_data = get_ctc_feeder_data

    train_images = args.train_images
    train_list = args.train_list
    test_images = args.test_images
    test_list = args.test_list
    num_classes = data_reader.num_classes()
    data_shape = data_reader.data_shape()
    # define network
    sum_cost, error_evaluator, inference_program, model_average = train_net(
        args, data_shape, num_classes)

    logger = LogWriter('./log', sync_cycle=10)
    with logger.mode("train") as train_logger:
        train_acc = train_logger.scalar("train_acc")
        train_loss = train_logger.scalar("train_loss")
        val_loss = train_logger.scalar("val_loss")
        val_acc = train_logger.scalar("val_acc")
    # data reader

    train_reader = data_reader.train(
        args.batch_size,
        train_images_dir=train_images,
        train_list_file=train_list,
        cycle=args.total_step > 0,
        model=args.model)
    test_reader = data_reader.test(
        test_images_dir=test_images, test_list_file=test_list, model=args.model)


    # prepare environment
    place = fluid.CPUPlace()
    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)

    if 'ce_mode' in os.environ:
        fluid.default_startup_program().random_seed = 90

    exe.run(fluid.default_startup_program())
    # init_list = []
    # for param in fluid.default_main_program().global_block().all_parameters():
    #     if "batch_norm" in param.name or "conv2d" in param.name:
    #         init_list.append(param.name)
    #     print("%s=%s=%s" % (param.name, param.name, param.shape))
    # load init model
    print("Initing Model:****************")
    if args.init_model is not None:
        model_dir = args.init_model
        model_file_name = None
        if not os.path.isdir(args.init_model):
            model_dir = os.path.dirname(args.init_model)
            model_file_name = os.path.basename(args.init_model)
        fluid.io.load_params(exe, dirname=model_dir, filename=model_file_name)

        print("Init model from: %s." % args.init_model)

    train_exe = exe
    error_evaluator.reset(exe)
    if args.parallel:
        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_gpu, loss_name=sum_cost.name)

    fetch_vars = [sum_cost] + error_evaluator.metrics

    def train_one_batch(data):
        var_names = [var.name for var in fetch_vars]
        if args.parallel:
            results = train_exe.run(var_names,
                                    feed=get_feeder_data(data, place))
            results = [np.array(result).sum() for result in results]
        else:
            results = train_exe.run(feed=get_feeder_data(data, place),
                                    fetch_list=fetch_vars)
            results = [result[0] for result in results]
        return results

    def test(iter_num):
        error_evaluator.reset(exe)
        res = 0
        i = 0
        for data in test_reader():
            cost = exe.run(inference_program,
                           feed=get_feeder_data(data, place),
                           fetch_list=[sum_cost])
            # if i == 0:
            #     print(cost[0])
            res += cost[0][0]
            i += 1
        val_loss.add_record(iter_num, res / i)
        _, test_seq_error = error_evaluator.eval(exe)
        print("\nTime: %s; Iter[%d]; Test seq error: %s.\n" % (
            time.time(), iter_num, str(test_seq_error[0])))
        val_acc.add_record(iter_num, 1 - test_seq_error[0])
        # Note: the following logs are only used for CE monitoring
        # and can be ignored in other situations.
        print("kpis test_acc    %f" % (1 - test_seq_error[0]))

    def save_model(args, exe, iter_num):
        filename = "model_%05d" % iter_num
        fluid.io.save_params(
            exe, dirname=args.save_model_dir, filename=filename)
        print("Saved model to: %s/%s." % (args.save_model_dir, filename))

    iter_num = 0
    stop = False
    start_time = time.time()
    while not stop:
        total_loss = 0.0
        total_seq_error = 0.0
        batch_times = []
        # train a pass
        for data in train_reader():
            if args.total_step > 0 and iter_num == args.total_step + args.skip_batch_num:
                stop = True
                break
            if iter_num < args.skip_batch_num:
                print("Warm-up iteration")
            if iter_num == args.skip_batch_num:
                profiler.reset_profiler()
            start = time.time()
            results = train_one_batch(data)
            batch_time = time.time() - start
            fps = args.batch_size / batch_time
            batch_times.append(batch_time)
            total_loss += results[0]
            total_seq_error += results[2]

            iter_num += 1
            # training log
            if iter_num % args.log_period == 0:
                avg_loss = total_loss / (args.log_period)
                avg_err = total_seq_error / (args.log_period * args.batch_size)
                print("\nTime: %s; Iter[%d]; Avg loss: %.3f; Avg seq err: %.3f" % (
                    time.time(), iter_num,
                    avg_loss, avg_err))
                print("kpis train_cost  %f" % (avg_loss))
                print("kpis train_acc   %f" % (1 - avg_err))
                train_loss.add_record(iter_num, avg_loss)
                train_acc.add_record(iter_num, 1 - avg_err)
                total_loss = 0.0
                total_seq_error = 0.0

            # evaluate
            if not args.skip_test and iter_num % args.eval_period == 0:
                if model_average:
                    with model_average.apply(exe):
                        test(iter_num)
                else:
                    test(iter_num)

            # save model
            if iter_num % args.save_model_period == 0:
                if model_average:
                    with model_average.apply(exe):
                        save_model(args, exe, iter_num)
                else:
                    save_model(args, exe, iter_num)
        end_time = time.time()
        print("kpis train_duration  %f" % (end_time - start_time))
        # Postprocess benchmark data
        latencies = batch_times[args.skip_batch_num:]
        latency_avg = np.average(latencies)
        latency_pc99 = np.percentile(latencies, 99)
        fpses = np.divide(args.batch_size, latencies)
        fps_avg = np.average(fpses)
        fps_pc99 = np.percentile(fpses, 1)
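        # (the 1st percentile of fps corresponds to the 99th-percentile latency)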

        # Benchmark output
        print('\nTotal examples (incl. warm-up): %d' %
              (iter_num * args.batch_size))
        print('average latency: %.5f s, 99pc latency: %.5f s' % (latency_avg,
                                                                 latency_pc99))
        print('average fps: %.5f, fps for 99pc latency: %.5f' % (fps_avg,
                                                                 fps_pc99))
Esempio n. 20
0
def train(model, args):
    # 1. Create VisualDL logger
    logwriter = LogWriter(os.path.join(args.logdir, "visualdl_log"),
                          sync_cycle=10)
    with logwriter.mode("Train") as writer:
        train_loss_scalar = writer.scalar("loss")
        train_acc_scalar = writer.scalar("acc")
        histogram1 = writer.histogram("Relation-BiLinear-W", 100)
        histogram2 = writer.histogram("Relation-BiLinear-b", 10)
        histogram3 = writer.histogram("Relation-FC-W", 100)
    with logwriter.mode("Val") as writer:
        val_acc_scalar = writer.scalar("acc")

    # 2. Setup program
    train_prog = fluid.default_main_program()
    train_startup = fluid.default_startup_program()

    train_reader = model.train_reader
    val_reader = model.val_reader
    test_reader = model.test_reader
    loss = model.loss
    mean_acc = model.mean_acc

    # Clone for val / test
    val_prog = train_prog.clone(for_test=True)
    test_prog = train_prog.clone(for_test=True)

    optimizer = fluid.optimizer.Adam(learning_rate=args.lr)
    optimizer.minimize(loss)

    # 3. Setup executor
    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(train_startup)

    # 4. Get Relation Module params for VisualDL
    # print(fluid.io.get_program_parameter(train_startup))
    relation_BL_w = train_startup.global_block().var("Relation-BiLinear.w_0")
    relation_BL_b = train_startup.global_block().var("Relation-BiLinear.b_0")
    relation_FC_w = train_startup.global_block().var("Relation-FC.w_0")

    # 5. Compile
    print("Compilling...")
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=loss.name)
    compiled_val_prog = fluid.CompiledProgram(val_prog).with_data_parallel(
        share_vars_from=compiled_train_prog)
    compiled_test_prog = fluid.CompiledProgram(test_prog).with_data_parallel(
        share_vars_from=compiled_train_prog)

    # 6. Setup data source
    token2idx_dict, unk_idx, pad_idx = get_token2id_dict(args.emb_path)
    print("Setup dataloader...")
    places = fluid.cuda_places() if args.use_cuda else fluid.cpu_places()
    train_reader.set_sample_generator(train_loader(args.train_data_path,
                                                   args.N, args.K, args.Q,
                                                   token2idx_dict, unk_idx,
                                                   pad_idx, args.max_length),
                                      batch_size=args.batch_size,
                                      places=places)
    val_reader.set_sample_generator(val_test_loader(args.val_data_path,
                                                    args.N,
                                                    args.K,
                                                    args.Q,
                                                    token2idx_dict,
                                                    unk_idx,
                                                    pad_idx,
                                                    args.max_length,
                                                    data_type="val"),
                                    batch_size=1,
                                    places=places)
    test_reader.set_sample_generator(val_test_loader(args.test_data_path,
                                                     args.N,
                                                     args.K,
                                                     args.Q,
                                                     token2idx_dict,
                                                     unk_idx,
                                                     pad_idx,
                                                     args.max_length,
                                                     data_type="test"),
                                     batch_size=1,
                                     places=places)

    # 7. Train loop
    # Record the best model
    best_val_acc = 0
    # Record the train loss/acc by sliding window
    loss_record, acc_record = [], []
    loss_window = acc_window = 0  # Sum of sliding window
    window = 50  # The size of sliding window
    for epi, train_data in zip(range(1, args.train_episodes + 1),
                               train_reader()):
        # 7.1 Run
        (train_cur_loss, train_cur_acc, relation_BL_w_value,
         relation_BL_b_value,
         relation_FC_w_value) = exe.run(program=compiled_train_prog,
                                        feed=train_data,
                                        fetch_list=[
                                            loss.name, mean_acc.name,
                                            relation_BL_w.name,
                                            relation_BL_b.name,
                                            relation_FC_w.name
                                        ])
        # print(train_cur_loss[0], train_cur_acc[0])
        loss_record.append(train_cur_loss[0])
        acc_record.append(train_cur_acc[0])

        # + right - left
        loss_window += train_cur_loss[0]
        acc_window += train_cur_acc[0]
        if epi - window - 1 >= 0:
            # Ensure that the left side is in the sliding window
            loss_window -= loss_record[epi - window - 1]
            acc_window -= acc_record[epi - window - 1]

        if epi % window == 0:
            print(
                "{}  [Train episode: {:5d}/{:5d}] ==> Loss: {:2.6f} Mean acc: {:2.4f}"
                .format(
                    str(datetime.datetime.now())[:-7], epi,
                    args.train_episodes, loss_window / window,
                    100 * acc_window / window))

        # 7.2 Add metrics/params to VisualDL
        train_loss_scalar.add_record(epi, loss_window / window)
        train_acc_scalar.add_record(epi, acc_window / window)
        histogram1.add_record(epi, relation_BL_w_value.flatten())
        histogram2.add_record(epi, relation_BL_b_value.flatten())
        histogram3.add_record(epi, relation_FC_w_value.flatten())

        # 7.3 Validation
        if args.val_data_path and epi % args.val_steps == 0:
            # 7.3.1 Run val once
            val_acc_mean = eval(exe,
                                compiled_val_prog,
                                val_reader, [mean_acc.name],
                                run_type="Val")

            print("{}  [Val result: {:5d}/{:5d}] ==> Mean acc: {:2.4f}".format(
                str(datetime.datetime.now())[:-7], epi, args.train_episodes,
                100 * val_acc_mean))
            # Add val acc to VisualDL
            val_acc_scalar.add_record(epi, val_acc_mean)

            # 7.3.2 Save best model
            if val_acc_mean > best_val_acc:
                best_val_acc = val_acc_mean
                fluid.io.save_inference_model(
                    os.path.join(args.logdir, "infer_model"),
                    ["totalQ", "support", "support_len", "query", "query_len"],
                    [model.prediction],
                    exe,
                    main_program=train_prog,
                    params_filename="__params__")
                print(
                    "{}  [Save model of val mean acc: {:2.4f}] ==> {}".format(
                        str(datetime.datetime.now())[:-7], 100 * best_val_acc,
                        os.path.join(args.logdir, "infer_model")))

    # 8. Test
    if args.test_data_path:
        test_acc_mean = eval(exe,
                             compiled_test_prog,
                             test_reader, [mean_acc.name],
                             run_type="Test")
        print("{}  [Test result] ==> Mean acc: {:2.4f}".format(
            str(datetime.datetime.now())[:-7], 100 * test_acc_mean))
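
The training loop above keeps a running sum over the last `window` episodes ("+ right - left") so the windowed mean never has to be re-summed from scratch. A standalone sketch of the same idea, not taken from the code above (early values are partial sums divided by the full window, exactly as in the loop):

def sliding_means(values, window=50):
    """Mean over the last `window` values, maintained with a single running sum."""
    means, running = [], 0.0
    for i, v in enumerate(values):
        running += v                       # add the new right-hand value
        if i >= window:
            running -= values[i - window]  # drop the value that left the window
        means.append(running / window)
    return means

# sliding_means([1, 2, 3, 4], window=2) -> [0.5, 1.5, 2.5, 3.5]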
Esempio n. 21
0
import os
import paddle.fluid as fluid
import paddle.fluid.framework as framework
import paddle.v2 as paddle
from paddle.fluid.initializer import NormalInitializer
from paddle.fluid.param_attr import ParamAttr
from visualdl import LogWriter
from dataset import Dataset
from net_fluid import simplenet

# Create the VisualDL LogWriter and point it at this project's log directory
logdir = "./logs"
logwriter = LogWriter(logdir, sync_cycle=10)

# Create the trend chart for loss
with logwriter.mode("train") as writer:
    loss_scalar = writer.scalar("loss")

# Create the trend chart for acc
with logwriter.mode("train") as writer:
    acc_scalar = writer.scalar("acc")

# Define the number of samples to display
num_samples = 4
# Create image components for the conv layer output and the input images
with logwriter.mode("train") as writer:
    conv_image = writer.image("conv_image", num_samples, 1)
    input_image = writer.image("input_image", num_samples, 1)

# Create a visualization of the training model's structure
with logwriter.mode("train") as writer:
Esempio n. 22
0
def train():

    model = BaseModel(batch_size=batch_size, maxlen=n_frames)
    loss, acc, output, no_grad_set = model.build_graph()

    main_program = fluid.default_main_program()
    inference_program = fluid.default_main_program().clone(for_test=True)

    optimizer = fluid.optimizer.Adadelta(0.001)
    optimizer.minimize(loss, no_grad_set=no_grad_set)

    place = fluid.CPUPlace()
    exe = Executor(place)
    exe.run(framework.default_startup_program())

    log_writer = LogWriter(log_path, sync_cycle=10)

    with log_writer.mode("train") as logger:
        log_train_loss = logger.scalar(tag="train_loss")
        log_train_acc = logger.scalar(tag="train_acc")

    with log_writer.mode("validation") as logger:
        log_valid_loss = logger.scalar(tag="validation_loss")
        log_valid_acc = logger.scalar(tag="validation_acc")

    def prepare_input(batch):
        x, y, x_seqlen = batch
        res = {}

        res['input'] = np.array(x).astype("float32")
        res['input_seqlen'] = np.array(x_seqlen).astype("int64")
        res['label'] = np.array(y).astype("float32")

        return res

    # (samples, seq, width, height, pixel)
    noisy_movies, shifted_movies = reader.generate_movies(n_samples, n_frames)
    data = noisy_movies[:1000], shifted_movies[:1000]
    train_data, validation_data = split(data, validation_split)

    step_id = 0
    for epoch_id in range(max_epoch):
        start_time = time.time()
        print("epoch id", epoch_id)

        valid_data_iter = reader.get_data_iter(validation_data, batch_size) 
        train_data_iter = reader.get_data_iter(train_data, batch_size) 

        # train
        total_loss = 0
        batch_id = 0
        for batch in train_data_iter:
            input_data_feed = prepare_input(batch)
            fetch_outs = exe.run(program=main_program,
                                 feed=input_data_feed,
                                 fetch_list=[loss.name, acc.name],
                                 use_program_cache=False)

            cost_train = np.array(fetch_outs[0])
            acc_train = fetch_outs[1]
            total_loss += cost_train

            if batch_id > 0 and batch_id % 5 == 0:
                log_train_loss.add_record(step_id, total_loss)
                log_train_acc.add_record(step_id, acc_train)
                step_id += 1
                print("loss over the last 5 batches: %.7f, at batch %d" % (total_loss, batch_id))
                total_loss = 0.0

            batch_id += 1


        # validate
        total_loss = 0
        total_acc = 0
        batch_id = 0
        for batch in valid_data_iter:
            input_data_feed = prepare_input(batch)
            fetch_outs = exe.run(program=inference_program,
                                 feed=input_data_feed,
                                 fetch_list=[loss.name, acc.name],
                                 use_program_cache=False)

            cost_val = np.array(fetch_outs[0])
            acc_val = fetch_outs[1]
            total_loss += cost_val
            # Accumulate validation accuracy for the epoch average recorded below.
            total_acc += acc_val
            batch_id += 1

        log_valid_loss.add_record(epoch_id, total_loss)
        log_valid_acc.add_record(epoch_id, total_acc / batch_id)
        print("validation loss: %.7f"  % (total_loss))

    fluid.io.save_inference_model(
        dirname=params_path,
        feeded_var_names=['input', 'input_seqlen'], 
        target_vars=[loss, acc], 
        executor=exe)
Esempio n. 23
0
        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a variable)
        loss = loss_function(
            log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    losses.append(total_loss)
print(losses)  # The loss decreased every iteration over the training data!

# VisualDL setup
logw = LogWriter("./embedding_log", sync_cycle=10000)
with logw.mode('train') as logger:
    embedding = logger.embedding()

embeddings_list = model.embeddings.weight.data.numpy()  # convert to numpy array

# VisualDL embedding log writer takes two parameters
# The first parameter is embedding list. The type is list[list[float]]
# The second parameter is word_dict. The type is dictionary<string, int>.
embedding.add_embeddings_with_word_dict(embeddings_list, word_to_ix)
Esempio n. 24
0
# coding=utf-8
from visualdl import LogWriter

# Create the LogWriter object
log_writer = LogWriter(".", sync_cycle=20)

# Create scalar components, with mode set to train
with log_writer.mode("train") as logger:
    train_acc = logger.scalar("acc")
    train_loss = logger.scalar("loss")

# Create a scalar component, with mode set to test and tag set to acc
with log_writer.mode("test") as logger:
    test_acc = logger.scalar("acc")

value = [i / 1000.0 for i in range(1000)]
for step in range(1000):
    # Add train-mode data to the chart named acc
    train_acc.add_record(step, value[step])

    # Add train-mode data to the chart named loss
    train_loss.add_record(step, 1 / (value[step] + 1))

    # Add test-mode data to the chart named acc
    test_acc.add_record(step, 1 - value[step])
Esempio n. 25
0
def get_result(test_for):
    """
    get log from db and produce protobuf logs
    :return:
    """
    result_logs = bm.ViewVisualDLLog.objects.filter(test_for=test_for)
    if not result_logs:
        print("no {} results in latest paddle version".format(test_for))
        return
    paddle_version = result_logs[0].paddle_version if result_logs else ''
    version_path = os.path.join(conf.ROOT_PATH, 'visualdl_logs',
                                paddle_version)
    cmd = "if [ ! -d %s ]; then mkdir %s; fi" % (version_path, version_path)
    os.system(cmd)
    logdir = os.path.join(version_path, test_for)
    #logdir_des = conf.ROOT_PATH + '/visualdl_logs/latest'
    logdir_des = os.path.join(conf.ROOT_PATH, 'visualdl_logs', 'latest',
                              test_for)
    cmd = "if [ -e %s ]; then rm -rf %s; fi; mkdir %s" % (logdir, logdir,
                                                          logdir)
    os.system(cmd)

    log_writer = LogWriter(logdir, sync_cycle=1)

    def sample_log(result_log_dict, model, run_machine_type):
        """sample log from db log depends on model and run_machine_type"""
        if model == 'ocr':
            sample_ratio = 1
            if run_machine_type.startswith("MULTI_MACHINE_MULTI"):
                sample_ratio = 62
            elif run_machine_type.startswith("MULTI_MACHINE_ONE"):
                sample_ratio = 15
            elif run_machine_type.startswith("ONE"):
                sample_ratio = 15
            elif run_machine_type.startswith("FOUR"):
                sample_ratio = 15
            elif run_machine_type.startswith("MULTI_GPU"):
                sample_ratio = 15

            for k, v in result_log_dict.items():
                sample_list = [
                    v[index] for index in range(len(v))
                    if index % sample_ratio == 0
                ]
                result_log_dict[k] = [[index + 1, sample_list[index][1]]
                                      for index in range(len(sample_list))]

        return result_log_dict

    for log in result_logs:
        model = log.model
        test_for = log.test_for
        #code_from = log.code_from
        run_rpc_type = log.run_rpc_type.lower()
        run_machine_type = log.run_machine_type.lower()
        tag = "%s_%s_%s" % (test_for.split('_')[0], run_machine_type,
                            run_rpc_type)
        result_log_dict = json.loads(log.result_log)
        #sample_log_dict = sample_log(result_log_dict, model, run_machine_type)
        print("visualdl_paint cur is: %s_%s_%s" %
              (model, tag, log.cloud_job_id))
        for indicant, values in result_log_dict.items():
            # Use a separate name for the mode writer so the LogWriter itself
            # is not rebound on each iteration.
            with log_writer.mode(indicant) as mode_logger:
                val_tag = mode_logger.scalar("%s/%s" % (model, tag))
                for step, value in values:
                    if value != 'NaN':
                        val_tag.add_record(int(step), float(value))

    cmd = "rm -rf %s && cp -r %s %s" % (logdir_des, logdir, logdir_des)
    os.system(cmd)
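
The directory handling above shells out to mkdir, rm -rf and cp -r through os.system. A sketch of the same steps using only the standard library is shown below; the helper names are hypothetical, and equivalence to the original shell commands is assumed rather than verified against the original environment.

import os
import shutil

def prepare_log_dirs(version_path, logdir):
    """Standard-library equivalent of the mkdir / rm -rf / mkdir shell commands."""
    os.makedirs(version_path, exist_ok=True)
    if os.path.exists(logdir):
        shutil.rmtree(logdir)
    os.makedirs(logdir)

def publish_latest(logdir, logdir_des):
    """Replace the 'latest' copy of the logs (rm -rf followed by cp -r)."""
    shutil.rmtree(logdir_des, ignore_errors=True)
    shutil.copytree(logdir, logdir_des)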