コード例 #1
0
def main(args):
    """Train and/or evaluate the ERNIE DocRED model configured by ``args``.

    The function builds a training program (when ``args.do_train``) and a
    test program (when ``args.do_val`` or ``args.do_test``), optionally
    transpiles for NCCL2 distributed mode, runs the startup program, loads
    ``args.init_checkpoint`` if given, trains until the reader raises
    ``EOFException``, saves the persistables, and finally runs the dev
    evaluation and/or test prediction on trainer 0.

    Args:
        args: Parsed option namespace. The attributes read here include
            ``model_path``, ``data_path``, ``use_cuda``, ``in_tokens``,
            ``batch_size``, ``max_seq_len``, ``epoch``, ``skip_steps``,
            ``save_checkpoints``, ``init_checkpoint``, ``is_distributed``
            and the ``do_train`` / ``do_val`` / ``do_test`` switches.

    Raises:
        ValueError: If none of ``do_train``/``do_val``/``do_test`` is set,
            if ``in_tokens`` is set with ``batch_size < max_seq_len``, if
            only validation/testing is requested without an
            ``init_checkpoint``, or if distributed mode is requested while
            ``PADDLE_TRAINER_ENDPOINTS`` is unset.
    """
    ernie_config = ErnieConfig(
        os.path.join(args.model_path, "ernie_config.json"))
    ernie_config.print_config()

    # Pick the executor place and count the devices used for data parallelism.
    if args.use_cuda:
        dev_list = fluid.cuda_places()
        place = dev_list[0]
        dev_count = len(dev_list)
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    reader = DocREDReader(vocab_path=os.path.join(args.model_path,
                                                  "vocab.txt"),
                          label_map_config=os.path.join(
                              args.data_path, "label_map.json"),
                          max_seq_len=args.max_seq_len,
                          max_ent_cnt=args.max_ent_cnt,
                          do_lower_case=args.do_lower_case,
                          in_tokens=args.in_tokens,
                          random_seed=args.random_seed)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = reader.data_generator(
            data_dir=args.data_path,
            mode='train',
            batch_size=args.batch_size,
            epoch=args.epoch)

        num_train_examples = reader.get_num_train_examples(args.data_path)

        if args.in_tokens:
            # With in_tokens, batch_size is divided by max_seq_len below to
            # get the per-step divisor, so it must be at least max_seq_len.
            if args.batch_size < args.max_seq_len:
                raise ValueError(
                    'if in_tokens=True, batch_size should greater than max_sqelen, got batch_size:%d seqlen:%d'
                    % (args.batch_size, args.max_seq_len))

            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        log.info("Device count: %d" % dev_count)
        log.info("Num train examples: %d" % num_train_examples)
        log.info("Max train steps: %d" % max_train_steps)
        log.info("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config)
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            log.info("Theoretical memory usage in training: %.3f - %.3f %s" %
                     (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                # NOTE: this rebinds graph_vars; when training is enabled too,
                # the later code uses these variables by name.
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config)

        test_prog = test_prog.clone(for_test=True)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    if args.is_distributed:
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        if worker_endpoints_env is None:
            # Fail with a clear message instead of an AttributeError on
            # None.split below.
            raise ValueError(
                "Environment variable PADDLE_TRAINER_ENDPOINTS must be set "
                "when `is_distributed` is enabled.")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)

        log.info("worker_endpoints:{} trainers_num:{} current_endpoint:{} "
                 "trainer_id:{}".format(worker_endpoints, trainers_num,
                                        current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_program if args.do_train else test_prog,
                    startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    # A checkpoint is optional when training but mandatory for eval/test-only
    # runs (at least one do_* flag is True, so "not do_train" means eval/test).
    if not args.do_train and not args.init_checkpoint:
        raise ValueError("args 'init_checkpoint' should be set if "
                         "only doing validation or testing!")
    if args.init_checkpoint:
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=graph_vars["loss"].name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program,
                                           num_trainers=nccl2_num_trainers,
                                           trainer_id=nccl2_trainer_id)

        train_pyreader.set_batch_generator(train_data_generator)
    else:
        train_exe = None

    if args.do_val or args.do_test:
        test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                          main_program=test_prog,
                                          share_vars_from=train_exe)

    # Stays None until a training step logs progress; the final evaluation
    # falls back to querying the reader in that case.
    current_epoch = None

    if args.do_train:
        train_pyreader.start()
        steps = 0
        graph_vars["learning_rate"] = scheduled_lr

        time_begin = time.time()
        while True:
            try:
                steps += 1
                if steps % args.skip_steps != 0:
                    # Non-logging step: run without fetching anything.
                    train_exe.run(fetch_list=[])
                else:
                    fetch_list = [
                        graph_vars["loss"].name,
                        graph_vars["logits"].name,
                        graph_vars["ent_masks"].name,
                        graph_vars["label_ids"].name,
                        graph_vars['learning_rate'].name,
                    ]

                    out = train_exe.run(fetch_list=fetch_list)
                    np_loss, np_logits, np_ent_masks, np_label_ids, np_lr = out
                    lr = float(np_lr[0])
                    loss = np_loss.mean()

                    f1 = batch_eval(np_logits, np_ent_masks, np_label_ids)
                    if args.verbose:
                        log.info(
                            "train pyreader queue size: %d, learning rate: %f"
                            % (train_pyreader.queue.size(),
                               lr if warmup_steps > 0 else args.learning_rate))

                    current_example, current_epoch = reader.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin
                    log.info(
                        "epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                        "f1: %f, speed: %f steps/s" %
                        (current_epoch, current_example, num_train_examples,
                         steps, loss, f1, args.skip_steps / used_time))
                    time_begin = time.time()

            except fluid.core.EOFException:
                # Reader exhausted: save a checkpoint and leave the loop.
                save_path = os.path.join(args.save_checkpoints,
                                         "step_" + str(steps))
                log.info("saving to checkpoint: " +
                         str(args.save_checkpoints) + "/step_%d" % steps)
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if nccl2_trainer_id == 0 and args.do_val:
        if current_epoch is None:
            # No training ran, or it finished before any logged step.
            current_example, current_epoch = reader.get_train_progress()
        evaluate_wrapper(reader, exe, test_prog, test_pyreader, graph_vars,
                         current_epoch, 'final')

    if nccl2_trainer_id == 0 and args.do_test:
        predict_wrapper(reader, exe, test_prog, test_pyreader, graph_vars)
コード例 #2
0
ファイル: train.py プロジェクト: nepeplwu/PaddleSeg
def train(cfg):
    """Run the segmentation training loop described by ``cfg``.

    Builds the training program, feeds it from ``SegDataset`` through a
    py_reader, optionally resumes from a checkpoint or loads shape-compatible
    pretrained parameters, then trains from the begin epoch up to
    ``cfg.SOLVER.NUM_EPOCHS``. A checkpoint is saved every
    ``cfg.TRAIN.SNAPSHOT_EPOCH`` epochs (optionally followed by evaluation
    and visualization) and once more as ``'final'`` at the end.

    Besides ``cfg`` this function reads the module-level ``args`` namespace
    (``use_mpio``, ``use_gpu``, ``debug``, ``use_tb``, ``tb_log_dir``,
    ``log_steps``, ``do_eval``).

    Args:
        cfg: Configuration object providing the ``DATASET``, ``DATALOADER``,
            ``TRAIN``, ``SOLVER``, ``MODEL`` and ``BATCH_SIZE`` settings
            used below.

    Raises:
        ValueError: If the begin epoch is larger than
            ``cfg.SOLVER.NUM_EPOCHS``.
        SystemExit: If ``args.use_tb`` is set without ``args.tb_log_dir``.
    """
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    drop_last = True

    dataset = SegDataset(file_list=cfg.DATASET.TRAIN_FILE_LIST,
                         mode=ModelPhase.TRAIN,
                         shuffle=True,
                         data_dir=cfg.DATASET.DATA_DIR)

    def data_generator():
        """Yield samples one at a time, released in groups of BATCH_SIZE."""
        if args.use_mpio:
            data_gen = dataset.multiprocess_generator(
                num_processes=cfg.DATALOADER.NUM_WORKERS,
                max_queue_size=cfg.DATALOADER.BUF_SIZE)
        else:
            data_gen = dataset.generator()

        batch_data = []
        for b in data_gen:
            batch_data.append(b)
            if len(batch_data) == cfg.BATCH_SIZE:
                for item in batch_data:
                    yield item[0], item[1], item[2]
                batch_data = []
        # If the sync batch norm strategy is used, drop the last batch when
        # it holds fewer than cfg.BATCH_SIZE samples to avoid NCCL hangs.
        if not cfg.TRAIN.SYNC_BATCH_NORM:
            for item in batch_data:
                yield item[0], item[1], item[2]

    # Get device environment
    places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()
    place = places[0]
    # Get number of devices
    dev_count = len(places)
    print("#GPU-Devices: {}".format(dev_count))

    # Make sure BATCH_SIZE is divisible by the number of devices
    assert cfg.BATCH_SIZE % dev_count == 0, (
        'BATCH_SIZE:{} not divisble by number of GPUs:{}'.format(
            cfg.BATCH_SIZE, dev_count))
    # In multi-device training the batch is split evenly across devices
    batch_size_per_dev = cfg.BATCH_SIZE // dev_count
    print("batch_size_per_dev: {}".format(batch_size_per_dev))

    py_reader, avg_loss, lr, pred, grts, masks = build_model(
        train_prog, startup_prog, phase=ModelPhase.TRAIN)
    py_reader.decorate_sample_generator(data_generator,
                                        batch_size=batch_size_per_dev,
                                        drop_last=drop_last)

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    exec_strategy = fluid.ExecutionStrategy()
    if args.use_gpu:
        exec_strategy.num_threads = fluid.core.get_cuda_device_count()
    # Clear temporary variables every 100 iterations
    exec_strategy.num_iteration_per_drop_scope = 100
    build_strategy = fluid.BuildStrategy()
    if cfg.TRAIN.SYNC_BATCH_NORM and args.use_gpu:
        if dev_count > 1:
            # Apply sync batch norm strategy
            print("Sync BatchNorm strategy is effective.")
            build_strategy.sync_batch_norm = True
        else:
            print("Sync BatchNorm strategy will not be effective if GPU device"
                  " count <= 1")
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=avg_loss.name,
        exec_strategy=exec_strategy,
        build_strategy=build_strategy)

    # Resume training
    begin_epoch = cfg.SOLVER.BEGIN_EPOCH
    if cfg.TRAIN.RESUME:
        begin_epoch = load_checkpoint(exe, train_prog)
    # Load pretrained model
    elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL):
        print('Pretrained model dir:', cfg.TRAIN.PRETRAINED_MODEL)
        load_vars = []

        def var_shape_matched(var, shape):
            """
            Return True only if a saved file for ``var`` exists in the
            pretrained model dir and its stored shape equals ``shape``.
            """
            var_exist = os.path.exists(
                os.path.join(cfg.TRAIN.PRETRAINED_MODEL, var.name))
            if var_exist:
                var_shape = parse_shape_from_file(
                    os.path.join(cfg.TRAIN.PRETRAINED_MODEL, var.name))
                if var_shape == shape:
                    return True
                else:
                    print(
                        "Variable[{}] shape does not match current network, skip"
                        " to load it.".format(var.name))
                    return False
            # No saved file for this variable: nothing to load.
            return False

        for x in train_prog.list_vars():
            if isinstance(x, fluid.framework.Parameter):
                shape = tuple(fluid.global_scope().find_var(
                    x.name).get_tensor().shape())
                if var_shape_matched(x, shape):
                    load_vars.append(x)
        if cfg.MODEL.FP16:
            # In FP16 training mode the FP16 vars are loaded separately
            load_fp16_vars(exe, cfg.TRAIN.PRETRAINED_MODEL, train_prog)
        else:
            fluid.io.load_vars(exe,
                               dirname=cfg.TRAIN.PRETRAINED_MODEL,
                               vars=load_vars)
        print("Pretrained model loaded successfully!")
    else:
        print('Pretrained model dir {} not exists, training from scratch...'.
              format(cfg.TRAIN.PRETRAINED_MODEL))

    # The fetch list holds variable *names*, so rebinding avg_loss / lr /
    # pred / grts / masks to numeric results further down is safe.
    fetch_list = [avg_loss.name, lr.name]
    if args.debug:
        # Fetch more variable info and use a streaming confusion matrix to
        # calculate IoU results in debug mode
        np.set_printoptions(precision=4,
                            suppress=True,
                            linewidth=160,
                            floatmode="fixed")
        fetch_list.extend([pred.name, grts.name, masks.name])
        cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True)

    if args.use_tb:
        if not args.tb_log_dir:
            print("Please specify the log directory by --tb_log_dir.")
            # sys.exit instead of the site-provided exit() builtin, which is
            # not guaranteed to exist in every interpreter setup.
            sys.exit(1)

        from tb_paddle import SummaryWriter

        # Start from an empty log directory.
        if os.path.exists(args.tb_log_dir):
            shutil.rmtree(args.tb_log_dir)
        log_writer = SummaryWriter(args.tb_log_dir)

    global_step = 0
    # Total number of steps over the remaining epochs, used for the ETA.
    all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE
    if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and not drop_last:
        all_step += 1
    all_step *= (cfg.SOLVER.NUM_EPOCHS - begin_epoch + 1)

    # From here on avg_loss is the running loss sum between two log lines.
    avg_loss = 0.0
    timer = Timer()
    timer.start()
    if begin_epoch > cfg.SOLVER.NUM_EPOCHS:
        raise ValueError((
            "begin epoch[{}] is larger than cfg.SOLVER.NUM_EPOCHS[{}]").format(
                begin_epoch, cfg.SOLVER.NUM_EPOCHS))

    if args.use_mpio:
        print("Use multiprocess reader")
    else:
        print("Use multi-thread reader")

    for epoch in range(begin_epoch, cfg.SOLVER.NUM_EPOCHS + 1):
        py_reader.start()
        while True:
            try:
                if args.debug:
                    # Print category IoU and accuracy to check whether the
                    # training process corresponds to expectation
                    loss, lr, pred, grts, masks = exe.run(
                        program=compiled_train_prog,
                        fetch_list=fetch_list,
                        return_numpy=True)

                    cm.calculate(pred, grts, masks)
                    avg_loss += np.mean(np.array(loss))
                    global_step += 1

                    if global_step % args.log_steps == 0:
                        speed = args.log_steps / timer.elapsed_time()
                        avg_loss /= args.log_steps
                        category_acc, mean_acc = cm.accuracy()
                        category_iou, mean_iou = cm.mean_iou()

                        print((
                            "epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, global_step, lr[0], avg_loss, mean_acc,
                                 mean_iou, speed,
                                 calculate_eta(all_step - global_step, speed)))
                        print("Category IoU:", category_iou)
                        print("Category Acc:", category_acc)
                        if args.use_tb:
                            log_writer.add_scalar('Train/mean_iou', mean_iou,
                                                  global_step)
                            log_writer.add_scalar('Train/mean_acc', mean_acc,
                                                  global_step)
                            log_writer.add_scalar('Train/loss', avg_loss,
                                                  global_step)
                            log_writer.add_scalar('Train/lr', lr[0],
                                                  global_step)
                            log_writer.add_scalar('Train/step/sec', speed,
                                                  global_step)
                        sys.stdout.flush()
                        avg_loss = 0.0
                        cm.zero_matrix()
                        timer.restart()
                else:
                    # Outside debug mode, skip the extra fetches and metrics
                    loss, lr = exe.run(program=compiled_train_prog,
                                       fetch_list=fetch_list,
                                       return_numpy=True)
                    avg_loss += np.mean(np.array(loss))
                    global_step += 1

                    if global_step % args.log_steps == 0:
                        avg_loss /= args.log_steps
                        speed = args.log_steps / timer.elapsed_time()
                        print((
                            "epoch={} step={} lr={:.5f} loss={:.4f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, global_step, lr[0], avg_loss, speed,
                                 calculate_eta(all_step - global_step, speed)))
                        if args.use_tb:
                            log_writer.add_scalar('Train/loss', avg_loss,
                                                  global_step)
                            log_writer.add_scalar('Train/lr', lr[0],
                                                  global_step)
                            log_writer.add_scalar('Train/speed', speed,
                                                  global_step)
                        sys.stdout.flush()
                        avg_loss = 0.0
                        timer.restart()

            except fluid.core.EOFException:
                # End of the epoch's data: reset the reader and move on.
                py_reader.reset()
                break
            except Exception as e:
                # NOTE(review): best-effort - the error is printed and the
                # loop retries; a persistent failure would loop forever.
                print(e)

        if epoch % cfg.TRAIN.SNAPSHOT_EPOCH == 0:
            ckpt_dir = save_checkpoint(exe, train_prog, epoch)

            if args.do_eval:
                print("Evaluation start")
                _, mean_iou, _, mean_acc = evaluate(cfg=cfg,
                                                    ckpt_dir=ckpt_dir,
                                                    use_gpu=args.use_gpu,
                                                    use_mpio=args.use_mpio)
                if args.use_tb:
                    log_writer.add_scalar('Evaluate/mean_iou', mean_iou,
                                          global_step)
                    log_writer.add_scalar('Evaluate/mean_acc', mean_acc,
                                          global_step)

            # Use Tensorboard to visualize results
            if args.use_tb and cfg.DATASET.VIS_FILE_LIST is not None:
                visualize(cfg=cfg,
                          use_gpu=args.use_gpu,
                          vis_file_list=cfg.DATASET.VIS_FILE_LIST,
                          vis_dir="visual",
                          ckpt_dir=ckpt_dir,
                          log_writer=log_writer)

    # save final model
    save_checkpoint(exe, train_prog, 'final')
コード例 #3
0
def train():
    """Train the RCNN model configured by the module-level ``cfg``.

    Builds the model and its losses in the default fluid programs, sets up a
    Momentum optimizer with a warm-up/step learning-rate schedule, optionally
    loads pretrained variables, then runs either the py_reader loop or the
    feed-based loop (selected by ``cfg.use_pyreader``). Persistables are
    saved every ``cfg.TRAIN.snapshot_iter`` iterations and once more as
    ``model_final`` at the end.
    """
    learning_rate = cfg.learning_rate
    image_shape = [3, cfg.TRAIN.max_size, cfg.TRAIN.max_size]

    if cfg.enable_ce:
        # Fixed seeds for the reproducible "ce" runs.
        fluid.default_startup_program().random_seed = 1000
        fluid.default_main_program().random_seed = 1000
        import random
        random.seed(0)
        np.random.seed(0)

    # Number of comma-separated entries in CUDA_VISIBLE_DEVICES (1 if unset).
    devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    devices_num = len(devices.split(","))
    total_batch_size = devices_num * cfg.TRAIN.im_per_batch

    use_random = True
    if cfg.enable_ce:
        use_random = False
    model = model_builder.RCNN(
        add_conv_body_func=resnet.add_ResNet50_conv4_body,
        add_roi_box_head_func=resnet.add_ResNet_roi_conv5_head,
        use_pyreader=cfg.use_pyreader,
        use_random=use_random)
    model.build_model(image_shape)
    losses, keys = model.loss()
    loss = losses[0]
    fetch_list = losses

    # Piecewise learning rate: one value per interval between boundaries.
    boundaries = cfg.lr_steps
    gamma = cfg.lr_gamma
    step_num = len(cfg.lr_steps)
    values = [learning_rate * (gamma**i) for i in range(step_num + 1)]

    lr = exponential_with_warmup_decay(learning_rate=learning_rate,
                                       boundaries=boundaries,
                                       values=values,
                                       warmup_iter=cfg.warm_up_iter,
                                       warmup_factor=cfg.warm_up_factor)
    optimizer = fluid.optimizer.Momentum(
        learning_rate=lr,
        regularization=fluid.regularizer.L2Decay(cfg.weight_decay),
        momentum=cfg.momentum)
    optimizer.minimize(loss)
    # The learning rate is fetched last; the loops read it via outs[-1].
    fetch_list = fetch_list + [lr]

    for var in fetch_list:
        var.persistable = True

    #fluid.memory_optimize(fluid.default_main_program(), skip_opt_set=set(fetch_list))

    place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if cfg.pretrained_model:

        def if_exist(var):
            """Return True if a file named after ``var`` exists in the pretrained dir."""
            return os.path.exists(os.path.join(cfg.pretrained_model, var.name))

        fluid.io.load_vars(exe, cfg.pretrained_model, predicate=if_exist)

    if cfg.parallel:
        build_strategy = fluid.BuildStrategy()
        build_strategy.memory_optimize = False
        build_strategy.enable_inplace = False

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = True
        train_exe = fluid.ParallelExecutor(use_cuda=bool(cfg.use_gpu),
                                           loss_name=loss.name,
                                           build_strategy=build_strategy,
                                           exec_strategy=exec_strategy)
    else:
        train_exe = exe

    shuffle = True
    if cfg.enable_ce:
        shuffle = False
    if cfg.use_pyreader:
        train_reader = reader.train(batch_size=cfg.TRAIN.im_per_batch,
                                    total_batch_size=total_batch_size,
                                    padding_total=cfg.TRAIN.padding_minibatch,
                                    shuffle=shuffle)
        py_reader = model.py_reader
        py_reader.decorate_paddle_reader(train_reader)
    else:
        train_reader = reader.train(batch_size=total_batch_size,
                                    shuffle=shuffle)
        feeder = fluid.DataFeeder(place=place, feed_list=model.feeds())

    def save_model(postfix):
        """Save persistables under ``cfg.model_save_dir/postfix``, replacing any old dir."""
        model_path = os.path.join(cfg.model_save_dir, postfix)
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)
        fluid.io.save_persistables(exe, model_path)

    def train_loop_pyreader():
        """Run up to ``cfg.max_iter`` iterations fed by the py_reader."""
        py_reader.start()
        train_stats = TrainingStats(cfg.log_window, keys)
        try:
            start_time = time.time()
            # Kept separately because start_time is overwritten per iteration.
            loop_start = start_time
            prev_start_time = start_time
            outs = None
            for iter_id in range(cfg.max_iter):
                prev_start_time = start_time
                start_time = time.time()
                outs = train_exe.run(fetch_list=[v.name for v in fetch_list])
                stats = {
                    k: np.array(v).mean()
                    for k, v in zip(keys, outs[:-1])
                }
                train_stats.update(stats)
                logs = train_stats.log()
                # The reported time is the gap between the starts of the
                # previous and the current iteration.
                strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
                    now_time(), iter_id, np.mean(outs[-1]), logs,
                    start_time - prev_start_time)
                print(strs)
                sys.stdout.flush()
                if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
                    save_model("model_iter{}".format(iter_id))
            if outs is None:
                # No iteration ran (cfg.max_iter <= 0): nothing to report.
                return
            end_time = time.time()
            # Measure the whole loop, not just the last iteration.
            total_time = end_time - loop_start
            last_loss = np.array(outs[0]).mean()
            if cfg.enable_ce:
                gpu_num = devices_num
                epoch_idx = iter_id + 1
                print("kpis\teach_pass_duration_card%s\t%s" %
                      (gpu_num, total_time / epoch_idx))
                print("kpis\ttrain_loss_card%s\t%s" % (gpu_num, last_loss))
        except (StopIteration, fluid.core.EOFException):
            py_reader.reset()

    def train_loop():
        """Run the feed-based loop; return the mean of the last fetched loss.

        Returns ``None`` when the reader yields no batch at all.
        """
        start_time = time.time()
        prev_start_time = start_time
        # Kept separately because start_time is overwritten per iteration.
        loop_start = start_time
        train_stats = TrainingStats(cfg.log_window, keys)
        outs = None
        for iter_id, data in enumerate(train_reader()):
            prev_start_time = start_time
            start_time = time.time()
            outs = train_exe.run(fetch_list=[v.name for v in fetch_list],
                                 feed=feeder.feed(data))
            stats = {k: np.array(v).mean() for k, v in zip(keys, outs[:-1])}
            train_stats.update(stats)
            logs = train_stats.log()
            strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
                now_time(), iter_id, np.mean(outs[-1]), logs,
                start_time - prev_start_time)
            print(strs)
            sys.stdout.flush()
            if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
                save_model("model_iter{}".format(iter_id))
            if (iter_id + 1) == cfg.max_iter:
                break
        if outs is None:
            # The reader produced no data, so there is no loss to report.
            return None
        end_time = time.time()
        # Measure the whole loop, not just the last iteration.
        total_time = end_time - loop_start
        last_loss = np.array(outs[0]).mean()
        # only for ce
        if cfg.enable_ce:
            gpu_num = devices_num
            epoch_idx = iter_id + 1
            print("kpis\teach_pass_duration_card%s\t%s" %
                  (gpu_num, total_time / epoch_idx))
            print("kpis\ttrain_loss_card%s\t%s" % (gpu_num, last_loss))

        # The original returned np.mean(every_pass_loss), an undefined name
        # that raised NameError before the final model could be saved.
        return last_loss

    if cfg.use_pyreader:
        train_loop_pyreader()
    else:
        train_loop()
    save_model('model_final')
コード例 #4
0
def main():
    """Train a detection model described by ``FLAGS.config``.

    Loads and validates the config, builds the training program (and an
    evaluation program when ``FLAGS.eval`` is set), restores weights from a
    checkpoint or pretrained model, then runs iterations from the resume point
    up to ``cfg.max_iters`` with periodic logging, checkpointing and optional
    evaluation.
    """
    env = os.environ
    # Distributed mode is inferred purely from the launcher's environment.
    FLAGS.dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env
    if FLAGS.dist:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        # Give every trainer its own seed.
        local_seed = (99 + trainer_id)
        random.seed(local_seed)
        np.random.seed(local_seed)

    # enable_ce overrides any per-trainer seed set above with a fixed one.
    if FLAGS.enable_ce:
        random.seed(0)
        np.random.seed(0)

    cfg = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    check_config(cfg)
    # check if set use_gpu=True in paddlepaddle cpu version
    # NOTE(review): the next line unconditionally forces CPU execution, which
    # makes every `cfg.use_gpu` branch below take the CPU path regardless of
    # the config file. It looks like a local override -- confirm it is intended.
    cfg.use_gpu = False
    check_gpu(cfg.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()

    # Configs flagged as prediction-only cannot be trained.
    save_only = getattr(cfg, 'save_prediction_only', False)
    if save_only:
        raise NotImplementedError('The config file only support prediction,'
                                  ' training stage is not implemented now')
    main_arch = cfg.architecture

    if cfg.use_gpu:
        devices_num = fluid.core.get_cuda_device_count()
    else:
        devices_num = int(os.environ.get('CPU_NUM', 1))

    if 'FLAGS_selected_gpus' in env:
        device_id = int(env['FLAGS_selected_gpus'])
    else:
        device_id = 0
    place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    lr_builder = create('LearningRate')
    optim_builder = create('OptimizerBuilder')

    # build program
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    if FLAGS.enable_ce:
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = create(main_arch)
            if FLAGS.fp16:
                assert (getattr(model.backbone, 'norm_type', None)
                        != 'affine_channel'), \
                    '--fp16 currently does not support affine channel, ' \
                    ' please modify backbone settings to use batch norm'

            with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx:
                inputs_def = cfg['TrainReader']['inputs_def']
                feed_vars, train_loader = model.build_inputs(**inputs_def)
                train_fetches = model.train(feed_vars)
                loss = train_fetches['loss']
                # Under fp16 the loss is multiplied by the loss-scale variable
                # before the optimizer is attached ...
                if FLAGS.fp16:
                    loss *= ctx.get_loss_scale_var()
                lr = lr_builder()
                optimizer = optim_builder(lr)
                optimizer.minimize(loss)

                # ... and divided by it again afterwards (presumably so the
                # fetched loss is reported unscaled -- confirm).
                if FLAGS.fp16:
                    loss /= ctx.get_loss_scale_var()

            if 'use_ema' in cfg and cfg['use_ema']:
                global_steps = _decay_step_counter()
                ema = ExponentialMovingAverage(cfg['ema_decay'],
                                               thres_steps=global_steps)
                ema.update()

    # parse train fetches
    train_keys, train_values, _ = parse_fetches(train_fetches)
    # The learning rate is appended last, so it is always `outs[-1]` in the
    # training loop while `outs[:-1]` lines up with `train_keys`.
    train_values.append(lr)

    if FLAGS.eval:
        eval_prog = fluid.Program()
        with fluid.program_guard(eval_prog, startup_prog):
            with fluid.unique_name.guard():
                # NOTE: `model` is rebound here, so all later uses of `model`
                # (sync_bn / fuse_bn checks, mask resolution) refer to this
                # instance when evaluation is enabled.
                model = create(main_arch)
                inputs_def = cfg['EvalReader']['inputs_def']
                feed_vars, eval_loader = model.build_inputs(**inputs_def)
                fetches = model.eval(feed_vars)
        eval_prog = eval_prog.clone(True)

        eval_reader = create_reader(cfg.EvalReader, devices_num=1)
        eval_loader.set_sample_list_generator(eval_reader, place)

        # parse eval fetches
        # Each metric needs a different set of extra variables fetched.
        extra_keys = []
        if cfg.metric == 'COCO':
            extra_keys = ['im_info', 'im_id', 'im_shape']
        if cfg.metric == 'VOC':
            extra_keys = ['gt_bbox', 'gt_class', 'is_difficult']
        if cfg.metric == 'WIDERFACE':
            extra_keys = ['im_id', 'im_shape', 'gt_bbox']
        eval_keys, eval_values, eval_cls = parse_fetches(
            fetches, eval_prog, extra_keys)

    # compile program for multi-devices
    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_optimizer_ops = False
    # only enable sync_bn in multi GPU devices
    sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn'
    build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \
        and cfg.use_gpu

    exec_strategy = fluid.ExecutionStrategy()
    # iteration number when CompiledProgram tries to drop local execution scopes.
    # Set it to be 1 to save memory usages, so that unused variables in
    # local execution scopes can be deleted after each iteration.
    exec_strategy.num_iteration_per_drop_scope = 1
    if FLAGS.dist:
        dist_utils.prepare_for_multi_process(exe, build_strategy, startup_prog,
                                             train_prog)
        exec_strategy.num_threads = 1

    exe.run(startup_prog)
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    if FLAGS.eval:
        compiled_eval_prog = fluid.CompiledProgram(eval_prog)

    fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel'

    ignore_params = cfg.finetune_exclude_pretrained_params \
                 if 'finetune_exclude_pretrained_params' in cfg else []

    # Weight initialisation priority: resume checkpoint, then pretrained
    # weights with BN fusing (only when no params are excluded), then plain
    # pretrained weights.
    start_iter = 0
    if FLAGS.resume_checkpoint:
        checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint)
        start_iter = checkpoint.global_step()
    elif cfg.pretrain_weights and fuse_bn and not ignore_params:
        checkpoint.load_and_fusebn(exe, train_prog, cfg.pretrain_weights)
    elif cfg.pretrain_weights:
        checkpoint.load_params(exe,
                               train_prog,
                               cfg.pretrain_weights,
                               ignore_params=ignore_params)

    # The reader is asked for the remaining iterations times the device count.
    train_reader = create_reader(cfg.TrainReader,
                                 (cfg.max_iters - start_iter) * devices_num,
                                 cfg,
                                 devices_num=devices_num)
    train_loader.set_sample_list_generator(train_reader, place)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False
    if hasattr(model, 'is_bbox_normalized') and \
            callable(model.is_bbox_normalized):
        is_bbox_normalized = model.is_bbox_normalized()

    # if map_type not set, use default 11point, only use in VOC eval
    map_type = cfg.map_type if 'map_type' in cfg else '11point'

    train_stats = TrainingStats(cfg.log_smooth_window, train_keys)
    train_loader.start()
    start_time = time.time()
    end_time = time.time()

    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)
    # Sliding window of recent iteration durations used for the ETA estimate.
    time_stat = deque(maxlen=cfg.log_smooth_window)
    best_box_ap_list = [0.0, 0]  # [best mAP so far, iteration it was reached]

    # use VisualDL to log data
    if FLAGS.use_vdl:
        assert six.PY3, "VisualDL requires Python >= 3.5"
        from visualdl import LogWriter
        vdl_writer = LogWriter(FLAGS.vdl_log_dir)
        vdl_loss_step = 0
        vdl_mAP_step = 0

    for it in range(start_iter, cfg.max_iters):
        # The duration recorded here spans from the previous iteration's
        # timestamp to this one, i.e. it covers the previous iteration's work.
        start_time = end_time
        end_time = time.time()
        time_stat.append(end_time - start_time)
        time_cost = np.mean(time_stat)
        eta_sec = (cfg.max_iters - it) * time_cost
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        outs = exe.run(compiled_train_prog, fetch_list=train_values)
        stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])}

        # use vdl-paddle to log loss
        if FLAGS.use_vdl:
            if it % cfg.log_iter == 0:
                for loss_name, loss_value in stats.items():
                    vdl_writer.add_scalar(loss_name, loss_value, vdl_loss_step)
                vdl_loss_step += 1

        train_stats.update(stats)
        logs = train_stats.log()
        # `trainer_id` only exists in distributed mode; the `not FLAGS.dist`
        # short-circuit keeps it from being read otherwise.
        if it % cfg.log_iter == 0 and (not FLAGS.dist or trainer_id == 0):
            strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format(
                it, np.mean(outs[-1]), logs, time_cost, eta)
            logger.info(strs)

        # NOTE : profiler tools, used for benchmark
        # Profiling starts at iteration 5 and stops at iteration 10, after
        # which main() returns immediately (the final train_loader.reset()
        # below is not reached on this path).
        if FLAGS.is_profiler and it == 5:
            profiler.start_profiler("All")
        elif FLAGS.is_profiler and it == 10:
            profiler.stop_profiler("total", FLAGS.profiler_path)
            return


        # Snapshot on every `snapshot_iter` multiple (except iteration 0) and
        # on the last iteration; only trainer 0 saves in distributed mode.
        if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1) \
           and (not FLAGS.dist or trainer_id == 0):
            save_name = str(it) if it != cfg.max_iters - 1 else "model_final"
            # Run the EMA apply program before saving/evaluating; the matching
            # restore program is run at the end of this branch.
            if 'use_ema' in cfg and cfg['use_ema']:
                exe.run(ema.apply_program)
            checkpoint.save(exe, train_prog, os.path.join(save_dir, save_name))

            if FLAGS.eval:
                # evaluation
                resolution = None
                if 'Mask' in cfg.architecture:
                    resolution = model.mask_head.resolution
                results = eval_run(exe,
                                   compiled_eval_prog,
                                   eval_loader,
                                   eval_keys,
                                   eval_values,
                                   eval_cls,
                                   cfg,
                                   resolution=resolution)
                box_ap_stats = eval_results(results, cfg.metric,
                                            cfg.num_classes, resolution,
                                            is_bbox_normalized,
                                            FLAGS.output_eval, map_type,
                                            cfg['EvalReader']['dataset'])

                # use vdl_paddle to log mAP
                if FLAGS.use_vdl:
                    vdl_writer.add_scalar("mAP", box_ap_stats[0], vdl_mAP_step)
                    vdl_mAP_step += 1

                # Keep a separate "best_model" checkpoint whenever the first
                # evaluation statistic improves.
                if box_ap_stats[0] > best_box_ap_list[0]:
                    best_box_ap_list[0] = box_ap_stats[0]
                    best_box_ap_list[1] = it
                    checkpoint.save(exe, train_prog,
                                    os.path.join(save_dir, "best_model"))
                logger.info("Best test box ap: {}, in iter: {}".format(
                    best_box_ap_list[0], best_box_ap_list[1]))

            if 'use_ema' in cfg and cfg['use_ema']:
                exe.run(ema.restore_program)

    train_loader.reset()
コード例 #5
0
ファイル: main.py プロジェクト: ljl8999/DALI
def main():
    """Train and validate a classifier whose input comes from DALI pipelines.

    Reads the trainer rank, world size and GPU id from the environment,
    builds a sharded training pipeline for every rank and an unsharded
    validation pipeline on rank 0, constructs the train/eval programs,
    transpiles the train program in "nccl2" mode and then alternates one
    training epoch with (rank 0 only) validation and checkpointing.
    """
    environ = os.environ
    FLAGS.local_rank = int(environ.get('PADDLE_TRAINER_ID', 0))
    FLAGS.world_size = int(environ.get('PADDLE_TRAINERS_NUM', 1))
    FLAGS.device_id = int(environ['FLAGS_selected_gpus'])
    FLAGS.whole_batch_size = FLAGS.world_size * FLAGS.batch_size

    # Arguments common to the training and the validation pipeline.
    shared_pipe_args = dict(batch_size=FLAGS.batch_size,
                            num_threads=FLAGS.num_threads,
                            device_id=FLAGS.device_id,
                            crop=224,
                            size=256,
                            dali_cpu=False)

    train_pipe = create_dali_pipeline(
        data_dir=os.path.join(FLAGS.data, 'train'),
        shard_id=FLAGS.local_rank,
        num_shards=FLAGS.world_size,
        is_training=True,
        **shared_pipe_args)
    train_pipe.build()
    sample_per_shard = train_pipe.epoch_size("Reader") // FLAGS.world_size
    train_loader = DALIClassificationIterator(train_pipe,
                                              reader_name="Reader")

    # Only rank 0 validates, and it reads the whole validation set.
    if FLAGS.local_rank == 0:
        val_pipe = create_dali_pipeline(
            data_dir=os.path.join(FLAGS.data, 'val'),
            shard_id=0,
            num_shards=1,
            is_training=False,
            **shared_pipe_args)
        val_pipe.build()
        val_loader = DALIClassificationIterator(val_pipe,
                                                reader_name="Reader")

    place = fluid.CUDAPlace(FLAGS.device_id)
    exe = fluid.Executor(place)
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    eval_prog = fluid.Program()

    # Learning-rate schedule: step decay by 10x at epochs 30/60/80, preceded
    # by a 5-epoch linear warm-up.
    step_per_epoch = int(math.ceil(sample_per_shard / FLAGS.batch_size))
    milestones = [step_per_epoch * e for e in (30, 60, 80)]
    values = [FLAGS.lr * (0.1**i) for i in range(len(milestones) + 1)]

    with fluid.program_guard(train_prog, startup_prog), \
            fluid.unique_name.guard():
        train_fetch_list = build()
        stepped_lr = fluid.layers.piecewise_decay(boundaries=milestones,
                                                  values=values)
        warmed_lr = fluid.layers.linear_lr_warmup(
            learning_rate=stepped_lr,
            warmup_steps=5 * step_per_epoch,
            start_lr=0.,
            end_lr=FLAGS.lr)
        optimizer = fluid.optimizer.Momentum(
            learning_rate=warmed_lr,
            momentum=FLAGS.momentum,
            regularization=fluid.regularizer.L2Decay(FLAGS.weight_decay))
        avg_loss = train_fetch_list[0]
        optimizer.minimize(avg_loss)

    with fluid.program_guard(eval_prog, startup_prog):
        with fluid.unique_name.guard():
            eval_fetch_list = build()
        eval_prog = eval_prog.clone(True)

    build_strategy = fluid.BuildStrategy()
    build_strategy.trainer_id = FLAGS.local_rank
    build_strategy.num_trainers = FLAGS.world_size

    transpiler_config = fluid.DistributeTranspilerConfig()
    transpiler_config.mode = "nccl2"
    transpiler = fluid.DistributeTranspiler(config=transpiler_config)
    transpiler.transpile(
        FLAGS.local_rank,
        trainers=os.environ.get('PADDLE_TRAINER_ENDPOINTS'),
        current_endpoint=os.environ.get('PADDLE_CURRENT_ENDPOINT'),
        startup_program=startup_prog,
        program=train_prog)

    exec_strategy = fluid.ExecutionStrategy()

    exe.run(startup_prog)
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=avg_loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)
    compiled_eval_prog = fluid.compiler.CompiledProgram(eval_prog)

    total_time = AverageMeter()

    for epoch in range(FLAGS.epochs):
        epoch_no = epoch + 1
        if FLAGS.local_rank == 0:
            print("==== train epoch {:02d} ====".format(epoch_no))
        avg_time, _, _ = run(exe, compiled_train_prog, train_fetch_list,
                             train_loader, epoch)
        total_time.update(avg_time)
        # reset DALI iterators
        train_loader.reset()

        # Everything below (validation, checkpointing, final report) is
        # rank-0 work only.
        if FLAGS.local_rank != 0:
            continue

        print("==== validation epoch {:02d} ====".format(epoch_no))
        _, prec1, prec5 = run(exe, compiled_eval_prog, eval_fetch_list,
                              val_loader, epoch)

        val_loader.reset()

        # Replace any stale checkpoint directory for this epoch.
        ckpt_path = os.path.join('checkpoint', "{:02d}".format(epoch_no))
        if os.path.isdir(ckpt_path):
            shutil.rmtree(ckpt_path)
        print('Save model to {}.'.format(ckpt_path))
        fluid.io.save_persistables(exe, ckpt_path, train_prog)

        # Global batch size divided by the average value tracked in
        # ``total_time``.
        perf = FLAGS.whole_batch_size / total_time.avg
        if epoch == FLAGS.epochs - 1:
            print('##Top-1 {0}\n'
                  '##Top-5 {1}\n'
                  '##Perf  {2}'.format(prec1 * 100, prec5 * 100, perf))
コード例 #6
0
def train(cfg):
    """Train a segmentation model with knowledge distillation from a teacher.

    The student network is built in the default main program. The teacher is
    built in its own program from ``args.teacher_cfg_file``, loaded from
    ``cfg.SLIM.KNOWLEDGE_DISTILL_TEACHER_MODEL_DIR`` and merged into the
    default main program. An L2 loss between one teacher/student feature-map
    pair, weighted by 0.1, is added to the student loss and the sum is
    optimised. Checkpoints are written by trainer 0 after every epoch, with
    optional evaluation and VisualDL logging on snapshot epochs.

    Args:
        cfg: The global configuration object. It is temporarily switched to
            the teacher configuration while the teacher is built and then
            restored from ``args.cfg_file``.

    Raises:
        ValueError: If the starting epoch is larger than
            ``cfg.SOLVER.NUM_EPOCHS``.
    """
    drop_last = True

    dataset = SegDataset(
        file_list=cfg.DATASET.TRAIN_FILE_LIST,
        mode=ModelPhase.TRAIN,
        shuffle=True,
        data_dir=cfg.DATASET.DATA_DIR)

    def data_generator():
        """Yield (item[0], item[1], item[2]) samples in per-trainer batches."""
        if args.use_mpio:
            data_gen = dataset.multiprocess_generator(
                num_processes=cfg.DATALOADER.NUM_WORKERS,
                max_queue_size=cfg.DATALOADER.BUF_SIZE)
        else:
            data_gen = dataset.generator()

        batch_data = []
        for b in data_gen:
            batch_data.append(b)
            # Only release samples once a full per-trainer batch is buffered.
            if len(batch_data) == (cfg.BATCH_SIZE // cfg.NUM_TRAINERS):
                for item in batch_data:
                    yield item[0], item[1], item[2]
                batch_data = []
        # If the sync batch norm strategy is used, drop the last incomplete
        # batch (fewer samples than a full batch) to avoid NCCL hang issues.
        if not cfg.TRAIN.SYNC_BATCH_NORM:
            for item in batch_data:
                yield item[0], item[1], item[2]

    # Get device environment
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()

    # Get number of GPU
    dev_count = cfg.NUM_TRAINERS if cfg.NUM_TRAINERS > 1 else len(places)
    print_info("#Device count: {}".format(dev_count))

    # Make sure BATCH_SIZE can divided by GPU cards
    assert cfg.BATCH_SIZE % dev_count == 0, (
        'BATCH_SIZE:{} not divisble by number of GPUs:{}'.format(
            cfg.BATCH_SIZE, dev_count))
    # If use multi-gpu training mode, batch data will allocated to each GPU evenly
    batch_size_per_dev = cfg.BATCH_SIZE // dev_count
    print_info("batch_size_per_dev: {}".format(batch_size_per_dev))

    # Student network, built in the default main/startup programs.
    data_loader, loss, lr, pred, grts, masks, image = build_model(
        phase=ModelPhase.TRAIN)
    data_loader.set_sample_generator(
        data_generator, batch_size=batch_size_per_dev, drop_last=drop_last)

    exe = fluid.Executor(place)

    # Switch the global config to the teacher's while the teacher is built.
    cfg.update_from_file(args.teacher_cfg_file)
    teacher_program = fluid.Program()
    teacher_startup_program = fluid.Program()

    with fluid.program_guard(teacher_program, teacher_startup_program):
        with fluid.unique_name.guard():
            # The teacher consumes the student's input variables.
            _, teacher_loss, _, _, _, _, _ = build_model(
                teacher_program,
                teacher_startup_program,
                phase=ModelPhase.TRAIN,
                image=image,
                label=grts,
                mask=masks)

    exe.run(teacher_startup_program)

    teacher_program = teacher_program.clone(for_test=True)
    ckpt_dir = cfg.SLIM.KNOWLEDGE_DISTILL_TEACHER_MODEL_DIR
    assert ckpt_dir is not None
    print('load teacher model:', ckpt_dir)
    if os.path.exists(ckpt_dir):
        # Try the `fluid.load` format first and fall back to the
        # `load_params` format when that fails.
        try:
            fluid.load(teacher_program, os.path.join(ckpt_dir, 'model'), exe)
        except:
            fluid.io.load_params(exe, ckpt_dir, main_program=teacher_program)

    # Restore the student configuration.
    cfg.update_from_file(args.cfg_file)
    data_name_map = {
        'image': 'image',
        'label': 'label',
        'mask': 'mask',
    }
    merge(teacher_program, fluid.default_main_program(), data_name_map, place)
    # [teacher variable, student variable]; the teacher name carries the
    # 'teacher_' prefix (the same prefix is used for the teacher loss in
    # fetch_list below).
    distill_pairs = [[
        'teacher_bilinear_interp_2.tmp_0', 'bilinear_interp_0.tmp_0'
    ]]

    def distill(pairs, weight):
        """
        Return the weighted L2 distillation loss of the first pair.

        Only ``pairs[0]`` is used: ``pairs[0][0]`` is the teacher variable
        name and ``pairs[0][1]`` the student variable name.
        """
        loss = l2_loss(pairs[0][0], pairs[0][1])
        weighted_loss = loss * weight
        return weighted_loss

    distill_loss = distill(distill_pairs, 0.1)
    cfg.update_from_file(args.cfg_file)
    optimizer = solver.Solver(None, None)
    all_loss = loss + distill_loss
    lr = optimizer.optimise(all_loss)

    exe.run(fluid.default_startup_program())

    exec_strategy = fluid.ExecutionStrategy()
    # Clear temporary variables every 100 iteration
    if args.use_gpu:
        exec_strategy.num_threads = fluid.core.get_cuda_device_count()
    exec_strategy.num_iteration_per_drop_scope = 100
    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_reduce_ops = False
    build_strategy.fuse_all_optimizer_ops = False
    build_strategy.fuse_elewise_add_act_ops = True
    if cfg.NUM_TRAINERS > 1 and args.use_gpu:
        dist_utils.prepare_for_multi_process(exe, build_strategy,
                                             fluid.default_main_program())
        exec_strategy.num_threads = 1

    if cfg.TRAIN.SYNC_BATCH_NORM and args.use_gpu:
        if dev_count > 1:
            # Apply sync batch norm strategy
            print_info("Sync BatchNorm strategy is effective.")
            build_strategy.sync_batch_norm = True
        else:
            print_info(
                "Sync BatchNorm strategy will not be effective if GPU device"
                " count <= 1")
    compiled_train_prog = fluid.CompiledProgram(
        fluid.default_main_program()).with_data_parallel(
            loss_name=all_loss.name,
            exec_strategy=exec_strategy,
            build_strategy=build_strategy)

    # Resume training
    begin_epoch = cfg.SOLVER.BEGIN_EPOCH
    if cfg.TRAIN.RESUME_MODEL_DIR:
        begin_epoch = load_checkpoint(exe, fluid.default_main_program())
    # Load pretrained model
    elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL_DIR):
        load_pretrained_weights(exe, fluid.default_main_program(),
                                cfg.TRAIN.PRETRAINED_MODEL_DIR)
    else:
        print_info(
            'Pretrained model dir {} not exists, training from scratch...'.
            format(cfg.TRAIN.PRETRAINED_MODEL_DIR))

    # Fetch order: student loss, teacher loss, distillation loss, lr.
    fetch_list = [
        loss.name, 'teacher_' + teacher_loss.name, distill_loss.name, lr.name
    ]

    if args.debug:
        # Fetch more variable info and use streaming confusion matrix to
        # calculate IoU results if in debug mode
        np.set_printoptions(
            precision=4, suppress=True, linewidth=160, floatmode="fixed")
        fetch_list.extend([pred.name, grts.name, masks.name])
        cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True)

    if args.use_vdl:
        if not args.vdl_log_dir:
            print_info("Please specify the log directory by --vdl_log_dir.")
            exit(1)

        from visualdl import LogWriter
        log_writer = LogWriter(args.vdl_log_dir)

    step = 0
    all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE
    # Count the trailing partial batch only when it is not dropped.
    if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and not drop_last:
        all_step += 1
    all_step *= (cfg.SOLVER.NUM_EPOCHS - begin_epoch + 1)

    avg_loss = 0.0
    avg_t_loss = 0.0
    avg_d_loss = 0.0
    best_mIoU = 0.0

    timer = Timer()
    timer.start()
    if begin_epoch > cfg.SOLVER.NUM_EPOCHS:
        raise ValueError(
            ("begin epoch[{}] is larger than cfg.SOLVER.NUM_EPOCHS[{}]").format(
                begin_epoch, cfg.SOLVER.NUM_EPOCHS))

    if args.use_mpio:
        print_info("Use multiprocess reader")
    else:
        print_info("Use multi-thread reader")

    for epoch in range(begin_epoch, cfg.SOLVER.NUM_EPOCHS + 1):
        data_loader.start()
        while True:
            try:
                if args.debug:
                    # Print category IoU and accuracy to check whether the
                    # training process corresponds to expectation.
                    # Fix: in debug mode fetch_list holds seven entries (the
                    # four base fetches plus pred/grts/masks). Unpacking into
                    # five names raised ValueError on every step, which the
                    # broad handler below swallowed, so debug logging never ran.
                    loss, t_loss, d_loss, lr, pred, grts, masks = exe.run(
                        program=compiled_train_prog,
                        fetch_list=fetch_list,
                        return_numpy=True)
                    cm.calculate(pred, grts, masks)
                    avg_loss += np.mean(np.array(loss))
                    step += 1

                    if step % args.log_steps == 0:
                        speed = args.log_steps / timer.elapsed_time()
                        avg_loss /= args.log_steps
                        category_acc, mean_acc = cm.accuracy()
                        category_iou, mean_iou = cm.mean_iou()

                        print_info((
                            "epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, step, lr[0], avg_loss, mean_acc,
                                 mean_iou, speed,
                                 calculate_eta(all_step - step, speed)))
                        print_info("Category IoU: ", category_iou)
                        print_info("Category Acc: ", category_acc)
                        if args.use_vdl:
                            log_writer.add_scalar('Train/mean_iou', mean_iou,
                                                  step)
                            log_writer.add_scalar('Train/mean_acc', mean_acc,
                                                  step)
                            log_writer.add_scalar('Train/loss', avg_loss, step)
                            log_writer.add_scalar('Train/lr', lr[0], step)
                            log_writer.add_scalar('Train/step/sec', speed, step)
                        sys.stdout.flush()
                        avg_loss = 0.0
                        cm.zero_matrix()
                        timer.restart()
                else:
                    # If not in debug mode, avoid unnecessary logging and
                    # calculation.
                    loss, t_loss, d_loss, lr = exe.run(
                        program=compiled_train_prog,
                        fetch_list=fetch_list,
                        return_numpy=True)
                    avg_loss += np.mean(np.array(loss))
                    avg_t_loss += np.mean(np.array(t_loss))
                    avg_d_loss += np.mean(np.array(d_loss))
                    step += 1

                    if step % args.log_steps == 0 and cfg.TRAINER_ID == 0:
                        avg_loss /= args.log_steps
                        avg_t_loss /= args.log_steps
                        avg_d_loss /= args.log_steps
                        speed = args.log_steps / timer.elapsed_time()
                        print((
                            "epoch={} step={} lr={:.5f} loss={:.4f} teacher loss={:.4f} distill loss={:.4f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, step, lr[0], avg_loss, avg_t_loss,
                                 avg_d_loss, speed,
                                 calculate_eta(all_step - step, speed)))
                        if args.use_vdl:
                            log_writer.add_scalar('Train/loss', avg_loss, step)
                            log_writer.add_scalar('Train/lr', lr[0], step)
                            log_writer.add_scalar('Train/speed', speed, step)
                        sys.stdout.flush()
                        avg_loss = 0.0
                        avg_t_loss = 0.0
                        avg_d_loss = 0.0
                        timer.restart()

            except fluid.core.EOFException:
                # The loader is exhausted: this epoch is finished.
                data_loader.reset()
                break
            except Exception as e:
                # NOTE(review): any other error is only printed and the loop
                # keeps going, which can hide real failures. Kept as-is to
                # preserve the existing best-effort behaviour.
                print(e)

        if (epoch % cfg.TRAIN.SNAPSHOT_EPOCH == 0
                or epoch == cfg.SOLVER.NUM_EPOCHS) and cfg.TRAINER_ID == 0:
            ckpt_dir = save_checkpoint(fluid.default_main_program(), epoch)

            if args.do_eval:
                print("Evaluation start")
                _, mean_iou, _, mean_acc = evaluate(
                    cfg=cfg,
                    ckpt_dir=ckpt_dir,
                    use_gpu=args.use_gpu,
                    use_mpio=args.use_mpio)
                if args.use_vdl:
                    log_writer.add_scalar('Evaluate/mean_iou', mean_iou, step)
                    log_writer.add_scalar('Evaluate/mean_acc', mean_acc, step)

                if mean_iou > best_mIoU:
                    best_mIoU = mean_iou
                    update_best_model(ckpt_dir)
                    print_info("Save best model {} to {}, mIoU = {:.4f}".format(
                        ckpt_dir,
                        os.path.join(cfg.TRAIN.MODEL_SAVE_DIR, 'best_model'),
                        mean_iou))

            # Use VisualDL to visualize results
            if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None:
                visualize(
                    cfg=cfg,
                    use_gpu=args.use_gpu,
                    vis_file_list=cfg.DATASET.VIS_FILE_LIST,
                    vis_dir="visual",
                    ckpt_dir=ckpt_dir,
                    log_writer=log_writer)
        # Trainer 0 additionally writes a checkpoint after every epoch
        # (on snapshot epochs this is a second save of the same epoch).
        if cfg.TRAINER_ID == 0:
            ckpt_dir = save_checkpoint(fluid.default_main_program(), epoch)

    # save final model
    if cfg.TRAINER_ID == 0:
        save_checkpoint(fluid.default_main_program(), 'final')
コード例 #7
0
    def _build_programs(self):
        """
        Construct the static-graph programs and initialise their parameters.

        In inference mode only ``infer_program`` is built. Otherwise the train
        program is built, ``eval_program`` is cloned from it before the
        optimisation ops are added, and (when distributed) the execution and
        distributed strategies are prepared first. Finally the startup program
        is run and parameters are loaded from ``init_pretraining_params`` or,
        failing that, ``init_checkpoint``. Only used in static graph mode.
        """
        if self.run_infer:
            self.startup_program = fluid.Program()
            # Inference graph: forward pass followed by the infer head.
            self.infer_program = fluid.Program()
            with fluid.program_guard(self.infer_program,
                                     self.startup_program), \
                    fluid.unique_name.guard():
                feed = self._get_feed_dict(is_infer=True)
                self.infer_feed_dict = feed
                fwd_out = self.forward(feed, is_infer=True)
                self.infer_fetch_dict = self.infer(feed, fwd_out)
            self.infer_program = self.infer_program.clone(for_test=True)

            self.program = self.infer_program
        else:
            if self.is_distributed:
                run_cfg = fluid.ExecutionStrategy()
                run_cfg.use_experimental_executor = True
                run_cfg.num_threads = 4
                run_cfg.num_iteration_per_drop_scope = 1

                strategy = DistributedStrategy()
                strategy.exec_strategy = run_cfg
                strategy.nccl_comm_num = 1
                strategy.fuse_all_reduce_ops = True
                if self.use_recompute:
                    strategy.forward_recompute = True
                    strategy.enable_sequential_execution = True
                if self.use_amp:
                    strategy.use_amp = True
                    strategy.amp_loss_scaling = self.amp_loss_scaling
                self.dist_strategy = strategy

            self.startup_program = fluid.Program()
            # Training graph.
            self.train_program = fluid.Program()
            with fluid.program_guard(self.train_program,
                                     self.startup_program), \
                    fluid.unique_name.guard():
                feed = self._get_feed_dict()
                self.feed_dict = feed
                fwd_out = self.forward(feed)
                if self.is_distributed and self.use_recompute:
                    self.dist_strategy.recompute_checkpoints = fwd_out[
                        "checkpoints"]
                metrics, statistics = self.get_metrics_and_statistics(
                    feed, fwd_out)

                # The evaluation graph is a test-mode clone taken before
                # self.optimize() is called on the train program.
                self.eval_program = self.train_program.clone(for_test=True)
                eval_fetches = dict(metrics)
                eval_fetches.update(statistics)
                self.eval_fetch_dict = eval_fetches

                metrics["scheduled_lr"] = self.optimize(metrics)
                self.train_fetch_dict = metrics

            # ``self.program`` keeps pointing at the locally built program,
            # even when ``train_program`` is swapped for fleet's below.
            self.program = self.train_program
            if self.is_distributed:
                self.train_program = fleet.main_program

        self.exe.run(self.startup_program)
        # Pretraining params take precedence over a checkpoint.
        if self.init_pretraining_params != "":
            init_pretraining_params(self.exe, self.init_pretraining_params,
                                    self.program)
        elif self.init_checkpoint != "":
            init_checkpoint(self.exe, self.init_checkpoint, self.program)
コード例 #8
0
def do_train(args):
    """
    Train an ERNIE-based model, logging, checkpointing and evaluating as it goes.

    Builds a train program and a test clone of it, runs the startup program,
    optionally restores parameters, then iterates ``args.epoch`` times over the
    training reader.

    Args:
        args: parsed options. Attributes read here: ``ernie_config_path``,
            ``use_cuda``, ``cpu_num``, ``random_seed``, ``train_data``,
            ``test_data``, ``base_learning_rate``, ``batch_size``,
            ``init_checkpoint``, ``init_pretraining_params``, ``epoch``,
            ``print_steps``, ``save_steps``, ``validation_steps`` and
            ``model_save_dir``.
    """
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        # GPU runs use exactly one device, chosen through FLAGS_selected_gpus.
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = 1
    else:
        # Cap the requested CPU device count at what the machine provides.
        dev_count = min(multiprocessing.cpu_count(), args.cpu_num)
        if (dev_count < args.cpu_num):
            print("WARNING: The total CPU NUM in this machine is %d, which is less than cpu_num parameter you set. "
                  "Change the cpu_num from %d to %d"%(dev_count, args.cpu_num, dev_count))
        os.environ['CPU_NUM'] = str(dev_count)
        place = fluid.CPUPlace()

    exe = fluid.Executor(place)

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    train_program = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            # user defined model based on ernie embeddings
            train_ret = creator.create_ernie_model(args, ernie_config)

            # ernie pyreader
            train_pyreader = creator.create_pyreader(args, file_name=args.train_data,
                                                  feed_list=train_ret['feed_list'],
                                                  model="ernie",
                                                  place=place)

            # Clone for evaluation before the optimizer ops are appended, so
            # the test program contains the forward pass only.
            test_program = train_program.clone(for_test=True)
            test_pyreader = creator.create_pyreader(args, file_name=args.test_data,
                                                  feed_list=train_ret['feed_list'],
                                                  model="ernie",
                                                  place=place)

            optimizer = fluid.optimizer.Adam(learning_rate=args.base_learning_rate)
            fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
            optimizer.minimize(train_ret["avg_cost"])

    lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
        program=train_program, batch_size=args.batch_size)
    print("Theoretical memory usage in training: %.3f - %.3f %s" %
        (lower_mem, upper_mem, unit))
    print("Device count: %d" % dev_count)

    exe.run(startup_prog)
    # load checkpoints; init_checkpoint wins when both options are given
    # NOTE(review): parameters are restored against startup_prog rather than
    # train_program -- confirm this is intended.
    if args.init_checkpoint and args.init_pretraining_params:
        print("WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
    if args.init_checkpoint:
        utils.init_checkpoint(exe, args.init_checkpoint, startup_prog)
    elif args.init_pretraining_params:
        utils.init_pretraining_params(exe, args.init_pretraining_params, startup_prog)

    if dev_count > 1 and not args.use_cuda:
        # Only CPU runs can reach this branch (GPU runs have dev_count == 1),
        # so the device label is always "CPU".
        print("%d %s are used to train model" % (dev_count, "CPU"))

        # multi cpu config
        exec_strategy = fluid.ExecutionStrategy()
        build_strategy = fluid.BuildStrategy()
        compiled_prog = fluid.compiler.CompiledProgram(train_program).with_data_parallel(
            loss_name=train_ret['avg_cost'].name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)
    else:
        compiled_prog = fluid.compiler.CompiledProgram(train_program)

    # start training
    steps = 0
    for _ in range(args.epoch):
        for data in train_pyreader():
            steps += 1
            # Metrics are fetched only on the steps that will be printed.
            if steps % args.print_steps == 0:
                fetch_list = [
                    train_ret["avg_cost"],
                    train_ret["precision"],
                    train_ret["recall"],
                    train_ret["f1_score"],
                ]
            else:
                fetch_list = []

            start_time = time.time()
            outputs = exe.run(program=compiled_prog, feed=data[0], fetch_list=fetch_list)
            end_time = time.time()
            if steps % args.print_steps == 0:
                loss, precision, recall, f1_score = [np.mean(x) for x in outputs]
                print("[train] batch_id = %d, loss = %.5f, P: %.5f, R: %.5f, F1: %.5f, elapsed time %.5f, "
                        "pyreader queue_size: %d " % (steps, loss, precision, recall, f1_score,
                        end_time - start_time, train_pyreader.queue.size()))

            if steps % args.save_steps == 0:
                save_path = os.path.join(args.model_save_dir, "step_" + str(steps))
                print("\tsaving model as %s" % (save_path))
                fluid.io.save_persistables(exe, save_path, train_program)

            if steps % args.validation_steps == 0:
                evaluate(exe, test_program, test_pyreader, train_ret)

    # Always persist the final state, named after the last step reached.
    save_path = os.path.join(args.model_save_dir, "step_" + str(steps))
    fluid.io.save_persistables(exe, save_path, train_program)
コード例 #9
0
ファイル: train_search.py プロジェクト: baiyfbupt/DARTS
def main(args):
    """
    Build the search programs and run the per-epoch train/validate loop.

    Constructs a data program holding the train/validation inputs and the
    learning-rate schedule, derives the architecture-step programs and the
    weight-training program from it, compiles them for data-parallel
    execution, and then alternates ``train`` and ``valid`` for
    ``args.epochs`` epochs while tracking the best validation accuracy.

    Args:
        args: parsed options. Attributes read here: ``trainset_num``,
            ``train_portion``, ``batch_size``, ``image_shape``,
            ``learning_rate``, ``learning_rate_min``, ``epochs``,
            ``init_channels``, ``class_num``, ``layers``, ``grad_clip``,
            ``momentum``, ``weight_decay``, ``use_gpu``, ``with_mem_opt``
            and ``model_save_dir``. The whole object is also forwarded to
            ``architect.compute_unrolled_step`` and ``reader.train_search``.
    """
    # An unset/empty CUDA_VISIBLE_DEVICES still yields a device count of 1,
    # because "".split(",") returns [""].
    devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    devices_num = len(devices.split(","))
    step_per_epoch = int(args.trainset_num * args.train_portion /
                         args.batch_size)
    is_shuffle = True

    startup_prog = fluid.Program()
    data_prog = fluid.Program()
    test_prog = fluid.Program()

    image_shape = [int(m) for m in args.image_shape.split(",")]
    logger.info("Constructing graph...")
    with fluid.unique_name.guard():
        with fluid.program_guard(data_prog, startup_prog):
            image_train = fluid.data(name="image_train",
                                     shape=[None] + image_shape,
                                     dtype="float32")
            label_train = fluid.data(name="label_train",
                                     shape=[None, 1],
                                     dtype="int64")
            image_val = fluid.data(name="image_val",
                                   shape=[None] + image_shape,
                                   dtype="float32")
            label_val = fluid.data(name="label_val",
                                   shape=[None, 1],
                                   dtype="int64")
            train_loader = fluid.io.DataLoader.from_generator(
                feed_list=[image_train, label_train],
                capacity=64,
                use_double_buffer=True,
                iterable=True)
            valid_loader = fluid.io.DataLoader.from_generator(
                feed_list=[image_val, label_val],
                capacity=64,
                use_double_buffer=True,
                iterable=True)
            learning_rate = fluid.layers.cosine_decay(args.learning_rate,
                                                      4 * step_per_epoch,
                                                      args.epochs)
            # Rescale the schedule linearly so that it bottoms out at
            # learning_rate_min instead of zero (original note: matches
            # Pytorch CosineAnnealingLR).
            learning_rate = learning_rate / args.learning_rate * (
                args.learning_rate -
                args.learning_rate_min) + args.learning_rate_min

        arch_progs_list, fetch = architect.compute_unrolled_step(
            image_train, label_train, image_val, label_val, data_prog,
            startup_prog, learning_rate, args)

        # The weight-training program starts from the shared inputs and
        # learning-rate ops, then adds the model and its optimizer.
        train_prog = data_prog.clone()
        with fluid.program_guard(train_prog, startup_prog):
            logits, loss = model(image_train,
                                 label_train,
                                 args.init_channels,
                                 args.class_num,
                                 args.layers,
                                 name="model")
            top1 = fluid.layers.accuracy(input=logits, label=label_train, k=1)
            top5 = fluid.layers.accuracy(input=logits, label=label_train, k=5)
            logger.info("param size = {:.6f}MB".format(
                utility.count_parameters_in_MB(
                    train_prog.global_block().all_parameters(), 'model')))
            # Clone for evaluation before the optimizer ops are appended.
            test_prog = train_prog.clone(for_test=True)

            model_var = utility.get_parameters(
                train_prog.global_block().all_parameters(), 'model')[1]

            clip = fluid.clip.GradientClipByGlobalNorm(
                clip_norm=args.grad_clip)
            follower_opt = fluid.optimizer.MomentumOptimizer(
                learning_rate,
                args.momentum,
                regularization=fluid.regularizer.L2DecayRegularizer(
                    args.weight_decay),
                grad_clip=clip)
            # Only the parameters selected by utility.get_parameters(..., 'model')
            # are updated by this optimizer.
            follower_opt.minimize(loss,
                                  parameter_list=[v.name for v in model_var])

    logger.info("Construct graph done")
    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)
    train_reader, valid_reader = reader.train_search(
        batch_size=args.batch_size,
        train_portion=args.train_portion,
        is_shuffle=is_shuffle,
        args=args)
    places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()
    train_loader.set_batch_generator(train_reader, places=places)
    valid_loader.set_batch_generator(valid_reader, places=places)

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 4 * devices_num
    build_strategy = fluid.BuildStrategy()
    if args.with_mem_opt:
        # Mark the fetched variables persistable before enabling the
        # in-place / memory-reuse passes.
        learning_rate.persistable = True
        loss.persistable = True
        top1.persistable = True
        top5.persistable = True
        build_strategy.enable_inplace = True
        build_strategy.memory_optimize = True
    arch_progs_list[0] = fluid.CompiledProgram(
        arch_progs_list[0]).with_data_parallel(loss_name=fetch[0].name,
                                               build_strategy=build_strategy,
                                               exec_strategy=exec_strategy)
    arch_progs_list[1] = fluid.CompiledProgram(
        arch_progs_list[1]).with_data_parallel(loss_name=fetch[1].name,
                                               build_strategy=build_strategy,
                                               exec_strategy=exec_strategy)
    parallel_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)
    compiled_test_prog = fluid.CompiledProgram(test_prog).with_data_parallel(
        build_strategy=build_strategy, exec_strategy=exec_strategy)

    # NOTE(review): save_model is defined but never called in this function.
    def save_model(postfix, program):
        """Save persistables of ``program`` under model_save_dir/postfix, replacing any existing directory."""
        model_path = os.path.join(args.model_save_dir, postfix)
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)
        logger.info('save models to %s' % (model_path))
        fluid.io.save_persistables(exe, model_path, main_program=program)

    best_acc = 0
    for epoch_id in range(args.epochs):
        # get genotype
        genotype(test_prog, exe, place)
        train_fetch_list = [learning_rate, loss, top1, top5]
        train_top1 = train(epoch_id, train_loader, valid_loader,
                           train_fetch_list, arch_progs_list,
                           parallel_train_prog, exe)
        logger.info("Epoch {}, train_acc {:.6f}".format(epoch_id, train_top1))
        valid_fetch_list = [loss, top1, top5]
        valid_top1 = valid(epoch_id, valid_loader, valid_fetch_list,
                           compiled_test_prog, exe)
        if valid_top1 > best_acc:
            best_acc = valid_top1
        # All three accuracies use a six-decimal precision spec.
        logger.info("Epoch {}, valid_acc {:.6f}, best_valid_acc {:.6f}".format(
            epoch_id, valid_top1, best_acc))
コード例 #10
0
ファイル: train.py プロジェクト: zouxiaoshi/PaddleDetection
def main():
    """
    Quantization-aware training entry point driven by the global ``FLAGS``.

    Builds the train and eval programs from the loaded config, loads
    pretrained weights, inserts quantization ops into both programs with
    ``quant_aware``, then trains for ``cfg.max_iters`` iterations. At every
    snapshot iteration (and the last one) the model is evaluated and the best
    checkpoint so far is saved under ``<cfg.save_dir>/<config name>/best_model``.

    Raises:
        ValueError: if ``FLAGS.eval`` is ``False``.
    """
    if FLAGS.eval is False:
        raise ValueError(
            "Currently only supports `--eval==True` while training in `quantization`."
        )
    env = os.environ
    # Distributed mode is inferred from the trainer environment variables.
    FLAGS.dist = 'PADDLE_TRAINER_ID' in env \
                    and 'PADDLE_TRAINERS_NUM' in env \
                    and int(env['PADDLE_TRAINERS_NUM']) > 1
    num_trainers = int(env.get('PADDLE_TRAINERS_NUM', 1))
    if FLAGS.dist:
        # Give every trainer its own seed, derived from its id.
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        import random
        local_seed = (99 + trainer_id)
        random.seed(local_seed)
        np.random.seed(local_seed)

    cfg = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    check_config(cfg)
    # check if set use_gpu=True in paddlepaddle cpu version
    check_gpu(cfg.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()

    main_arch = cfg.architecture

    if cfg.use_gpu:
        devices_num = fluid.core.get_cuda_device_count()
    else:
        devices_num = int(os.environ.get('CPU_NUM', 1))

    if 'FLAGS_selected_gpus' in env:
        device_id = int(env['FLAGS_selected_gpus'])
    else:
        device_id = 0
    place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    lr_builder = create('LearningRate')
    optim_builder = create('OptimizerBuilder')

    # build program
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = create(main_arch)
            inputs_def = cfg['TrainReader']['inputs_def']
            feed_vars, train_loader = model.build_inputs(**inputs_def)
            if FLAGS.use_pact:
                # With PACT enabled, gradients are allowed to flow to the image input.
                feed_vars['image'].stop_gradient = False
            train_fetches = model.train(feed_vars)
            loss = train_fetches['loss']
            lr = lr_builder()
            optimizer = optim_builder(lr)
            optimizer.minimize(loss)

    # parse train fetches
    train_keys, train_values, _ = parse_fetches(train_fetches)
    # The learning rate is fetched last; the training loop reads it as outs[-1].
    train_values.append(lr)

    if FLAGS.eval:
        eval_prog = fluid.Program()
        with fluid.program_guard(eval_prog, startup_prog):
            with fluid.unique_name.guard():
                model = create(main_arch)
                inputs_def = cfg['EvalReader']['inputs_def']
                feed_vars, eval_loader = model.build_inputs(**inputs_def)
                fetches = model.eval(feed_vars)
        eval_prog = eval_prog.clone(True)

        eval_reader = create_reader(cfg.EvalReader)
        # When iterable mode, set set_sample_list_generator(eval_reader, place)
        eval_loader.set_sample_list_generator(eval_reader)

        # parse eval fetches
        extra_keys = []
        if cfg.metric == 'COCO':
            extra_keys = ['im_info', 'im_id', 'im_shape']
        if cfg.metric == 'VOC':
            extra_keys = ['gt_bbox', 'gt_class', 'is_difficult']
        if cfg.metric == 'WIDERFACE':
            extra_keys = ['im_id', 'im_shape', 'gt_bbox']
        eval_keys, eval_values, eval_cls = parse_fetches(
            fetches, eval_prog, extra_keys)

    # compile program for multi-devices
    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_optimizer_ops = False
    build_strategy.fuse_elewise_add_act_ops = True
    build_strategy.fuse_all_reduce_ops = False

    # only enable sync_bn in multi GPU devices
    # NOTE(review): the detected value is overwritten with False on the very
    # next line, so sync_batch_norm is always disabled here -- confirm intended.
    sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn'
    sync_bn = False
    build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \
        and cfg.use_gpu

    exec_strategy = fluid.ExecutionStrategy()
    # iteration number when CompiledProgram tries to drop local execution scopes.
    # Set it to be 1 to save memory usages, so that unused variables in
    # local execution scopes can be deleted after each iteration.
    exec_strategy.num_iteration_per_drop_scope = 1
    if FLAGS.dist:
        dist_utils.prepare_for_multi_process(exe, build_strategy, startup_prog,
                                             train_prog)
        exec_strategy.num_threads = 1

    exe.run(startup_prog)
    not_quant_pattern = []
    if FLAGS.not_quant_pattern:
        not_quant_pattern = FLAGS.not_quant_pattern
    # Quantization settings handed to quant_aware for both programs.
    config = {
        'weight_quantize_type': 'channel_wise_abs_max',
        'activation_quantize_type': 'moving_average_abs_max',
        'quantize_op_types': ['depthwise_conv2d', 'mul', 'conv2d'],
        'not_quant_pattern': not_quant_pattern
    }

    ignore_params = cfg.finetune_exclude_pretrained_params \
                 if 'finetune_exclude_pretrained_params' in cfg else []

    fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel'

    # Weights are loaded before the quantization ops are inserted below.
    if cfg.pretrain_weights and fuse_bn and not ignore_params:
        checkpoint.load_and_fusebn(exe, train_prog, cfg.pretrain_weights)
    elif cfg.pretrain_weights:
        checkpoint.load_params(exe,
                               train_prog,
                               cfg.pretrain_weights,
                               ignore_params=ignore_params)

    if FLAGS.use_pact:
        act_preprocess_func = pact
        optimizer_func = get_optimizer
        executor = exe
    else:
        act_preprocess_func = None
        optimizer_func = None
        executor = None
    # insert quantize op in train_prog, return type is CompiledProgram
    train_prog_quant = quant_aware(train_prog,
                                   place,
                                   config,
                                   scope=None,
                                   act_preprocess_func=act_preprocess_func,
                                   optimizer_func=optimizer_func,
                                   executor=executor,
                                   for_test=False)

    compiled_train_prog = train_prog_quant.with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    if FLAGS.eval:
        # insert quantize op in eval_prog
        eval_prog = quant_aware(eval_prog,
                                place,
                                config,
                                scope=None,
                                act_preprocess_func=act_preprocess_func,
                                optimizer_func=optimizer_func,
                                executor=executor,
                                for_test=True)
        compiled_eval_prog = fluid.CompiledProgram(eval_prog)

    start_iter = 0

    train_reader = create_reader(cfg.TrainReader,
                                 (cfg.max_iters - start_iter) * devices_num,
                                 cfg,
                                 devices_num=devices_num,
                                 num_trainers=num_trainers)
    # When iterable mode, set set_sample_list_generator(train_reader, place)
    train_loader.set_sample_list_generator(train_reader)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False
    if hasattr(model, 'is_bbox_normalized') and \
            callable(model.is_bbox_normalized):
        is_bbox_normalized = model.is_bbox_normalized()

    # if map_type not set, use default 11point, only use in VOC eval
    map_type = cfg.map_type if 'map_type' in cfg else '11point'

    train_stats = TrainingStats(cfg.log_iter, train_keys)
    train_loader.start()
    start_time = time.time()
    end_time = time.time()

    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)
    # Sliding window of the most recent iteration durations, used for the ETA.
    time_stat = deque(maxlen=cfg.log_iter)
    best_box_ap_list = [0.0, 0]  #[map, iter]

    for it in range(start_iter, cfg.max_iters):
        # Each pass records the wall time elapsed since the previous pass.
        start_time = end_time
        end_time = time.time()
        time_stat.append(end_time - start_time)
        time_cost = np.mean(time_stat)
        eta_sec = (cfg.max_iters - it) * time_cost
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        outs = exe.run(compiled_train_prog, fetch_list=train_values)
        # outs[-1] is the learning rate appended to train_values above.
        stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])}

        train_stats.update(stats)
        logs = train_stats.log()
        # In distributed mode only trainer 0 logs and evaluates.
        if it % cfg.log_iter == 0 and (not FLAGS.dist or trainer_id == 0):
            strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format(
                it, np.mean(outs[-1]), logs, time_cost, eta)
            logger.info(strs)

        if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1) \
           and (not FLAGS.dist or trainer_id == 0):
            # NOTE(review): save_name is computed but not used below.
            save_name = str(it) if it != cfg.max_iters - 1 else "model_final"

            if FLAGS.eval:
                # evaluation
                results = eval_run(exe,
                                   compiled_eval_prog,
                                   eval_loader,
                                   eval_keys,
                                   eval_values,
                                   eval_cls,
                                   cfg=cfg)
                resolution = None
                if 'mask' in results[0]:
                    resolution = model.mask_head.resolution
                box_ap_stats = eval_results(results, cfg.metric,
                                            cfg.num_classes, resolution,
                                            is_bbox_normalized,
                                            FLAGS.output_eval, map_type,
                                            cfg['EvalReader']['dataset'])

                # Save a checkpoint only when the first box AP value improves.
                if box_ap_stats[0] > best_box_ap_list[0]:
                    best_box_ap_list[0] = box_ap_stats[0]
                    best_box_ap_list[1] = it
                    save_checkpoint(exe, eval_prog,
                                    os.path.join(save_dir, "best_model"),
                                    train_prog)
                logger.info("Best test box ap: {}, in iter: {}".format(
                    best_box_ap_list[0], best_box_ap_list[1]))

    train_loader.reset()
コード例 #11
0
ファイル: program.py プロジェクト: sandyhouse/PaddleClas
def compile(config, program, loss_name=None, share_prog=None):
    """
    Wrap ``program`` in a data-parallel ``fluid.CompiledProgram``.

    The execution strategy always uses one thread and drops local scopes
    every 10 iterations. When ``use_fp16`` is set in ``config``, the fusion
    and addto options are applied to the build strategy one by one; an
    option that cannot be set is reported through the logger and skipped.

    Args:
        config(dict): config; the keys ``use_fp16``, ``fuse_bn_act_ops``,
            ``fuse_elewise_add_act_ops``, ``fuse_bn_add_act_ops`` and
            ``enable_addto`` are read
        program(): the program to be compiled
        loss_name(str): loss name
        share_prog(): the shared program, used for evaluation during training

    Returns:
        compiled_program(): a compiled program
    """
    build_strategy = fluid.compiler.BuildStrategy()
    exec_strategy = fluid.ExecutionStrategy()

    exec_strategy.num_threads = 1
    exec_strategy.num_iteration_per_drop_scope = 10

    fp16_enabled = config.get('use_fp16', False)

    # (build-strategy attribute, requested value, message logged on failure)
    strategy_options = (
        ('fuse_bn_act_ops',
         config.get('fuse_bn_act_ops', True),
         "PaddlePaddle version 1.7.0 or higher is "
         "required when you want to fuse batch_norm and activation_op."),
        ('fuse_elewise_add_act_ops',
         config.get('fuse_elewise_add_act_ops', True),
         "PaddlePaddle version 1.7.0 or higher is "
         "required when you want to fuse elewise_add_act and activation_op."),
        ('fuse_bn_add_act_ops',
         config.get('fuse_bn_add_act_ops', True),
         "PaddlePaddle 2.0-rc or higher is "
         "required when you want to enable fuse_bn_add_act_ops strategy."),
        ('enable_addto',
         config.get('enable_addto', True),
         "PaddlePaddle 2.0-rc or higher is "
         "required when you want to enable addto strategy."),
    )

    if fp16_enabled:
        for option_name, option_value, failure_hint in strategy_options:
            # Each option is attempted independently so that one unsupported
            # attribute does not prevent the others from being applied.
            try:
                setattr(build_strategy, option_name, option_value)
            except Exception:
                logger.info(failure_hint)

    return fluid.CompiledProgram(program).with_data_parallel(
        share_vars_from=share_prog,
        loss_name=loss_name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)
コード例 #12
0
def main():
    """
    Distillation training entry point driven by the global ``FLAGS``.

    Builds the student model in the default main program and a separate eval
    program, builds the teacher in its own program and loads
    ``FLAGS.teacher_pretrained`` into it, merges the teacher into the default
    main program, adds a distillation loss to the student loss and trains for
    ``cfg.max_iters`` steps. At every snapshot step (and the last one) a
    checkpoint is saved, the eval program is run and the best model so far is
    kept under ``<cfg.save_dir>/<config name>/best_model``.

    Raises:
        AssertionError: if ``FLAGS.teacher_pretrained`` is not set.
    """
    env = os.environ
    cfg = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    check_config(cfg)
    # check if set use_gpu=True in paddlepaddle cpu version
    check_gpu(cfg.use_gpu)
    check_version()

    main_arch = cfg.architecture

    if cfg.use_gpu:
        devices_num = fluid.core.get_cuda_device_count()
    else:
        devices_num = int(os.environ.get('CPU_NUM', 1))

    if 'FLAGS_selected_gpus' in env:
        device_id = int(env['FLAGS_selected_gpus'])
    else:
        device_id = 0
    place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    # build program
    # No program_guard here: the student is built in the default main program.
    model = create(main_arch)
    inputs_def = cfg['TrainReader']['inputs_def']
    train_feed_vars, train_loader = model.build_inputs(**inputs_def)
    train_fetches = model.train(train_feed_vars)
    loss = train_fetches['loss']

    start_iter = 0
    train_reader = create_reader(cfg.TrainReader, (cfg.max_iters - start_iter) *
                                 devices_num, cfg)
    # When iterable mode, set set_sample_list_generator(train_reader, place)
    train_loader.set_sample_list_generator(train_reader)

    # get all student variables
    student_vars = []
    for v in fluid.default_main_program().list_vars():
        # Best effort: variables whose name/shape cannot be read are skipped.
        # Only Exception is swallowed so Ctrl-C / SystemExit still propagate.
        try:
            student_vars.append((v.name, v.shape))
        except Exception:
            pass
    # uncomment the following lines to print all student variables
    # print("="*50 + "student_model_vars" + "="*50)
    # print(student_vars)

    eval_prog = fluid.Program()
    with fluid.program_guard(eval_prog, fluid.default_startup_program()):
        with fluid.unique_name.guard():
            model = create(main_arch)
            inputs_def = cfg['EvalReader']['inputs_def']
            test_feed_vars, eval_loader = model.build_inputs(**inputs_def)
            fetches = model.eval(test_feed_vars)
    eval_prog = eval_prog.clone(True)

    eval_reader = create_reader(cfg.EvalReader)
    # When iterable mode, set set_sample_list_generator(eval_reader, place)
    eval_loader.set_sample_list_generator(eval_reader)

    # parse eval fetches
    extra_keys = []
    if cfg.metric == 'COCO':
        extra_keys = ['im_info', 'im_id', 'im_shape']
    if cfg.metric == 'VOC':
        extra_keys = ['gt_bbox', 'gt_class', 'is_difficult']
    eval_keys, eval_values, eval_cls = parse_fetches(fetches, eval_prog,
                                                     extra_keys)

    # Build the teacher in its own program/startup pair.
    teacher_cfg = load_config(FLAGS.teacher_config)
    merge_config(FLAGS.opt)
    teacher_arch = teacher_cfg.architecture
    teacher_program = fluid.Program()
    teacher_startup_program = fluid.Program()

    with fluid.program_guard(teacher_program, teacher_startup_program):
        with fluid.unique_name.guard():
            # The teacher's inputs are clones of the student's feed variables.
            teacher_feed_vars = OrderedDict()
            for name, var in train_feed_vars.items():
                teacher_feed_vars[name] = teacher_program.global_block(
                )._clone_variable(
                    var, force_persistable=False)
            model = create(teacher_arch)
            train_fetches = model.train(teacher_feed_vars)
            teacher_loss = train_fetches['loss']

    # get all teacher variables
    teacher_vars = []
    for v in teacher_program.list_vars():
        # Same best-effort collection as for the student variables.
        try:
            teacher_vars.append((v.name, v.shape))
        except Exception:
            pass
    # uncomment the following lines to print all teacher variables
    # print("="*50 + "teacher_model_vars" + "="*50)
    # print(teacher_vars)

    exe.run(teacher_startup_program)
    assert FLAGS.teacher_pretrained, "teacher_pretrained should be set"
    checkpoint.load_params(exe, teacher_program, FLAGS.teacher_pretrained)
    teacher_program = teacher_program.clone(for_test=True)

    # Reload the student config after the teacher config was loaded above.
    cfg = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    data_name_map = {
        'target0': 'target0',
        'target1': 'target1',
        'target2': 'target2',
        'image': 'image',
        'gt_bbox': 'gt_bbox',
        'gt_class': 'gt_class',
        'gt_score': 'gt_score'
    }
    merge(teacher_program, fluid.default_main_program(), data_name_map, place)

    # The variable names below are hard-coded, so they depend on the order in
    # which the ops above were created.
    yolo_output_names = [
        'strided_slice_0.tmp_0', 'strided_slice_1.tmp_0',
        'strided_slice_2.tmp_0', 'strided_slice_3.tmp_0',
        'strided_slice_4.tmp_0', 'transpose_0.tmp_0', 'strided_slice_5.tmp_0',
        'strided_slice_6.tmp_0', 'strided_slice_7.tmp_0',
        'strided_slice_8.tmp_0', 'strided_slice_9.tmp_0', 'transpose_2.tmp_0',
        'strided_slice_10.tmp_0', 'strided_slice_11.tmp_0',
        'strided_slice_12.tmp_0', 'strided_slice_13.tmp_0',
        'strided_slice_14.tmp_0', 'transpose_4.tmp_0'
    ]

    distill_pairs = [['teacher_conv2d_6.tmp_1', 'conv2d_20.tmp_1'],
                     ['teacher_conv2d_14.tmp_1', 'conv2d_28.tmp_1'],
                     ['teacher_conv2d_22.tmp_1', 'conv2d_36.tmp_1']]

    distill_loss = l2_distill(
        distill_pairs, 100) if not cfg.use_fine_grained_loss else split_distill(
            yolo_output_names, 1000)
    loss = distill_loss + loss
    lr_builder = create('LearningRate')
    optim_builder = create('OptimizerBuilder')
    lr = lr_builder()
    opt = optim_builder(lr)
    opt.minimize(loss)

    exe.run(fluid.default_startup_program())
    fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel'
    ignore_params = cfg.finetune_exclude_pretrained_params \
                 if 'finetune_exclude_pretrained_params' in cfg else []
    # Resuming from a checkpoint takes precedence over pretrained weights.
    if FLAGS.resume_checkpoint:
        checkpoint.load_checkpoint(exe,
                                   fluid.default_main_program(),
                                   FLAGS.resume_checkpoint)
        start_iter = checkpoint.global_step()
    elif cfg.pretrain_weights and fuse_bn and not ignore_params:
        checkpoint.load_and_fusebn(exe,
                                   fluid.default_main_program(),
                                   cfg.pretrain_weights)
    elif cfg.pretrain_weights:
        checkpoint.load_params(
            exe,
            fluid.default_main_program(),
            cfg.pretrain_weights,
            ignore_params=ignore_params)

    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_reduce_ops = False
    build_strategy.fuse_all_optimizer_ops = False
    # only enable sync_bn in multi GPU devices
    sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn'
    build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \
        and cfg.use_gpu

    exec_strategy = fluid.ExecutionStrategy()
    # iteration number when CompiledProgram tries to drop local execution scopes.
    # Set it to be 1 to save memory usages, so that unused variables in
    # local execution scopes can be deleted after each iteration.
    exec_strategy.num_iteration_per_drop_scope = 1

    parallel_main = fluid.CompiledProgram(fluid.default_main_program(
    )).with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    compiled_eval_prog = fluid.CompiledProgram(eval_prog)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False
    if hasattr(model, 'is_bbox_normalized') and \
            callable(model.is_bbox_normalized):
        is_bbox_normalized = model.is_bbox_normalized()
    map_type = cfg.map_type if 'map_type' in cfg else '11point'
    best_box_ap_list = [0.0, 0]  #[map, iter]
    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)

    train_loader.start()
    for step_id in range(start_iter, cfg.max_iters):
        # The teacher loss is fetched under a 'teacher_' prefix, presumably the
        # prefix merge() gives teacher variables -- TODO confirm.
        teacher_loss_np, distill_loss_np, loss_np, lr_np = exe.run(
            parallel_main,
            fetch_list=[
                'teacher_' + teacher_loss.name, distill_loss.name, loss.name,
                lr.name
            ])
        if step_id % cfg.log_iter == 0:
            logger.info(
                "step {} lr {:.6f}, loss {:.6f}, distill_loss {:.6f}, teacher_loss {:.6f}".
                format(step_id, lr_np[0], loss_np[0], distill_loss_np[0],
                       teacher_loss_np[0]))
        # Snapshot on every snapshot_iter-th step except step 0, and on the
        # final step.
        if step_id % cfg.snapshot_iter == 0 and step_id != 0 or step_id == cfg.max_iters - 1:
            save_name = str(
                step_id) if step_id != cfg.max_iters - 1 else "model_final"
            checkpoint.save(exe,
                            fluid.default_main_program(),
                            os.path.join(save_dir, save_name))
            if FLAGS.save_inference:
                feeded_var_names = ['image', 'im_size']
                targets = list(fetches.values())
                fluid.io.save_inference_model(save_dir + '/infer',
                                              feeded_var_names, targets, exe,
                                              eval_prog)
            # eval
            results = eval_run(exe, compiled_eval_prog, eval_loader, eval_keys,
                               eval_values, eval_cls, cfg)
            resolution = None
            box_ap_stats = eval_results(results, cfg.metric, cfg.num_classes,
                                        resolution, is_bbox_normalized,
                                        FLAGS.output_eval, map_type,
                                        cfg['EvalReader']['dataset'])

            # Keep the best model according to the first box AP value.
            if box_ap_stats[0] > best_box_ap_list[0]:
                best_box_ap_list[0] = box_ap_stats[0]
                best_box_ap_list[1] = step_id
                checkpoint.save(exe,
                                fluid.default_main_program(),
                                os.path.join(save_dir, "best_model"))
                if FLAGS.save_inference:
                    feeded_var_names = ['image', 'im_size']
                    targets = list(fetches.values())
                    fluid.io.save_inference_model(save_dir + '/infer',
                                                  feeded_var_names, targets,
                                                  exe, eval_prog)
            logger.info("Best test box ap: {}, in step: {}".format(
                best_box_ap_list[0], best_box_ap_list[1]))
    train_loader.reset()
コード例 #13
0
ファイル: train.py プロジェクト: zhaoyuchen2018/benchmark
def train(args):
    """Build the train/test programs, then train and evaluate pass by pass.

    For every pass the function drains ``train_py_reader``, then drains
    ``test_py_reader``, prints running and per-pass mean loss/acc1/acc5 and
    saves the persistable variables of the train program under
    ``<args.model_save_dir>/<args.model>/<pass_id>``.

    Args:
        args: parsed command-line namespace. Attributes read here: ``model``,
            ``checkpoint``, ``pretrained_model``, ``with_mem_opt``,
            ``with_inplace``, ``model_save_dir``, ``enable_ce``, ``use_gpu``,
            ``batch_size`` and ``num_epochs``.

    NOTE(review): ``num_trainers`` is read but not defined in this function;
    presumably it is a module-level value -- confirm.
    """
    # parameters from arguments
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    with_memory_optimization = args.with_mem_opt
    model_save_dir = args.model_save_dir
    # os.getenv returns a string (or None when unset), so any non-empty value
    # selects the plain-Executor branch below.
    use_ngraph = os.getenv('FLAGS_use_ngraph')

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    if args.enable_ce:
        # Fixed seeds for the continuous-evaluation run.
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000

    train_py_reader, train_cost, train_acc1, train_acc5, global_lr = build_program(
        is_train=True,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)
    test_py_reader, test_cost, test_acc1, test_acc5 = build_program(
        is_train=False,
        main_prog=test_prog,
        startup_prog=startup_prog,
        args=args)
    test_prog = test_prog.clone(for_test=True)

    if with_memory_optimization and use_ngraph:
        fluid.memory_optimize(train_prog)
        fluid.memory_optimize(test_prog)

    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if checkpoint is not None:
        fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)

    if pretrained_model:

        def if_exist(var):
            # Only load variables that have a file in the pretrained directory.
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(
            exe, pretrained_model, main_program=train_prog, predicate=if_exist)

    if args.use_gpu:
        device_num = get_device_num()
    else:
        device_num = 1
    # Floor division: a batch size must be an integer. Plain `/` yields a
    # float on Python 3 (and is identical to `//` for ints on Python 2).
    train_batch_size = args.batch_size // device_num

    test_batch_size = 16
    if not args.enable_ce:
        # NOTE: the order of batch data generated by batch_reader
        # must be the same in the respective processes.
        shuffle_seed = 1 if num_trainers > 1 else None
        train_reader = reader.train(batch_size=train_batch_size, shuffle_seed=shuffle_seed)
        test_reader = reader.val(batch_size=test_batch_size)
    else:
        # use flowers dataset for CE and set use_xmap False to avoid disorder data
        # but it is time consuming. For faster speed, need another dataset.
        import random
        random.seed(0)
        np.random.seed(0)
        train_reader = paddle.batch(
            flowers.train(use_xmap=False),
            batch_size=train_batch_size,
            drop_last=True)
        test_reader = paddle.batch(
            flowers.test(use_xmap=False), batch_size=test_batch_size)

    train_py_reader.decorate_paddle_reader(train_reader)
    test_py_reader.decorate_paddle_reader(test_reader)

    if not use_ngraph:
        build_strategy = fluid.BuildStrategy()
        build_strategy.memory_optimize = args.with_mem_opt
        build_strategy.enable_inplace = args.with_inplace
        build_strategy.fuse_all_reduce_ops = 1
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = device_num
        exec_strategy.num_iteration_per_drop_scope = 10

        if num_trainers > 1 and args.use_gpu:
            dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog)
            # NOTE: the process is fast when num_threads is 1
            # for multi-process training.
            exec_strategy.num_threads = 1

        train_exe = fluid.ParallelExecutor(
            main_program=train_prog,
            use_cuda=bool(args.use_gpu),
            loss_name=train_cost.name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)
    else:
        train_exe = exe

    # Fetch by variable name; the variables are marked persistable before
    # their names are collected.
    train_fetch_vars = [train_cost, train_acc1, train_acc5, global_lr]
    train_fetch_list = []
    for var in train_fetch_vars:
        var.persistable = True
        train_fetch_list.append(var.name)

    test_fetch_vars = [test_cost, test_acc1, test_acc5]
    test_fetch_list = []
    for var in test_fetch_vars:
        var.persistable = True
        test_fetch_list.append(var.name)

    params = models.__dict__[args.model]().params
    for pass_id in range(params["num_epochs"]):
        train_py_reader.start()

        # Per-pass accumulators: [losses, acc1s, acc5s].
        train_info = [[], [], []]
        test_info = [[], [], []]
        train_time = []
        batch_id = 0
        # Batch durations since the last progress print (reset every 10 batches).
        time_record = []
        try:
            # Runs until the py_reader raises EOFException at the end of the pass.
            while True:
                t1 = time.time()

                if use_ngraph:
                    loss, acc1, acc5, lr = train_exe.run(
                        train_prog, fetch_list=train_fetch_list)
                else:
                    loss, acc1, acc5, lr = train_exe.run(
                        fetch_list=train_fetch_list)
                t2 = time.time()
                time_record.append(t2 - t1)
                loss = np.mean(np.array(loss))
                acc1 = np.mean(np.array(acc1))
                acc5 = np.mean(np.array(acc5))
                train_info[0].append(loss)
                train_info[1].append(acc1)
                train_info[2].append(acc5)
                lr = np.mean(np.array(lr))
                train_time.append(t2 - t1)

                if batch_id % 10 == 0:
                    period = np.mean(time_record)
                    time_record = []
                    print("Pass {0}, trainbatch {1}, loss {2}, \
                        acc1 {3}, acc5 {4}, lr {5}, time {6}"
                          .format(pass_id, batch_id, "%.5f"%loss, "%.5f"%acc1, "%.5f"%acc5, "%.5f" %
                                  lr, "%2.2f sec" % period))
                    sys.stdout.flush()
                batch_id += 1
        except fluid.core.EOFException:
            train_py_reader.reset()

        # From here on train_acc1/train_acc5 are rebound to per-pass means;
        # the graph variables are no longer needed because the fetch lists
        # above already hold their names.
        train_loss = np.array(train_info[0]).mean()
        train_acc1 = np.array(train_info[1]).mean()
        train_acc5 = np.array(train_info[2]).mean()
        # Mean batch time divided by the global batch size, i.e. seconds per
        # sample despite the name.
        train_speed = np.array(train_time).mean() / (train_batch_size *
                                                     device_num)

        test_py_reader.start()

        test_batch_id = 0
        try:
            while True:
                t1 = time.time()
                loss, acc1, acc5 = exe.run(program=test_prog,
                                           fetch_list=test_fetch_list)
                t2 = time.time()
                period = t2 - t1
                loss = np.mean(loss)
                acc1 = np.mean(acc1)
                acc5 = np.mean(acc5)
                test_info[0].append(loss)
                test_info[1].append(acc1)
                test_info[2].append(acc5)
                if test_batch_id % 10 == 0:
                    print("Pass {0},testbatch {1},loss {2}, \
                        acc1 {3},acc5 {4},time {5}"
                          .format(pass_id, test_batch_id, "%.5f"%loss,"%.5f"%acc1, "%.5f"%acc5,
                                  "%2.2f sec" % period))
                    sys.stdout.flush()
                test_batch_id += 1
        except fluid.core.EOFException:
            test_py_reader.reset()

        test_loss = np.array(test_info[0]).mean()
        test_acc1 = np.array(test_info[1]).mean()
        test_acc5 = np.array(test_info[2]).mean()

        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, "
              "test_loss {4}, test_acc1 {5}, test_acc5 {6}".format(
                  pass_id, "%.5f"%train_loss, "%.5f"%train_acc1, "%.5f"%train_acc5, "%.5f"%test_loss,
                  "%.5f"%test_acc1, "%.5f"%test_acc5))
        sys.stdout.flush()

        # One checkpoint directory per pass.
        model_path = os.path.join(model_save_dir + '/' + model_name,
                                  str(pass_id))
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        fluid.io.save_persistables(exe, model_path, main_program=train_prog)

        # This is for continuous evaluation only
        # NOTE(review): the loop bound is params["num_epochs"] while this
        # check uses args.num_epochs -- confirm the two always agree.
        if args.enable_ce and pass_id == args.num_epochs - 1:
            if device_num == 1:
                # Use the mean cost/acc for training
                print("kpis\ttrain_cost\t%s" % train_loss)
                print("kpis\ttrain_acc_top1\t%s" % train_acc1)
                print("kpis\ttrain_acc_top5\t%s" % train_acc5)
                # Use the mean cost/acc for testing
                print("kpis\ttest_cost\t%s" % test_loss)
                print("kpis\ttest_acc_top1\t%s" % test_acc1)
                print("kpis\ttest_acc_top5\t%s" % test_acc5)
                print("kpis\ttrain_speed\t%s" % train_speed)
            else:
                # Use the mean cost/acc for training
                print("kpis\ttrain_cost_card%s\t%s" % (device_num, train_loss))
                print("kpis\ttrain_acc_top1_card%s\t%s" %
                      (device_num, train_acc1))
                print("kpis\ttrain_acc_top5_card%s\t%s" %
                      (device_num, train_acc5))
                # Use the mean cost/acc for testing
                print("kpis\ttest_cost_card%s\t%s" % (device_num, test_loss))
                print("kpis\ttest_acc_top1_card%s\t%s" % (device_num, test_acc1))
                print("kpis\ttest_acc_top5_card%s\t%s" % (device_num, test_acc5))
                print("kpis\ttrain_speed_card%s\t%s" % (device_num, train_speed))
コード例 #14
0
ファイル: train.py プロジェクト: zhousanfu/paddle-demo
def main():
    """Train, validate and test the RNN language model on PTB data.

    Steps, in order: parse arguments and configure the ``"lm"`` logger, build
    the training program (model + global-norm gradient clipping + SGD with a
    fed learning rate) and a separate inference program, optionally load
    pretrained parameters, load the PTB data, run ``config.max_epoch``
    training epochs (validating and saving after each one unless profiling),
    and finally print the perplexity on the test split.

    Raises:
        Warning: if ``args.init_from_pretrain_model`` is set but
            ``<path>.pdparams`` does not exist.
    """
    args = parse_args()

    # check if set use_gpu=True in paddlepaddle cpu version
    check_cuda(args.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()

    # Log to a file when args.log_path is given, otherwise to the console.
    logger = logging.getLogger("lm")
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    if args.log_path:
        file_handler = logging.FileHandler(args.log_path)
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    else:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)
    logger.info('Running with args : {}'.format(args))

    config = RNNConfig(args)

    if not os.path.exists(args.save_model_dir):
        mkpath(args.save_model_dir)

    # define train program
    main_program = fluid.Program()
    startup_program = fluid.Program()
    if args.enable_ce:
        startup_program.random_seed = SEED
    with fluid.program_guard(main_program, startup_program):
        with fluid.unique_name.guard():
            res_vars = lm_model.lm_model(config.hidden_size,
                                         config.vocab_size,
                                         num_layers=config.num_layers,
                                         num_steps=config.num_steps,
                                         init_scale=config.init_scale,
                                         dropout=config.dropout,
                                         rnn_model=config.rnn_model,
                                         use_dataloader=args.use_dataloader)

            if args.use_dataloader:
                # With a dataloader the model returns it as an extra last item.
                dataloader = res_vars[-1]
                res_vars = res_vars[:-1]
            loss, last_hidden, last_cell, feed_order = res_vars

            fluid.clip.set_gradient_clip(
                clip=fluid.clip.GradientClipByGlobalNorm(
                    clip_norm=config.max_grad_norm))

            # The learning rate is a persistable global variable so that a
            # new value can be fed for every batch (see generate_new_lr).
            learning_rate = fluid.layers.create_global_var(
                name="learning_rate",
                shape=[1],
                value=1.0,
                dtype='float32',
                persistable=True)

            optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
            optimizer.minimize(loss)

    # define inference program
    inference_program = fluid.Program()
    inference_startup_program = fluid.Program()
    with fluid.program_guard(inference_program, inference_startup_program):
        with fluid.unique_name.guard():
            lm_model.lm_model(config.hidden_size,
                              config.vocab_size,
                              num_layers=config.num_layers,
                              num_steps=config.num_steps,
                              init_scale=config.init_scale,
                              dropout=config.dropout,
                              rnn_model=config.rnn_model,
                              use_dataloader=False)
    # Some op behaves differently for train and inference, we need to call
    # this clone function to ensure every op is right for inference.
    inference_program = inference_program.clone(for_test=True)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = Executor(place)
    exe.run(startup_program)

    if args.init_from_pretrain_model:
        if not os.path.exists(args.init_from_pretrain_model + '.pdparams'):
            print(args.init_from_pretrain_model)
            raise Warning("The pretrained params do not exist.")
        fluid.load(main_program, args.init_from_pretrain_model)
        print("finish initing model from pretrained params from %s" %
              (args.init_from_pretrain_model))

    device_count = len(fluid.cuda_places()) if args.use_gpu else len(
        fluid.cpu_places())

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = device_count
    exec_strategy.num_iteration_per_drop_scope = 100

    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_optimizer_ops = True
    try:
        fluid.require_version(min_version='1.7.0')
        build_strategy.enable_auto_fusion = args.enable_auto_fusion
    except Exception:
        # Best effort: auto fusion is simply left unset when the version
        # check (or the assignment) fails.
        logger.info("PaddlePaddle version 1.7.0 or higher is "
                    "required when you want to enable fusion_group.")

    if args.parallel:
        train_program = fluid.compiler.CompiledProgram(
            main_program).with_data_parallel(loss_name=loss.name,
                                             build_strategy=build_strategy,
                                             exec_strategy=exec_strategy)
    else:
        train_program = fluid.compiler.CompiledProgram(main_program)

    data_path = args.data_path
    print("begin to load data")
    ptb_data = reader.get_ptb_data(data_path)
    print("finished load data")
    train_data, valid_data, test_data = ptb_data

    def generate_init_data():
        """Return zero-filled initial hidden and cell states."""
        batch_size = config.batch_size * device_count
        init_hidden = np.zeros(
            (batch_size, config.num_layers, config.hidden_size),
            dtype='float32')
        init_cell = np.zeros(
            (batch_size, config.num_layers, config.hidden_size),
            dtype='float32')
        return init_hidden, init_cell

    def generate_new_lr(epoch_id=0, device_count=1):
        """Return the decayed learning rate, repeated once per device."""
        # No decay until epoch_id + 1 exceeds config.epoch_start_decay.
        new_lr = config.base_learning_rate * (config.lr_decay**max(
            epoch_id + 1 - config.epoch_start_decay, 0.0))
        lr = np.ones((device_count), dtype='float32') * new_lr
        return lr

    def prepare_input(batch,
                      init_hidden=None,
                      init_cell=None,
                      epoch_id=0,
                      with_lr=True,
                      device_count=1):
        """Turn an ``(x, y)`` batch plus optional states into a feed dict."""
        x, y = batch
        x = x.reshape((-1, config.num_steps, 1))
        y = y.reshape((-1, 1))

        res = {}
        res['x'] = x
        res['y'] = y
        if init_hidden is not None:
            res['init_hidden'] = init_hidden
        if init_cell is not None:
            res['init_cell'] = init_cell
        if with_lr:
            res['learning_rate'] = generate_new_lr(epoch_id, device_count)

        return res

    def evaluate(data):
        """Run ``inference_program`` over ``data`` and return the perplexity."""
        # Evaluation uses the same batch size as training
        # (config.batch_size * device_count).
        eval_data_iter = reader.get_data_iter(data,
                                              config.batch_size * device_count,
                                              config.num_steps)
        total_loss = 0.0
        iters = 0
        init_hidden, init_cell = generate_init_data()
        for batch_id, batch in enumerate(eval_data_iter):
            input_data_feed = prepare_input(batch,
                                            init_hidden,
                                            init_cell,
                                            epoch_id=0,
                                            with_lr=False)
            # The fetch names come from the training program's variables.
            # NOTE(review): presumably both programs yield identical names
            # because each is built under its own unique_name.guard() --
            # confirm.
            fetch_outs = exe.run(
                program=inference_program,
                feed=input_data_feed,
                fetch_list=[loss.name, last_hidden.name, last_cell.name],
                use_program_cache=False)

            cost_eval = np.array(fetch_outs[0])
            # Carry the final states over to the next batch.
            init_hidden = np.array(fetch_outs[1])
            init_cell = np.array(fetch_outs[2])

            total_loss += cost_eval
            iters += config.num_steps

        ppl = np.exp(total_loss / iters)
        return ppl

    def get_log_interval(data_len):
        """Return the logging interval in batches; always at least 1."""
        num_batchs = data_len // config.batch_size
        epoch_size = (num_batchs - 1) // config.num_steps
        log_interval = max(1, epoch_size // 10)
        return log_interval

    def train_an_epoch(epoch_id, batch_times):
        """Train one epoch from an in-memory iterator; return the perplexity.

        The run time of every batch is appended to ``batch_times``.
        """
        # get train epoch size
        log_interval = get_log_interval(len(train_data))
        train_data_iter = reader.get_data_iter(
            train_data, config.batch_size * device_count, config.num_steps)

        total_loss = 0
        iters = 0

        init_hidden, init_cell = generate_init_data()
        for batch_id, batch in enumerate(train_data_iter):
            input_data_feed = prepare_input(batch,
                                            init_hidden=init_hidden,
                                            init_cell=init_cell,
                                            epoch_id=epoch_id,
                                            with_lr=True,
                                            device_count=device_count)
            batch_start_time = time.time()
            fetch_outs = exe.run(train_program,
                                 feed=input_data_feed,
                                 fetch_list=[
                                     loss.name, "learning_rate",
                                     last_hidden.name, last_cell.name
                                 ],
                                 use_program_cache=True)
            batch_time = time.time() - batch_start_time
            batch_times.append(batch_time)

            cost_train = np.array(fetch_outs[0])
            lr = np.array(fetch_outs[1])
            init_hidden = np.array(fetch_outs[2])
            init_cell = np.array(fetch_outs[3])
            total_loss += cost_train
            iters += config.num_steps
            if batch_id > 0 and batch_id % log_interval == 0:
                ppl = np.exp(total_loss / iters)
                print(
                    "-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
                    % (epoch_id, batch_id, batch_time, ppl[0], lr[0]))

            # profiler tools for benchmark
            if args.profile and batch_id == log_interval:
                profiler.reset_profiler()
            elif args.profile and batch_id == (log_interval + 5):
                break
        ppl = np.exp(total_loss / iters)
        return ppl

    def train_an_epoch_dataloader(epoch_id, batch_times):
        """Train one epoch from ``dataloader``; return the perplexity.

        Only states and learning rate are fed; x/y come from the dataloader.
        The time between consecutive iterations is appended to
        ``batch_times``.
        """
        # get train epoch size
        log_interval = get_log_interval(len(train_data))

        init_hidden, init_cell = generate_init_data()

        total_loss = 0
        iters = 0

        dataloader.start()
        batch_id = 0
        try:
            # Runs until the dataloader raises EOFException (or the
            # profiling break below triggers).
            while True:
                data_feeds = {}
                if batch_id == 0:
                    batch_time = 0
                    batch_start_time = time.time()
                else:
                    batch_time = time.time() - batch_start_time
                    batch_times.append(batch_time)
                    batch_start_time = time.time()

                new_lr = generate_new_lr(epoch_id, device_count)
                data_feeds['learning_rate'] = new_lr
                data_feeds["init_hidden"] = init_hidden
                data_feeds["init_cell"] = init_cell

                fetch_outs = exe.run(train_program,
                                     feed=data_feeds,
                                     fetch_list=[
                                         loss.name, "learning_rate",
                                         last_hidden.name, last_cell.name
                                     ],
                                     use_program_cache=True)

                cost_train = np.array(fetch_outs[0])
                lr = np.array(fetch_outs[1])
                init_hidden = np.array(fetch_outs[2])
                init_cell = np.array(fetch_outs[3])

                total_loss += cost_train
                iters += config.num_steps
                # get_log_interval() never returns 0, so no zero guard is
                # needed before the modulo.
                if batch_id > 0 and batch_id % log_interval == 0:
                    ppl = np.exp(total_loss / iters)
                    print(
                        "-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
                        % (epoch_id, batch_id, batch_time, ppl[0], lr[0]))

                batch_id += 1
                # profiler tools for benchmark
                if args.profile and batch_id == log_interval:
                    profiler.reset_profiler()
                elif args.profile and batch_id == (log_interval + 5):
                    break
        except fluid.core.EOFException:
            dataloader.reset()

        # Account for the last iteration, which the loop did not time.
        batch_times.append(time.time() - batch_start_time)
        ppl = np.exp(total_loss / iters)
        return ppl

    def train():
        """Run all epochs; validate and save after each one unless profiling."""
        if args.use_dataloader:

            def data_gen():
                data_iter_size = config.batch_size
                train_batches = reader.get_data_iter(train_data,
                                                     data_iter_size,
                                                     config.num_steps)
                for batch in train_batches:
                    x, y = batch
                    x = x.reshape((-1, config.num_steps, 1))
                    y = y.reshape((-1, 1))
                    yield x, y

            dataloader.set_batch_generator(data_gen)

        # NOTE(zjl): sometimes we have not enough data for eval if batch_size is large, i.e., 2100
        # Just skip to avoid error
        def is_valid_data(data, batch_size, num_steps):
            data_len = len(data)
            batch_len = data_len // batch_size
            epoch_size = (batch_len - 1) // num_steps
            return epoch_size >= 1

        total_time = 0.0
        for epoch_id in range(config.max_epoch):
            batch_times = []
            epoch_start_time = time.time()
            if args.use_dataloader:
                train_ppl = train_an_epoch_dataloader(epoch_id, batch_times)
            else:
                train_ppl = train_an_epoch(epoch_id, batch_times)
            epoch_time = time.time() - epoch_start_time
            total_time += epoch_time
            print(
                "\nTrain epoch:[%d]; epoch Time: %.5f; ppl: %.5f; avg_time: %.5f steps/s \n"
                % (epoch_id, epoch_time, train_ppl[0],
                   len(batch_times) / sum(batch_times)))

            # FIXME(zjl): ppl[0] increases as batch_size increases.
            # We should find a better way to calculate ppl by normalizing batch_size.
            if device_count == 1 and config.batch_size <= 20 and epoch_id == 0 and train_ppl[
                    0] > 1000:
                # for bad init, after first epoch, the loss is over 1000
                # no more need to continue
                print(
                    "Parameters are randomly initialized and not good this time because the loss is over 1000 after the first epoch."
                )
                print("Abort this training process and please start again.")
                return

            if epoch_id == config.max_epoch - 1 and args.enable_ce:
                # kpis
                print("ptblm\tlstm_language_model_%s_duration_card%d\t%s" %
                      (args.rnn_model, device_count,
                       total_time / config.max_epoch))
                print("ptblm\tlstm_language_model_%s_loss_card%d\t%s" %
                      (args.rnn_model, device_count, train_ppl[0]))

            if not args.profile:
                valid_data_valid = is_valid_data(valid_data, config.batch_size,
                                                 config.num_steps)
                if valid_data_valid:
                    valid_ppl = evaluate(valid_data)
                    print("Valid ppl: %.5f" % valid_ppl[0])
                else:
                    print(
                        'WARNING: length of valid_data is {}, which is not enough for batch_size {} and num_steps {}'
                        .format(len(valid_data), config.batch_size,
                                config.num_steps))

                # Parameters are saved as <save_model_dir>/<epoch_id>/params.
                save_model_dir = os.path.join(args.save_model_dir,
                                              str(epoch_id))
                if not os.path.exists(save_model_dir):
                    mkpath(save_model_dir)
                save_model_dir = os.path.join(save_model_dir, 'params')

                fluid.save(main_program, save_model_dir)
                print("Saved model to: %s.\n" % save_model_dir)

    with profile_context(args.profile, args.profiler_path):
        train()

    test_ppl = evaluate(test_data)
    print("Test ppl:", test_ppl[0])
コード例 #15
0
    def do_training(self, fleet, args):
        """
        Train from ``self.pyreader`` until the reader is exhausted.

        Args:
            fleet (DistributedTranspiler): DistributedTranspiler inherited base class Fleet
            args (ArgumentParser): run args to config dist fleet; ``trainers``
                and ``current_id`` are read here.
        Returns:
            list: one entry per executed batch, the mean of the fetched
                ``self.avg_cost``.
        """
        exe = fluid.Executor(paddle.CPUPlace())
        exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        train_generator = py_reader1.CriteoDataset(1000001)
        # The same data path is listed twice.
        file_list = [str(DATA_PATH)] * 2
        train_reader = paddle.batch(
            train_generator.train(file_list, args.trainers, args.current_id),
            batch_size=4)
        self.pyreader.decorate_paddle_reader(train_reader)

        # Notice: py_reader should use try & catch EOFException method to enter the dataset
        # reader.start() must declare in advance
        self.pyreader.start()
        train_info = []
        # TODO: saving persistables after a fixed number of batches was
        # disabled here because "save has a bug"; restore it once fixed.
        try:
            # Runs until the py_reader raises EOFException.
            while True:
                avg_cost = exe.run(program=paddle.static.default_main_program(),
                                   fetch_list=[self.avg_cost.name])
                train_info.append(np.mean(avg_cost))
        except fluid.core.EOFException:
            self.pyreader.reset()
        fleet.stop_worker()
        return train_info
コード例 #16
0
ファイル: train.py プロジェクト: zhyq/LARK
def train(args):
    """Run ERNIE pre-training with periodic logging, checkpoints and validation.

    Builds a training program and a test program through ``create_model``,
    optionally transpiles the training program for multi-trainer "nccl2"
    mode, then drives a ``fluid.ParallelExecutor`` until
    ``args.num_train_steps`` is reached or the reader raises
    ``fluid.core.EOFException``.

    Args:
        args: Options object. Attributes read here: ``ernie_config_path``,
            ``warmup_steps``, ``num_train_steps``, ``learning_rate``,
            ``weight_decay``, ``lr_scheduler``, ``use_fp16``,
            ``loss_scaling``, ``use_cuda``, ``batch_size``, ``max_seq_len``,
            ``is_distributed``, ``init_checkpoint``, ``train_filelist``,
            ``vocab_path``, ``epoch``, ``generate_neg_sample``,
            ``use_fast_executor``, ``skip_steps``, ``save_steps``,
            ``checkpoints``, ``valid_filelist`` and ``validation_steps``.

    Returns:
        None. Progress is printed and checkpoints are written under
        ``args.checkpoints``.

    Notes:
        When ``args.is_distributed`` is set, the environment variables
        ``worker_endpoints`` (comma separated) and ``current_endpoint`` are
        read to derive the trainer count and this trainer's id.
    """
    print("pretraining start")
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    # Training graph: model plus optimizer, built inside train_program.
    train_program = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            train_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                pyreader_name='train_reader', ernie_config=ernie_config)
            scheduled_lr = optimization(loss=total_loss,
                                        warmup_steps=args.warmup_steps,
                                        num_train_steps=args.num_train_steps,
                                        learning_rate=args.learning_rate,
                                        train_program=train_program,
                                        startup_prog=startup_prog,
                                        weight_decay=args.weight_decay,
                                        scheduler=args.lr_scheduler,
                                        use_fp16=args.use_fp16,
                                        loss_scaling=args.loss_scaling)

            # The variables fetched during training are excluded from the
            # memory optimisation pass.
            fluid.memory_optimize(input_program=train_program,
                                  skip_opt_set=[
                                      next_sent_acc.name, mask_lm_loss.name,
                                      total_loss.name
                                  ])

    # Test graph: same model, its own reader, no optimizer.
    # NOTE(review): this rebinds next_sent_acc / mask_lm_loss / total_loss to
    # the test program's variables; every later `.name` lookup (including the
    # training fetches) therefore relies on both builds generating identical
    # variable names under unique_name.guard() -- confirm.
    test_prog = fluid.Program()
    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            test_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                pyreader_name='test_reader', ernie_config=ernie_config)

    test_prog = test_prog.clone(for_test=True)

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    print("Device count %d" % dev_count)
    print("theoretical memory usage: ")
    print(
        fluid.contrib.memory_usage(program=train_program,
                                   batch_size=args.batch_size //
                                   args.max_seq_len))

    # Single-trainer defaults; overwritten below in distributed mode.
    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    print("args.is_distributed:", args.is_distributed)
    if args.is_distributed:
        worker_endpoints_env = os.getenv("worker_endpoints")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)
        current_endpoint = os.getenv("current_endpoint")
        # This trainer's id is its position in the endpoint list.
        trainer_id = worker_endpoints.index(current_endpoint)
        if trainer_id == 0:
            print("train_id == 0, sleep 60s")
            time.sleep(60)
        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
              trainer_id:{}".format(worker_endpoints, trainers_num,
                                    current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_program,
                    startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.init_checkpoint and args.init_checkpoint != "":
        init_checkpoint(exe, args.init_checkpoint, train_program,
                        args.use_fp16)

    data_reader = ErnieDataReader(filelist=args.train_filelist,
                                  batch_size=args.batch_size,
                                  vocab_path=args.vocab_path,
                                  voc_size=ernie_config['vocab_size'],
                                  epoch=args.epoch,
                                  max_seq_len=args.max_seq_len,
                                  generate_neg_sample=args.generate_neg_sample)

    exec_strategy = fluid.ExecutionStrategy()
    if args.use_fast_executor:
        exec_strategy.use_experimental_executor = True
    exec_strategy.num_threads = dev_count
    exec_strategy.num_iteration_per_drop_scope = min(10, args.skip_steps)

    build_strategy = fluid.BuildStrategy()
    build_strategy.remove_unnecessary_lock = False

    train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                       loss_name=total_loss.name,
                                       build_strategy=build_strategy,
                                       exec_strategy=exec_strategy,
                                       main_program=train_program,
                                       num_trainers=nccl2_num_trainers,
                                       trainer_id=nccl2_trainer_id)

    # `predict` only exists when a validation file list was given; the
    # validation branch in the loop below is guarded by the same option.
    if args.valid_filelist and args.valid_filelist != "":
        predict = predict_wrapper(args,
                                  exe,
                                  ernie_config,
                                  test_prog=test_prog,
                                  pyreader=test_pyreader,
                                  fetch_list=[
                                      next_sent_acc.name, mask_lm_loss.name,
                                      total_loss.name
                                  ])

    train_pyreader.decorate_tensor_provider(data_reader.data_generator())
    train_pyreader.start()
    steps = 0
    cost = []
    lm_cost = []
    acc = []
    time_begin = time.time()
    while steps < args.num_train_steps:
        try:
            # The step counter and the logging interval are both scaled by
            # the number of trainers.
            steps += nccl2_num_trainers
            skip_steps = args.skip_steps * nccl2_num_trainers

            # Trainers other than trainer 0 only run the step: no fetching,
            # logging, checkpointing or validation.
            if nccl2_trainer_id != 0:
                train_exe.run(fetch_list=[])
                continue

            if steps % skip_steps != 0:
                train_exe.run(fetch_list=[])
            else:
                each_next_acc, each_mask_lm_cost, each_total_cost, np_lr = train_exe.run(
                    fetch_list=[
                        next_sent_acc.name, mask_lm_loss.name, total_loss.name,
                        scheduled_lr.name
                    ])
                acc.extend(each_next_acc)
                lm_cost.extend(each_mask_lm_cost)
                cost.extend(each_total_cost)

                print("feed_queue size", train_pyreader.queue.size())
                time_end = time.time()
                used_time = time_end - time_begin
                epoch, current_file_index, total_file, current_file, mask_type = data_reader.get_progress(
                )
                print("current learning_rate:%f" % np_lr[0])
                print(
                    "epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                    "ppl: %f, next_sent_acc: %f, speed: %f steps/s, file: %s, mask_type: %s"
                    %
                    (epoch, current_file_index, total_file, steps,
                     np.mean(np.array(cost)), np.mean(np.exp(
                         np.array(lm_cost))), np.mean(np.array(acc)),
                     skip_steps / used_time, current_file, mask_type))
                # Start a fresh accumulation window for the next log line.
                cost = []
                lm_cost = []
                acc = []
                time_begin = time.time()

            if steps % args.save_steps == 0:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)

            if args.valid_filelist and steps % args.validation_steps == 0:
                # Read the epoch here rather than reusing the value from the
                # logging branch: that branch may not have run yet (or not
                # recently), which previously left `epoch` unbound and made
                # the print below raise NameError.
                epoch, _, _, _, _ = data_reader.get_progress()
                vali_cost, vali_lm_cost, vali_acc, vali_steps, vali_speed = predict(
                )
                print("[validation_set] epoch: %d, step: %d, "
                      "loss: %f, global ppl: %f, batch-averged ppl: %f, "
                      "next_sent_acc: %f, speed: %f steps/s" %
                      (epoch, steps, np.mean(np.array(vali_cost) / vali_steps),
                       np.exp(np.mean(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.exp(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.array(vali_acc) / vali_steps), vali_speed))

        except fluid.core.EOFException:
            # The reader ran out of data before num_train_steps was reached.
            train_pyreader.reset()
            break
コード例 #17
0
ファイル: train_executor.py プロジェクト: zzz2010/Contrib
def main(args):
    """Train DANet on Cityscapes with a fluid Executor, evaluating every epoch.

    Builds the training graph in the default main program and a test graph in
    a program cloned from it, then for each epoch runs one pass over the
    training reader followed by one pass over the validation reader. Loss and
    mIoU curves are plotted to ``./DANet_loss_executor.jpg`` and
    ``./DANet_miou_executor.jpg``; checkpoints go under ``./checkpoint``.

    Args:
        args: Options object. Attributes read here: ``crop_size``,
            ``batch_size``, ``epoch_num``, ``num_classes``, ``data_folder``,
            ``cuda``, ``seed``, ``backbone``, ``base_size``, ``scale``,
            ``use_data_parallel``, ``load_pretrained_model``,
            ``load_better_model`` and ``save_model``. It is also passed
            whole to ``get_model`` and ``optimizer_setting``.

    Returns:
        None.

    Raises:
        AssertionError: if a model load is requested but ``args.save_model``
            does not exist on disk.
    """
    image_shape = args.crop_size
    image = fluid.layers.data(name='image',
                              shape=[3, image_shape, image_shape],
                              dtype='float32')
    label = fluid.layers.data(name='label',
                              shape=[image_shape, image_shape],
                              dtype='int64')

    batch_size = args.batch_size
    epoch_num = args.epoch_num
    num_classes = args.num_classes
    data_root = args.data_folder
    # `num` is the device count; it scales the metric weights below and is
    # passed to the training dataset as gpu_num.
    if args.cuda:
        num = fluid.core.get_cuda_device_count()
        print('The number of GPU: {}'.format(num))
    else:
        num = _cpu_num()
        print('The number of CPU: {}'.format(num))

    # program
    start_prog = fluid.default_startup_program()
    train_prog = fluid.default_main_program()

    start_prog.random_seed = args.seed
    train_prog.random_seed = args.seed
    np.random.seed(args.seed)
    random.seed(args.seed)

    # clone
    # The clone is taken before the model is built, so at this point it holds
    # only the two data layers declared above.
    test_prog = train_prog.clone(for_test=True)

    logging.basicConfig(
        level=logging.INFO,
        filename='DANet_{}_train_executor.log'.format(args.backbone),
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logging.info('DANet')
    logging.info(args)

    with fluid.program_guard(train_prog, start_prog):
        with fluid.unique_name.guard():
            train_py_reader = fluid.io.PyReader(feed_list=[image, label],
                                                capacity=64,
                                                use_double_buffer=True,
                                                iterable=False)
            train_data = cityscapes_train(data_root=data_root,
                                          base_size=args.base_size,
                                          crop_size=args.crop_size,
                                          scale=args.scale,
                                          xmap=True,
                                          batch_size=batch_size,
                                          gpu_num=num)
            batch_train_data = paddle.batch(paddle.reader.shuffle(
                train_data, buf_size=batch_size * 16),
                                            batch_size=batch_size,
                                            drop_last=True)
            train_py_reader.decorate_sample_list_generator(batch_train_data)

            model = get_model(args)
            pred, pred2, pred3 = model(image)
            train_loss = loss_fn(pred,
                                 pred2,
                                 pred3,
                                 label,
                                 num_classes=num_classes)
            train_avg_loss = fluid.layers.mean(train_loss)
            optimizer = optimizer_setting(args)
            optimizer.minimize(train_avg_loss)
            # NOTE (translated from the original comment): this miou is not
            # the real one.
            # These get train-specific names so the test-graph build below
            # cannot rebind what the training fetch list refers to.
            train_miou, train_wrong, train_correct = mean_iou(
                pred, label, num_classes=num_classes)

    with fluid.program_guard(test_prog, start_prog):
        with fluid.unique_name.guard():
            test_py_reader = fluid.io.PyReader(feed_list=[image, label],
                                               capacity=64,
                                               iterable=False,
                                               use_double_buffer=True)
            val_data = cityscapes_val(data_root=data_root,
                                      base_size=args.base_size,
                                      crop_size=args.crop_size,
                                      scale=args.scale,
                                      xmap=True)
            batch_test_data = paddle.batch(val_data,
                                           batch_size=batch_size,
                                           drop_last=True)
            test_py_reader.decorate_sample_list_generator(batch_test_data)

            # From here on pred / pred2 / pred3 are the test-graph outputs;
            # they are what save_inference_model receives at the end.
            model = get_model(args)
            pred, pred2, pred3 = model(image)
            test_loss = loss_fn(pred,
                                pred2,
                                pred3,
                                label,
                                num_classes=num_classes)
            test_avg_loss = fluid.layers.mean(test_loss)
            # NOTE (translated from the original comment): this miou is not
            # the real one.
            test_miou, test_wrong, test_correct = mean_iou(
                pred, label, num_classes=num_classes)

    place = fluid.CUDAPlace(0) if args.cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(start_prog)

    if args.use_data_parallel and args.cuda:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = fluid.core.get_cuda_device_count()
        exec_strategy.num_iteration_per_drop_scope = 100
        build_strategy = fluid.BuildStrategy()
        build_strategy.sync_batch_norm = True
        print("sync_batch_norm = True!")
        compiled_train_prog = fluid.compiler.CompiledProgram(
            train_prog).with_data_parallel(loss_name=train_avg_loss.name,
                                           build_strategy=build_strategy,
                                           exec_strategy=exec_strategy)
    else:
        compiled_train_prog = fluid.compiler.CompiledProgram(train_prog)

    # Load the pretrained model
    if args.load_pretrained_model:
        assert os.path.exists(
            args.save_model
        ), "your input save_model: {} ,but '{}' is not exists".format(
            args.save_model, args.save_model)
        load_model(args.save_model, exe, program=train_prog)
        print('load pretrained model!')

    # Load the best model (reads the same args.save_model path as above)
    if args.load_better_model:
        assert os.path.exists(
            args.save_model
        ), "your input save_model: {} ,but '{}' is not exists".format(
            args.save_model, args.save_model)
        load_model(args.save_model, exe, program=train_prog)
        print('load better model!')

    # Accuracy metric objects are used here as weighted running averages for
    # both the loss and the mIoU values.
    train_iou_manager = fluid.metrics.Accuracy()
    train_avg_loss_manager = fluid.metrics.Accuracy()
    test_iou_manager = fluid.metrics.Accuracy()
    test_avg_loss_manager = fluid.metrics.Accuracy()
    better_miou_train = 0
    better_miou_test = 0

    train_loss_title = 'Train_loss'
    test_loss_title = 'Test_loss'

    train_iou_title = 'Train_mIOU'
    test_iou_title = 'Test_mIOU'

    plot_loss = Ploter(train_loss_title, test_loss_title)
    plot_iou = Ploter(train_iou_title, test_iou_title)

    for epoch in range(epoch_num):
        prev_time = datetime.now()
        train_avg_loss_manager.reset()
        train_iou_manager.reset()
        logging.info('training, epoch = {}'.format(epoch + 1))
        train_py_reader.start()
        batch_id = 0
        while True:
            try:
                train_fetch_list = [
                    train_avg_loss, train_miou, train_wrong, train_correct
                ]
                train_avg_loss_value, train_iou_value, _, _ = exe.run(
                    program=compiled_train_prog, fetch_list=train_fetch_list)

                train_iou_manager.update(train_iou_value,
                                         weight=int(batch_size * num))
                train_avg_loss_manager.update(train_avg_loss_value,
                                              weight=int(batch_size * num))
                batch_train_str = "epoch: {}, batch: {}, train_avg_loss: {:.6f}, " \
                                  "train_miou: {:.6f}.".format(epoch + 1,
                                                               batch_id + 1,
                                                               train_avg_loss_value[0],
                                                               train_iou_value[0])
                if batch_id % 40 == 0:
                    logging.info(batch_train_str)
                    print(batch_train_str)
                batch_id += 1
            except fluid.core.EOFException:
                # End of the training pass for this epoch.
                train_py_reader.reset()
                break
        cur_time = datetime.now()
        # total_seconds() keeps the days component that `.seconds` drops, so
        # an epoch longer than 24 hours is still reported correctly.
        elapsed = int((cur_time - prev_time).total_seconds())
        h, remainder = divmod(elapsed, 3600)
        m, s = divmod(remainder, 60)
        time_str = " Time %02d:%02d:%02d" % (h, m, s)
        train_str = "epoch: {}, train_avg_loss: {:.6f}, " \
                    "train_miou: {:.6f}.".format(epoch + 1,
                                                 train_avg_loss_manager.eval()[0],
                                                 train_iou_manager.eval()[0])
        print(train_str + time_str + '\n')
        logging.info(train_str + time_str)
        plot_loss.append(train_loss_title, epoch,
                         train_avg_loss_manager.eval()[0])
        plot_loss.plot('./DANet_loss_executor.jpg')
        plot_iou.append(train_iou_title, epoch, train_iou_manager.eval()[0])
        plot_iou.plot('./DANet_miou_executor.jpg')

        # save_model
        # Keep only the best-so-far training checkpoint: remove the previous
        # best directory before writing the new one.
        if better_miou_train < train_iou_manager.eval()[0]:
            shutil.rmtree('./checkpoint/DANet_better_train_{:.4f}'.format(
                better_miou_train),
                          ignore_errors=True)
            better_miou_train = train_iou_manager.eval()[0]
            logging.warning(
                '-----------train---------------better_train: {:.6f}, epoch: {}, -----------Train model saved successfully!\n'
                .format(better_miou_train, epoch + 1))
            save_dir = './checkpoint/DANet_better_train_{:.4f}'.format(
                better_miou_train)
            save_model(save_dir, exe, program=train_prog)
        if (epoch + 1) % 5 == 0:
            save_dir = './checkpoint/DANet_epoch_train'
            save_model(save_dir, exe, program=train_prog)

        # test
        test_py_reader.start()
        test_iou_manager.reset()
        test_avg_loss_manager.reset()
        prev_time = datetime.now()
        logging.info('testing, epoch = {}'.format(epoch + 1))
        batch_id = 0
        while True:
            try:
                test_fetch_list = [
                    test_avg_loss, test_miou, test_wrong, test_correct
                ]
                test_avg_loss_value, test_iou_value, _, _ = exe.run(
                    program=test_prog, fetch_list=test_fetch_list)
                test_iou_manager.update(test_iou_value,
                                        weight=int(batch_size * num))
                test_avg_loss_manager.update(test_avg_loss_value,
                                             weight=int(batch_size * num))
                batch_test_str = "epoch: {}, batch: {}, test_avg_loss: {:.6f}, " \
                                 "test_miou: {:.6f}. ".format(epoch + 1,
                                                              batch_id + 1,
                                                              test_avg_loss_value[0],
                                                              test_iou_value[0])
                if batch_id % 40 == 0:
                    logging.info(batch_test_str)
                    print(batch_test_str)
                batch_id += 1
            except fluid.core.EOFException:
                # End of the validation pass for this epoch.
                test_py_reader.reset()
                break
        cur_time = datetime.now()
        elapsed = int((cur_time - prev_time).total_seconds())
        h, remainder = divmod(elapsed, 3600)
        m, s = divmod(remainder, 60)
        time_str = " Time %02d:%02d:%02d" % (h, m, s)
        test_str = "epoch: {}, test_avg_loss: {:.6f}, " \
                   "test_miou: {:.6f}.".format(epoch + 1,
                                               test_avg_loss_manager.eval()[0],
                                               test_iou_manager.eval()[0])
        print(test_str + time_str + '\n')
        logging.info(test_str + time_str)
        plot_loss.append(test_loss_title, epoch,
                         test_avg_loss_manager.eval()[0])
        plot_loss.plot('./DANet_loss_executor.jpg')
        plot_iou.append(test_iou_title, epoch, test_iou_manager.eval()[0])
        plot_iou.plot('./DANet_miou_executor.jpg')

        # save_model_infer
        # Same keep-only-the-best policy, applied to the inference model.
        if better_miou_test < test_iou_manager.eval()[0]:
            shutil.rmtree('./checkpoint/infer/DANet_better_test_{:.4f}'.format(
                better_miou_test),
                          ignore_errors=True)
            better_miou_test = test_iou_manager.eval()[0]
            logging.warning(
                '------------test-------------infer better_test: {:.6f}, epoch: {}, ----------------Inference model saved successfully!\n'
                .format(better_miou_test, epoch + 1))
            save_dir = './checkpoint/infer/DANet_better_test_{:.4f}'.format(
                better_miou_test)
            # save_model(save_dir, exe, program=test_prog)
            fluid.io.save_inference_model(save_dir, [image.name],
                                          [pred, pred2, pred3], exe)
            print('Inference model saved successfully')
コード例 #18
0
def main():
    """Train the detector named in the config file, optionally evaluating it.

    Takes no arguments: options come from the module-level ``FLAGS`` object
    and from the config returned by ``load_config(FLAGS.config)``. The
    function builds a training program (and, with ``FLAGS.eval``, an eval
    program), loads a checkpoint or pretrained weights if configured, then
    runs iterations ``start_iter .. cfg.max_iters - 1``, logging every
    ``cfg.log_iter`` iterations and saving a snapshot every
    ``cfg.snapshot_iter`` iterations and on the last iteration.

    Raises:
        ValueError: if the config has no ``'architecture'`` entry.
    """
    env = os.environ
    # Distributed mode is inferred from the environment: both variables must
    # be present.
    FLAGS.dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env
    if FLAGS.dist:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        import random
        # Each trainer seeds its RNGs differently (99 + its id).
        local_seed = (99 + trainer_id)
        random.seed(local_seed)
        np.random.seed(local_seed)

    cfg = load_config(FLAGS.config)
    if 'architecture' in cfg:
        main_arch = cfg.architecture
    else:
        raise ValueError("'architecture' not specified in config file.")

    merge_config(FLAGS.opt)

    if 'log_iter' not in cfg:
        cfg.log_iter = 20

    # check if set use_gpu=True in paddlepaddle cpu version
    check_gpu(cfg.use_gpu)
    # `trainer_id` is only bound in distributed mode; the `not FLAGS.dist`
    # test short-circuits before it is read otherwise.
    if not FLAGS.dist or trainer_id == 0:
        print_total_cfg(cfg)

    if cfg.use_gpu:
        devices_num = fluid.core.get_cuda_device_count()
    else:
        devices_num = int(os.environ.get('CPU_NUM', 1))

    # Feed definitions default to '<architecture>TrainFeed' / 'EvalFeed'
    # unless the config provides its own.
    if 'train_feed' not in cfg:
        train_feed = create(main_arch + 'TrainFeed')
    else:
        train_feed = create(cfg.train_feed)

    if FLAGS.eval:
        if 'eval_feed' not in cfg:
            eval_feed = create(main_arch + 'EvalFeed')
        else:
            eval_feed = create(cfg.eval_feed)

    if 'FLAGS_selected_gpus' in env:
        device_id = int(env['FLAGS_selected_gpus'])
    else:
        device_id = 0
    place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    lr_builder = create('LearningRate')
    optim_builder = create('OptimizerBuilder')

    # build program
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = create(main_arch)
            train_pyreader, feed_vars = create_feed(train_feed)

            if FLAGS.fp16:
                assert (getattr(model.backbone, 'norm_type', None)
                        != 'affine_channel'), \
                    '--fp16 currently does not support affine channel, ' \
                    ' please modify backbone settings to use batch norm'

            with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx:
                train_fetches = model.train(feed_vars)

                loss = train_fetches['loss']
                # With fp16 the loss is multiplied by the loss-scale variable
                # before minimize() and divided by it again afterwards.
                if FLAGS.fp16:
                    loss *= ctx.get_loss_scale_var()
                lr = lr_builder()
                optimizer = optim_builder(lr)
                optimizer.minimize(loss)
                if FLAGS.fp16:
                    loss /= ctx.get_loss_scale_var()

    # parse train fetches
    train_keys, train_values, _ = parse_fetches(train_fetches)
    # The learning rate is fetched last; the training loop relies on this
    # when it splits outs[:-1] (stats) from outs[-1] (lr).
    train_values.append(lr)

    if FLAGS.eval:
        eval_prog = fluid.Program()
        with fluid.program_guard(eval_prog, startup_prog):
            with fluid.unique_name.guard():
                # NOTE: `model` is rebound here, so later reads of
                # model.backbone / model.mask_head use this eval instance.
                model = create(main_arch)
                eval_pyreader, feed_vars = create_feed(eval_feed)
                fetches = model.eval(feed_vars)
        eval_prog = eval_prog.clone(True)

        eval_reader = create_reader(eval_feed, args_path=FLAGS.dataset_dir)
        eval_pyreader.decorate_sample_list_generator(eval_reader, place)

        # parse eval fetches
        # Extra keys passed to parse_fetches depend on the configured metric.
        extra_keys = []
        if cfg.metric == 'COCO':
            extra_keys = ['im_info', 'im_id', 'im_shape']
        if cfg.metric == 'VOC':
            extra_keys = ['gt_box', 'gt_label', 'is_difficult']
        if cfg.metric == 'WIDERFACE':
            extra_keys = ['im_id', 'im_shape', 'gt_box']
        eval_keys, eval_values, eval_cls = parse_fetches(
            fetches, eval_prog, extra_keys)

    # compile program for multi-devices
    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_optimizer_ops = False
    build_strategy.fuse_elewise_add_act_ops = True
    # only enable sync_bn in multi GPU devices
    sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn'
    build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \
        and cfg.use_gpu

    exec_strategy = fluid.ExecutionStrategy()
    # iteration number when CompiledProgram tries to drop local execution scopes.
    # Set it to be 1 to save memory usages, so that unused variables in
    # local execution scopes can be deleted after each iteration.
    exec_strategy.num_iteration_per_drop_scope = 1
    if FLAGS.dist:
        dist_utils.prepare_for_multi_process(exe, build_strategy, startup_prog,
                                             train_prog)
        exec_strategy.num_threads = 1

    exe.run(startup_prog)
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    if FLAGS.eval:
        compiled_eval_prog = fluid.compiler.CompiledProgram(eval_prog)

    fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel'

    ignore_params = cfg.finetune_exclude_pretrained_params \
                 if 'finetune_exclude_pretrained_params' in cfg else []

    # Weight loading, in priority order: resume from a checkpoint; otherwise
    # load-and-fuse-BN (affine_channel backbone with nothing excluded);
    # otherwise a plain pretrained-weights load honouring ignore_params.
    start_iter = 0
    if FLAGS.resume_checkpoint:
        checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint)
        start_iter = checkpoint.global_step()
    elif cfg.pretrain_weights and fuse_bn and not ignore_params:
        checkpoint.load_and_fusebn(exe, train_prog, cfg.pretrain_weights)
    elif cfg.pretrain_weights:
        checkpoint.load_params(exe,
                               train_prog,
                               cfg.pretrain_weights,
                               ignore_params=ignore_params)

    # The second argument is (remaining iterations) * devices_num.
    # NOTE(review): presumably the number of batches the reader should
    # produce -- confirm against create_reader.
    train_reader = create_reader(train_feed,
                                 (cfg.max_iters - start_iter) * devices_num,
                                 FLAGS.dataset_dir)
    train_pyreader.decorate_sample_list_generator(train_reader, place)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False
    if hasattr(model, 'is_bbox_normalized') and \
            callable(model.is_bbox_normalized):
        is_bbox_normalized = model.is_bbox_normalized()

    # if map_type not set, use default 11point, only use in VOC eval
    map_type = cfg.map_type if 'map_type' in cfg else '11point'

    train_stats = TrainingStats(cfg.log_smooth_window, train_keys)
    train_pyreader.start()
    start_time = time.time()
    end_time = time.time()

    # Snapshots go to <cfg.save_dir>/<config file name up to its first dot>.
    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)
    # Sliding window of recent per-iteration wall times, used for the ETA.
    time_stat = deque(maxlen=cfg.log_smooth_window)
    best_box_ap_list = [0.0, 0]  # [best box AP so far, iteration it was seen]

    # use tb-paddle to log data
    if FLAGS.use_tb:
        from tb_paddle import SummaryWriter
        tb_writer = SummaryWriter(FLAGS.tb_log_dir)
        tb_loss_step = 0
        tb_mAP_step = 0

    for it in range(start_iter, cfg.max_iters):
        # Time between the starts of consecutive iterations.
        start_time = end_time
        end_time = time.time()
        time_stat.append(end_time - start_time)
        time_cost = np.mean(time_stat)
        eta_sec = (cfg.max_iters - it) * time_cost
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        outs = exe.run(compiled_train_prog, fetch_list=train_values)
        # outs[-1] is the learning rate appended above; the rest line up with
        # train_keys.
        stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])}

        # use tb-paddle to log loss
        if FLAGS.use_tb:
            if it % cfg.log_iter == 0:
                for loss_name, loss_value in stats.items():
                    tb_writer.add_scalar(loss_name, loss_value, tb_loss_step)
                tb_loss_step += 1

        train_stats.update(stats)
        logs = train_stats.log()
        if it % cfg.log_iter == 0 and (not FLAGS.dist or trainer_id == 0):
            strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format(
                it, np.mean(outs[-1]), logs, time_cost, eta)
            logger.info(strs)

        # Snapshot on every snapshot_iter-th iteration (except iteration 0)
        # and on the final iteration; only trainer 0 saves in distributed mode.
        if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1) \
           and (not FLAGS.dist or trainer_id == 0):
            save_name = str(it) if it != cfg.max_iters - 1 else "model_final"
            checkpoint.save(exe, train_prog, os.path.join(save_dir, save_name))

            if FLAGS.eval:
                # evaluation
                results = eval_run(exe, compiled_eval_prog, eval_pyreader,
                                   eval_keys, eval_values, eval_cls)
                resolution = None
                if 'mask' in results[0]:
                    resolution = model.mask_head.resolution
                box_ap_stats = eval_results(results, eval_feed, cfg.metric,
                                            cfg.num_classes, resolution,
                                            is_bbox_normalized,
                                            FLAGS.output_eval, map_type)

                # use tb_paddle to log mAP
                if FLAGS.use_tb:
                    tb_writer.add_scalar("mAP", box_ap_stats[0], tb_mAP_step)
                    tb_mAP_step += 1

                # Keep a separate "best_model" checkpoint for the highest box
                # AP seen so far.
                if box_ap_stats[0] > best_box_ap_list[0]:
                    best_box_ap_list[0] = box_ap_stats[0]
                    best_box_ap_list[1] = it
                    checkpoint.save(exe, train_prog,
                                    os.path.join(save_dir, "best_model"))
                logger.info("Best test box ap: {}, in iter: {}".format(
                    best_box_ap_list[0], best_box_ap_list[1]))

    train_pyreader.reset()
コード例 #19
0
ファイル: train_wandb.py プロジェクト: suyeH/PaddleSeg
def train(cfg):
    """Run the segmentation training loop described by ``cfg``.

    Builds the train and eval programs, feeds the train program from a
    ``SegDataset`` generator, then trains from the starting epoch through
    ``cfg.SOLVER.NUM_EPOCHS``.  On trainer 0 it periodically saves a
    checkpoint plus inference program, optionally evaluates the checkpoint,
    tracks the best mIoU and optionally visualizes results.  Command-line
    switches are read from the module-level ``args`` object, which is not a
    parameter of this function.

    Args:
        cfg: Project configuration object.  Fields read here include
            ``DATASET``, ``DATALOADER``, ``TRAIN``, ``SOLVER``, ``BATCH_SIZE``,
            ``NUM_TRAINERS`` and ``TRAINER_ID``.

    Returns:
        None.  Returns early (before any checkpoint is written) when
        ``args.is_profiler`` is set and the profiling window has finished.

    Raises:
        AssertionError: If ``cfg.BATCH_SIZE`` is not divisible by the device
            count.
        ValueError: If the starting epoch is greater than
            ``cfg.SOLVER.NUM_EPOCHS``.
        SystemExit: If ``args.use_vdl`` is set but ``args.vdl_log_dir`` is
            empty.
    """
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    if args.enable_ce:
        # Pin the program seeds when args.enable_ce is set.
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000
    drop_last = True

    dataset = SegDataset(file_list=cfg.DATASET.TRAIN_FILE_LIST,
                         mode=ModelPhase.TRAIN,
                         shuffle=True,
                         data_dir=cfg.DATASET.DATA_DIR)

    def data_generator():
        """Yield ``(item[0], item[1], item[2])`` samples from the dataset.

        Samples are buffered and released in groups of
        ``cfg.BATCH_SIZE // cfg.NUM_TRAINERS``; the trailing partial group is
        only released when sync batch norm is disabled.
        """
        if args.use_mpio:
            data_gen = dataset.multiprocess_generator(
                num_processes=cfg.DATALOADER.NUM_WORKERS,
                max_queue_size=cfg.DATALOADER.BUF_SIZE)
        else:
            data_gen = dataset.generator()

        batch_data = []
        for b in data_gen:
            batch_data.append(b)
            if len(batch_data) == (cfg.BATCH_SIZE // cfg.NUM_TRAINERS):
                for item in batch_data:
                    yield item[0], item[1], item[2]
                batch_data = []
        # If the sync batch norm strategy is used, drop the last batch when the
        # number of samples in batch_data is less than cfg.BATCH_SIZE, to avoid
        # NCCL hang issues.
        if not cfg.TRAIN.SYNC_BATCH_NORM:
            for item in batch_data:
                yield item[0], item[1], item[2]

    # Get device environment
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()

    # Device count: the trainer count in multi-process mode, otherwise the
    # number of visible places.
    dev_count = cfg.NUM_TRAINERS if cfg.NUM_TRAINERS > 1 else len(places)
    print_info("#Device count: {}".format(dev_count))

    # Make sure BATCH_SIZE can be divided evenly across the devices
    assert cfg.BATCH_SIZE % dev_count == 0, (
        'BATCH_SIZE:{} not divisble by number of GPUs:{}'.format(
            cfg.BATCH_SIZE, dev_count))
    # In multi-GPU training mode the batch is split evenly across the devices
    batch_size_per_dev = cfg.BATCH_SIZE // dev_count
    print_info("batch_size_per_dev: {}".format(batch_size_per_dev))

    data_loader, avg_loss, lr, pred, grts, masks = build_model(
        train_prog, startup_prog, phase=ModelPhase.TRAIN)
    # The eval program is built against a throwaway startup program; it is only
    # passed to save_infer_program() below.
    build_model(test_prog, fluid.Program(), phase=ModelPhase.EVAL)
    data_loader.set_sample_generator(data_generator,
                                     batch_size=batch_size_per_dev,
                                     drop_last=drop_last)

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    exec_strategy = fluid.ExecutionStrategy()
    if args.use_gpu:
        exec_strategy.num_threads = fluid.core.get_cuda_device_count()
    # Clear temporary variables every 100 iterations
    exec_strategy.num_iteration_per_drop_scope = 100
    build_strategy = fluid.BuildStrategy()

    if cfg.NUM_TRAINERS > 1 and args.use_gpu:
        dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog)
        exec_strategy.num_threads = 1

    if cfg.TRAIN.SYNC_BATCH_NORM and args.use_gpu:
        if dev_count > 1:
            # Apply sync batch norm strategy
            print_info("Sync BatchNorm strategy is effective.")
            build_strategy.sync_batch_norm = True
        else:
            print_info(
                "Sync BatchNorm strategy will not be effective if GPU device"
                " count <= 1")
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=avg_loss.name,
        exec_strategy=exec_strategy,
        build_strategy=build_strategy)

    # Resume training
    begin_epoch = cfg.SOLVER.BEGIN_EPOCH
    if cfg.TRAIN.RESUME_MODEL_DIR:
        begin_epoch = load_checkpoint(exe, train_prog)
    # Load pretrained model
    elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL_DIR):
        load_pretrained_weights(exe, train_prog,
                                cfg.TRAIN.PRETRAINED_MODEL_DIR)
    else:
        print_info(
            'Pretrained model dir {} not exists, training from scratch...'.
            format(cfg.TRAIN.PRETRAINED_MODEL_DIR))

    # The fetch list stores variable *names*, so the graph variables can be
    # safely rebound to fetched values inside the loop below.
    fetch_list = [avg_loss.name, lr.name]
    if args.debug:
        # Fetch more variable info and use a streaming confusion matrix to
        # calculate IoU results if in debug mode
        np.set_printoptions(precision=4,
                            suppress=True,
                            linewidth=160,
                            floatmode="fixed")
        fetch_list.extend([pred.name, grts.name, masks.name])
        cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True)

    if args.use_vdl:
        if not args.vdl_log_dir:
            print_info("Please specify the log directory by --vdl_log_dir.")
            # sys.exit instead of the site-provided exit(), which is meant for
            # the interactive shell and is not guaranteed to exist.
            sys.exit(1)

        from visualdl import LogWriter
        log_writer = LogWriter(args.vdl_log_dir)

    # trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))
    # num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
    step = 0
    # Total number of steps, used only for the ETA estimate in the logs.
    all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE
    if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and not drop_last:
        all_step += 1
    all_step *= (cfg.SOLVER.NUM_EPOCHS - begin_epoch + 1)

    # From here on avg_loss is the running loss accumulator, no longer the
    # graph variable returned by build_model().
    avg_loss = 0.0
    best_mIoU = 0.0

    timer = Timer()
    timer.start()
    if begin_epoch > cfg.SOLVER.NUM_EPOCHS:
        raise ValueError((
            "begin epoch[{}] is larger than cfg.SOLVER.NUM_EPOCHS[{}]").format(
                begin_epoch, cfg.SOLVER.NUM_EPOCHS))

    if args.use_mpio:
        print_info("Use multiprocess reader")
    else:
        print_info("Use multi-thread reader")

    for epoch in range(begin_epoch, cfg.SOLVER.NUM_EPOCHS + 1):
        data_loader.start()
        while True:
            try:
                if args.debug:
                    # Print category IoU and accuracy to check whether the
                    # training process corresponds to expectation
                    loss, lr, pred, grts, masks = exe.run(
                        program=compiled_train_prog,
                        fetch_list=fetch_list,
                        return_numpy=True)
                    cm.calculate(pred, grts, masks)
                    avg_loss += np.mean(np.array(loss))
                    step += 1

                    if step % args.log_steps == 0:
                        speed = args.log_steps / timer.elapsed_time()
                        avg_loss /= args.log_steps
                        category_acc, mean_acc = cm.accuracy()
                        category_iou, mean_iou = cm.mean_iou()

                        print_info((
                            "epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, step, lr[0], avg_loss, mean_acc,
                                 mean_iou, speed,
                                 calculate_eta(all_step - step, speed)))
                        print_info("Category IoU: ", category_iou)
                        print_info("Category Acc: ", category_acc)
                        if args.use_vdl:
                            log_writer.add_scalar('Train/mean_iou', mean_iou,
                                                  step)
                            log_writer.add_scalar('Train/mean_acc', mean_acc,
                                                  step)
                            log_writer.add_scalar('Train/loss', avg_loss, step)
                            log_writer.add_scalar('Train/lr', lr[0], step)
                            log_writer.add_scalar('Train/step/sec', speed,
                                                  step)

                            # NOTE(review): this logs the raw fetched `loss`
                            # of the last step, not the averaged loss, and
                            # wandb logging is tied to args.use_vdl - confirm
                            # both are intended.
                            wandb.log({'epoch': epoch, 'loss': loss})
                        sys.stdout.flush()
                        # Start a fresh logging window.
                        avg_loss = 0.0
                        cm.zero_matrix()
                        timer.restart()
                else:
                    # If not in debug mode, avoid unnecessary logging and
                    # calculation
                    loss, lr = exe.run(program=compiled_train_prog,
                                       fetch_list=fetch_list,
                                       return_numpy=True)
                    avg_loss += np.mean(np.array(loss))
                    step += 1

                    if step % args.log_steps == 0 and cfg.TRAINER_ID == 0:
                        avg_loss /= args.log_steps
                        speed = args.log_steps / timer.elapsed_time()
                        print((
                            "epoch={} step={} lr={:.5f} loss={:.4f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, step, lr[0], avg_loss, speed,
                                 calculate_eta(all_step - step, speed)))
                        if args.use_vdl:
                            log_writer.add_scalar('Train/loss', avg_loss, step)
                            log_writer.add_scalar('Train/lr', lr[0], step)
                            log_writer.add_scalar('Train/speed', speed, step)

                            wandb.log(
                                {
                                    'Train/loss': avg_loss,
                                    'Train/lr': lr[0],
                                    'Train/speed': speed
                                },
                                step=step)
                        sys.stdout.flush()
                        # Start a fresh logging window.
                        avg_loss = 0.0
                        timer.restart()

                    # NOTE : used for benchmark, profiler tools.  Profiles the
                    # five steps following the first logging window of epoch 1
                    # and then returns without saving anything.
                    if args.is_profiler and epoch == 1 and step == args.log_steps:
                        profiler.start_profiler("All")
                    elif args.is_profiler and epoch == 1 and step == args.log_steps + 5:
                        profiler.stop_profiler("total", args.profiler_path)
                        return

            except fluid.core.EOFException:
                # The loader is exhausted: this epoch is finished.
                data_loader.reset()
                break
            except Exception as e:
                # NOTE(review): any other error is only printed and the loop
                # keeps going; a persistent failure would repeat forever.
                # Kept as is to preserve the existing best-effort behavior.
                print(e)

        if (epoch % cfg.TRAIN.SNAPSHOT_EPOCH == 0
                or epoch == cfg.SOLVER.NUM_EPOCHS) and cfg.TRAINER_ID == 0:
            ckpt_dir = save_checkpoint(train_prog, epoch)
            save_infer_program(test_prog, ckpt_dir)

            if args.do_eval:
                print("Evaluation start")
                _, mean_iou, _, mean_acc = evaluate(cfg=cfg,
                                                    ckpt_dir=ckpt_dir,
                                                    use_gpu=args.use_gpu,
                                                    use_mpio=args.use_mpio)
                if args.use_vdl:
                    log_writer.add_scalar('Evaluate/mean_iou', mean_iou, step)
                    log_writer.add_scalar('Evaluate/mean_acc', mean_acc, step)
                    wandb.log(
                        {
                            'Evaluate/mean_iou': mean_iou,
                            'Evaluate/mean_acc': mean_acc
                        },
                        step=step)

                if mean_iou > best_mIoU:
                    best_mIoU = mean_iou
                    update_best_model(ckpt_dir)
                    print_info(
                        "Save best model {} to {}, mIoU = {:.4f}".format(
                            ckpt_dir,
                            os.path.join(cfg.TRAIN.MODEL_SAVE_DIR,
                                         'best_model'), mean_iou))

            # Use VisualDL to visualize results
            if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None:
                visualize(cfg=cfg,
                          use_gpu=args.use_gpu,
                          vis_file_list=cfg.DATASET.VIS_FILE_LIST,
                          vis_dir="visual",
                          ckpt_dir=ckpt_dir,
                          log_writer=log_writer)

    # save final model
    if cfg.TRAINER_ID == 0:
        ckpt_dir = save_checkpoint(train_prog, 'final')
        save_infer_program(test_prog, ckpt_dir)
コード例 #20
0
def main(args):
    """Train and/or evaluate an ERNIE sequence-labelling model.

    Depending on ``args.do_train``, ``args.do_val`` and ``args.do_test`` this
    builds the train and/or test programs, initializes parameters from a
    checkpoint or pretraining parameters, runs the training loop with periodic
    logging, checkpointing and evaluation, and finally evaluates (and
    optionally predicts) on the dev and test sets.

    Args:
        args: Parsed command-line arguments.  Attributes read here include the
            model/data paths, ``use_cuda``, ``do_train``/``do_val``/``do_test``/
            ``do_predict``, batch/epoch/step settings, optimizer settings and
            the logging options ``save_log``/``log_path``.

    Raises:
        ValueError: If none of ``do_train``, ``do_val``, ``do_test`` is set, or
            if only validation/testing is requested without
            ``args.init_checkpoint``.
        FileExistsError: If logging to a file is requested and
            ``args.log_path`` already exists.
    """
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    reader = task_reader.SequenceLabelReader(
        vocab_path=args.vocab_path,
        label_map_config=args.label_map_config,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        # With in_tokens the batch size is divided by max_seq_len to get the
        # per-step example count used for the step estimate.
        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config)
                scheduled_lr = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    loss_scaling=args.loss_scaling)

                # Keep the fetched variables out of memory optimization.
                fluid.memory_optimize(
                    input_program=train_program,
                    skip_opt_set=[
                        graph_vars["loss"].name, graph_vars["labels"].name,
                        graph_vars["infers"].name, graph_vars["seq_lens"].name
                    ])

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                # NOTE(review): this rebinds graph_vars, so later uses (also in
                # the training path) refer to the test program's variables;
                # presumably the names match the train program's because both
                # are built under unique_name.guard() - confirm.
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config)

        test_prog = test_prog.clone(for_test=True)

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(
                exe,
                args.init_checkpoint,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(
                exe,
                args.init_pretraining_params,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            # The two literals are concatenated, so the trailing space is
            # required to keep "if" and "only" apart in the message.
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing validation or testing!")
        init_checkpoint(
            exe,
            args.init_checkpoint,
            main_program=startup_prog,
            use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            loss_name=graph_vars["loss"].name,
            exec_strategy=exec_strategy,
            main_program=train_program)

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    if args.do_val or args.do_test:
        # NOTE(review): test_exe is never used below (evaluation runs through
        # `exe`); kept because constructing it may have side effects.
        test_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            main_program=test_prog,
            share_vars_from=train_exe)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

        if args.save_log and args.log_path:
            # Refuse to overwrite an existing log file.
            if os.path.exists(args.log_path):
                raise FileExistsError("Logging file already exists!")
            with open(args.log_path, 'w') as logfile:
                logfile.write('%s\n' % time.asctime())
            print('Writing logs into %s' % args.log_path)

        time_begin = time.time()
        while True:
            try:
                steps += 1
                # Only every skip_steps-th step fetches metrics; the others
                # just run the program.
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[])
                else:
                    outputs = evaluate(train_exe, train_program, train_pyreader,
                                       graph_vars, args.num_labels, "train",
                                       dev_count)
                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            outputs["lr"]
                            if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

                    current_example, current_epoch = reader.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                          "f1: %f, precision: %f, recall: %f, speed: %f steps/s"
                          % (current_epoch, current_example, num_train_examples,
                             steps, outputs["loss"], outputs["f1"],
                             outputs["precision"], outputs["recall"],
                             args.skip_steps / used_time))

                    if args.save_log and args.log_path:
                        with open(args.log_path, 'a') as logfile:
                            logfile.write("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                                          "f1: %f, precision: %f, recall: %f\n" % (
                                              current_epoch, current_example, num_train_examples,
                                              steps, outputs["loss"], outputs["f1"],
                                              outputs["precision"], outputs["recall"]))

                    # Restart the timing window for the next speed report.
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        test_pyreader.decorate_tensor_provider(
                            reader.data_generator(
                                args.dev_set,
                                batch_size=args.batch_size,
                                epoch=1,
                                shuffle=False))
                        evaluate(exe, test_prog, test_pyreader, graph_vars,
                                 args.num_labels, "dev")
                    # evaluate test set
                    if args.do_test:
                        test_pyreader.decorate_tensor_provider(
                            reader.data_generator(
                                args.test_set,
                                batch_size=args.batch_size,
                                epoch=1,
                                shuffle=False))
                        evaluate(exe, test_prog, test_pyreader, graph_vars,
                                 args.num_labels, "test")

            except fluid.core.EOFException:
                # Training data exhausted: save a last checkpoint and stop.
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(
                args.dev_set,
                batch_size=args.batch_size,
                epoch=1,
                shuffle=False))
        print("Final validation result:")
        evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels,
                 "dev")
        if args.do_predict:
            # NOTE(review): predict() is tagged "test" here although the reader
            # was fed the dev set, and it writes to the same output directory
            # as the test-set prediction below - confirm this is intended.
            print("Saving predicted results...")
            predict(exe, test_prog, test_pyreader, graph_vars, args.label_map_config,
                    "test", output_dir="./predicted_results")

    # final eval on test set
    if args.do_test:
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(
                args.test_set,
                batch_size=args.batch_size,
                epoch=1,
                shuffle=False))
        print("Final test result:")
        evaluate(exe, test_prog, test_pyreader, graph_vars, args.num_labels,
                 "test")
        if args.do_predict:
            print("Saving predicted results...")
            predict(exe, test_prog, test_pyreader, graph_vars, args.label_map_config,
                    "test", output_dir="./predicted_results")
コード例 #21
0
ファイル: train.py プロジェクト: zhousanfu/paddle-demo
def train(args):
    """Run BERT pretraining for up to ``args.num_train_steps`` steps.

    Builds the train and test programs, optionally transpiles the train
    program for NCCL2 distributed training, initializes from a checkpoint if
    one is given, then loops over batches.  Trainer 0 periodically logs
    metrics, saves checkpoints and, when a validation set is configured, runs
    validation; all other trainers only execute the training step.

    Args:
        args: Parsed command-line arguments.  Attributes read here include the
            BERT config/vocab/data paths, optimizer and loss-scaling settings,
            ``use_cuda``, ``is_distributed``, ``init_checkpoint``,
            ``validation_set_dir`` and the ``skip_steps``/``save_steps``/
            ``validation_steps`` intervals.

    Raises:
        ValueError: If distributed training is requested on Windows.
    """
    print("pretraining start")
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    train_program = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            train_data_loader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                bert_config=bert_config)
            scheduled_lr, loss_scaling = optimization(
                loss=total_loss,
                warmup_steps=args.warmup_steps,
                num_train_steps=args.num_train_steps,
                learning_rate=args.learning_rate,
                train_program=train_program,
                startup_prog=startup_prog,
                weight_decay=args.weight_decay,
                scheduler=args.lr_scheduler,
                use_fp16=args.use_fp16,
                use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                init_loss_scaling=args.init_loss_scaling,
                incr_every_n_steps=args.incr_every_n_steps,
                decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                incr_ratio=args.incr_ratio,
                decr_ratio=args.decr_ratio)

    test_prog = fluid.Program()
    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            # NOTE(review): this rebinds next_sent_acc / mask_lm_loss /
            # total_loss to the test program's variables; their names are used
            # for the training fetches below, presumably identical because
            # both programs are built under unique_name.guard() - confirm.
            test_data_loader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                bert_config=bert_config)

    test_prog = test_prog.clone(for_test=True)

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    print("Device count %d" % dev_count)

    # Single-trainer defaults; overwritten below in distributed mode.
    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    print("args.is_distributed:", args.is_distributed)
    if args.is_distributed:
        # The trainer id is this process's position in the comma-separated
        # "worker_endpoints" environment variable.
        worker_endpoints_env = os.getenv("worker_endpoints")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)
        current_endpoint = os.getenv("current_endpoint")
        trainer_id = worker_endpoints.index(current_endpoint)
        if trainer_id == 0:
            print("train_id == 0, sleep 60s")
            time.sleep(60)
        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
              trainer_id:{}".format(worker_endpoints, trainers_num,
                                    current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_program,
                    startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.init_checkpoint:
        init_checkpoint(exe, args.init_checkpoint, train_program,
                        args.use_fp16)

    data_reader = DataReader(data_dir=args.data_dir,
                             batch_size=args.batch_size,
                             in_tokens=args.in_tokens,
                             vocab_path=args.vocab_path,
                             voc_size=bert_config['vocab_size'],
                             epoch=args.epoch,
                             max_seq_len=args.max_seq_len,
                             generate_neg_sample=args.generate_neg_sample)

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.use_experimental_executor = args.use_fast_executor
    exec_strategy.num_threads = dev_count
    exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

    build_strategy = fluid.BuildStrategy()
    if sys.platform != "win32":
        build_strategy.num_trainers = nccl2_num_trainers
    elif nccl2_num_trainers > 1:
        raise ValueError(
            "Windows platform doesn't support distributed training!")
    build_strategy.trainer_id = nccl2_trainer_id
    # use_ngraph is for CPU only, please refer to README_ngraph.md for details
    use_ngraph = os.getenv('FLAGS_use_ngraph')
    if not use_ngraph:
        # With nGraph the plain train_program is executed instead, so the
        # compiled program is only built in the non-nGraph case.
        train_compiled_program = fluid.CompiledProgram(
            train_program).with_data_parallel(loss_name=total_loss.name,
                                              exec_strategy=exec_strategy,
                                              build_strategy=build_strategy)

    if args.validation_set_dir:
        predict = predict_wrapper(args,
                                  exe,
                                  bert_config,
                                  test_prog=test_prog,
                                  data_loader=test_data_loader,
                                  fetch_list=[
                                      next_sent_acc.name, mask_lm_loss.name,
                                      total_loss.name
                                  ])

    train_data_loader.set_batch_generator(data_reader.data_generator())
    train_data_loader.start()
    steps = 0
    # Metrics accumulated since the last log line.
    cost = []
    lm_cost = []
    acc = []
    time_begin = time.time()
    while steps < args.num_train_steps:
        try:
            steps += 1
            # Only used for the speed figure in the log line below.
            skip_steps = args.skip_steps * nccl2_num_trainers

            # Non-zero trainers only run the step; they skip logging, saving
            # and validation.
            if nccl2_trainer_id != 0:
                if use_ngraph:
                    exe.run(fetch_list=[], program=train_program)
                else:
                    exe.run(fetch_list=[], program=train_compiled_program)
                continue

            if steps % args.skip_steps != 0:
                if use_ngraph:
                    exe.run(fetch_list=[], program=train_program)
                else:
                    exe.run(fetch_list=[], program=train_compiled_program)

            else:
                fetch_list = [
                    next_sent_acc.name, mask_lm_loss.name, total_loss.name,
                    scheduled_lr.name
                ]
                if args.use_fp16:
                    fetch_list.append(loss_scaling.name)

                if use_ngraph:
                    outputs = exe.run(fetch_list=fetch_list,
                                      program=train_program)
                else:
                    outputs = exe.run(fetch_list=fetch_list,
                                      program=train_compiled_program)

                if args.use_fp16:
                    each_next_acc, each_mask_lm_cost, each_total_cost, np_lr, np_scaling = outputs
                else:
                    each_next_acc, each_mask_lm_cost, each_total_cost, np_lr = outputs

                acc.extend(each_next_acc)
                lm_cost.extend(each_mask_lm_cost)
                cost.extend(each_total_cost)

                time_end = time.time()
                used_time = time_end - time_begin
                epoch, current_file_index, total_file, current_file = data_reader.get_progress(
                )
                if args.verbose:
                    verbose = "feed_queue size: %d, " % train_data_loader.queue.size(
                    )
                    verbose += "current learning_rate: %f, " % np_lr[0]
                    if args.use_fp16:
                        verbose += "loss scaling: %f" % np_scaling[0]
                    print(verbose)

                print(
                    "epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                    "ppl: %f, next_sent_acc: %f, speed: %f steps/s, file: %s" %
                    (epoch, current_file_index, total_file, steps,
                     np.mean(np.array(cost)), np.mean(np.exp(
                         np.array(lm_cost))), np.mean(np.array(acc)),
                     skip_steps / used_time, current_file))
                # Reset the accumulators and the timing window.
                cost = []
                lm_cost = []
                acc = []
                time_begin = time.time()

            if steps % args.save_steps == 0:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.save(program=train_program, model_path=save_path)

            if args.validation_set_dir and steps % args.validation_steps == 0:
                # Refresh the epoch here: it is otherwise only bound in the
                # logging branch above, so it would be undefined (NameError)
                # if validation fires before the first logged step, and stale
                # whenever the two intervals do not coincide.
                epoch = data_reader.get_progress()[0]
                vali_cost, vali_lm_cost, vali_acc, vali_steps, vali_speed = predict(
                )
                print("[validation_set] epoch: %d, step: %d, "
                      "loss: %f, global ppl: %f, batch-averged ppl: %f, "
                      "next_sent_acc: %f, speed: %f steps/s" %
                      (epoch, steps, np.mean(np.array(vali_cost) / vali_steps),
                       np.exp(np.mean(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.exp(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.array(vali_acc) / vali_steps), vali_speed))

        except fluid.core.EOFException:
            # Training data exhausted before num_train_steps was reached.
            train_data_loader.reset()
            break
コード例 #22
0
ファイル: run_classifier.py プロジェクト: wbsjg/ERNIE
def main(args):
    """Fine-tune and/or evaluate an ERNIE classifier or regressor.

    Depending on ``args.do_train``, ``args.do_val`` and ``args.do_test`` this
    builds the train and/or test programs, optionally transpiles them for
    nccl2 distributed execution, restores parameters, runs the training loop
    with periodic checkpointing and evaluation, and finally evaluates on the
    dev, test and (optional) diagnostic sets.

    Args:
        args: Parsed run options, read as attributes (``use_cuda``,
            ``batch_size``, ``init_checkpoint``, ...).

    Raises:
        ValueError: If none of ``do_train``/``do_val``/``do_test`` is set, or
            if only evaluation is requested without ``init_checkpoint``.
        AssertionError: If ``do_test`` is set without ``test_save``.
    """
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        dev_list = fluid.cuda_places()
        place = dev_list[0]
        dev_count = len(dev_list)
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    reader = task_reader.ClassifyReader(
        vocab_path=args.vocab_path,
        label_map_config=args.label_map_config,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed,
        tokenizer=args.tokenizer,
        is_classify=args.is_classify,
        is_regression=args.is_regression,
        for_cn=args.for_cn,
        task_id=args.task_id)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    if args.do_test:
        assert args.test_save is not None
    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.predict_batch_size is None:
        args.predict_batch_size = args.batch_size
    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=dev_count,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        # With in_tokens the batch size is a token budget, so the number of
        # examples per batch is batch_size // max_seq_len.
        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        log.info("Device count: %d" % dev_count)
        log.info("Num train examples: %d" % num_train_examples)
        log.info("Max train steps: %d" % max_train_steps)
        log.info("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        if args.random_seed is not None and args.enable_ce:
            train_program.random_seed = args.random_seed

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config,
                    is_classify=args.is_classify,
                    is_regression=args.is_regression)
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            log.info("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                # graph_vars is rebound here, so from now on it refers to the
                # test graph whenever evaluation is enabled.
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config,
                    is_classify=args.is_classify,
                    is_regression=args.is_regression)

        test_prog = test_prog.clone(for_test=True)
    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    if args.is_distributed:
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)

        log.info("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
              trainer_id:{}".format(worker_endpoints, trainers_num,
                                    current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(
            trainer_id,
            trainers=worker_endpoints_env,
            current_endpoint=current_endpoint,
            program=train_program if args.do_train else test_prog,
            startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            log.warning(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(
                exe,
                args.init_checkpoint,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(
                exe,
                args.init_pretraining_params,
                main_program=startup_prog,
                use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing validation or testing!")
        init_checkpoint(
            exe,
            args.init_checkpoint,
            main_program=startup_prog,
            use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_cuda,
            loss_name=graph_vars["loss"].name,
            exec_strategy=exec_strategy,
            main_program=train_program,
            num_trainers=nccl2_num_trainers,
            trainer_id=nccl2_trainer_id)

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    test_exe = exe
    if args.do_val or args.do_test:
        if args.use_multi_gpu_test:
            test_exe = fluid.ParallelExecutor(
                use_cuda=args.use_cuda,
                main_program=test_prog,
                share_vars_from=train_exe)

    # Bound before the training branch so the final evaluation below also
    # works for eval/test-only runs, where no training step is ever taken.
    steps = 0
    current_epoch = 0

    if args.do_train:
        train_pyreader.start()
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

        ce_info = []
        time_begin = time.time()
        last_epoch = 0
        while True:
            try:
                steps += 1
                # Only fetch metrics every skip_steps steps; other steps run
                # without fetching anything.
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[])
                else:
                    outputs = evaluate(
                        train_exe,
                        train_program,
                        train_pyreader,
                        graph_vars,
                        "train",
                        metric=args.metric,
                        is_classify=args.is_classify,
                        is_regression=args.is_regression)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                        log.info(verbose)

                    current_example, current_epoch = reader.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin

                    if args.is_classify:
                        log.info(
                            "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                            "ave acc: %f, speed: %f steps/s" %
                            (current_epoch, current_example, num_train_examples,
                             steps, outputs["loss"], outputs["accuracy"],
                             args.skip_steps / used_time))
                        ce_info.append(
                            [outputs["loss"], outputs["accuracy"], used_time])
                    if args.is_regression:
                        log.info(
                            "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                            " speed: %f steps/s" %
                            (current_epoch, current_example, num_train_examples,
                             steps, outputs["loss"],
                             args.skip_steps / used_time))
                    time_begin = time.time()

                # Only trainer 0 writes checkpoints and runs evaluation.
                if nccl2_trainer_id == 0:
                    if steps % args.save_steps == 0:
                        save_path = os.path.join(args.checkpoints,
                                                 "step_" + str(steps))
                        fluid.io.save_persistables(exe, save_path, train_program)

                    if steps % args.validation_steps == 0 or last_epoch != current_epoch:
                        # evaluate dev set
                        if args.do_val:
                            evaluate_wrapper(args, reader, exe, test_prog,
                                             test_pyreader, graph_vars,
                                             current_epoch, steps)

                        if args.do_test:
                            predict_wrapper(args, reader, exe, test_prog,
                                            test_pyreader, graph_vars,
                                            current_epoch, steps)

                if last_epoch != current_epoch:
                    last_epoch = current_epoch

            except fluid.core.EOFException:
                # The reader is exhausted: save a final checkpoint and stop.
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break
        if args.enable_ce:
            card_num = get_cards()
            ce_loss = 0
            ce_acc = 0
            ce_time = 0
            try:
                # Report the second-to-last logged entry; there may be fewer
                # than two entries when training was very short.
                ce_loss, ce_acc, ce_time = ce_info[-2]
            except IndexError:
                log.info("ce info error")
            log.info("kpis\ttrain_duration_card%s\t%s" % (card_num, ce_time))
            log.info("kpis\ttrain_loss_card%s\t%f" % (card_num, ce_loss))
            log.info("kpis\ttrain_acc_card%s\t%f" % (card_num, ce_acc))

    # final eval on dev set
    if args.do_val:
        evaluate_wrapper(args, reader, exe, test_prog, test_pyreader,
                         graph_vars, current_epoch, steps)

    # final eval on test set
    if args.do_test:
        predict_wrapper(args, reader, exe, test_prog, test_pyreader, graph_vars,
                        current_epoch, steps)

    # final eval on diagnostic, hack for glue-ax
    # NOTE(review): test_pyreader/test_prog only exist when do_val or do_test
    # is set; a train-only run with a diagnostic set would fail here. Confirm
    # the intended usage.
    if args.diagnostic:
        test_pyreader.decorate_tensor_provider(
            reader.data_generator(
                args.diagnostic,
                batch_size=args.batch_size,
                epoch=1,
                dev_count=1,
                shuffle=False))

        log.info("Final diagnostic")
        qids, preds, probs = predict(
            test_exe,
            test_prog,
            test_pyreader,
            graph_vars,
            is_classify=args.is_classify,
            is_regression=args.is_regression)
        assert len(qids) == len(preds), '{} v.s. {}'.format(
            len(qids), len(preds))
        with open(args.diagnostic_save, 'w') as f:
            for qid, s, p in zip(qids, preds, probs):
                f.write('{}\t{}\t{}\n'.format(qid, s, p))

        log.info("Done final diagnostic, saving to {}".format(
            args.diagnostic_save))
コード例 #23
0
def main():
    """Train a pruned student detector with distillation from a teacher.

    Builds the student train/eval programs from ``FLAGS.config``, builds the
    teacher program from ``FLAGS.teacher_config`` and merges it into the
    default main program, adds the distillation loss, prunes the parameters
    named in ``FLAGS.pruned_params`` by ``FLAGS.pruned_ratios``, then runs the
    training loop with periodic snapshotting and evaluation.

    Raises:
        AssertionError: If ``teacher_pretrained`` or ``pruned_params`` is not
            set, ``use_fine_grained_loss`` is disabled, the pruned params and
            ratios differ in length, a ratio is outside (0, 1), or the prune
            criterion is unsupported.
    """
    env = os.environ
    cfg = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    check_config(cfg)
    # check if set use_gpu=True in paddlepaddle cpu version
    check_gpu(cfg.use_gpu)
    check_version()

    main_arch = cfg.architecture

    if cfg.use_gpu:
        devices_num = fluid.core.get_cuda_device_count()
    else:
        devices_num = int(os.environ.get('CPU_NUM', 1))

    if 'FLAGS_selected_gpus' in env:
        device_id = int(env['FLAGS_selected_gpus'])
    else:
        device_id = 0
    place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    # build program
    model = create(main_arch)
    inputs_def = cfg['TrainReader']['inputs_def']
    train_feed_vars, train_loader = model.build_inputs(**inputs_def)
    train_fetches = model.train(train_feed_vars)
    loss = train_fetches['loss']

    start_iter = 0
    train_reader = create_reader(cfg.TrainReader,
                                 (cfg.max_iters - start_iter) * devices_num,
                                 cfg)
    # When iterable mode, set set_sample_list_generator(train_reader, place)
    train_loader.set_sample_list_generator(train_reader)

    eval_prog = fluid.Program()
    with fluid.program_guard(eval_prog, fluid.default_startup_program()):
        with fluid.unique_name.guard():
            model = create(main_arch)
            inputs_def = cfg['EvalReader']['inputs_def']
            test_feed_vars, eval_loader = model.build_inputs(**inputs_def)
            fetches = model.eval(test_feed_vars)
    eval_prog = eval_prog.clone(True)

    eval_reader = create_reader(cfg.EvalReader)
    # When iterable mode, set set_sample_list_generator(eval_reader, place)
    eval_loader.set_sample_list_generator(eval_reader)

    teacher_cfg = load_config(FLAGS.teacher_config)
    merge_config(FLAGS.opt)
    teacher_arch = teacher_cfg.architecture
    teacher_program = fluid.Program()
    teacher_startup_program = fluid.Program()

    with fluid.program_guard(teacher_program, teacher_startup_program):
        with fluid.unique_name.guard():
            # The teacher consumes clones of the student's feed variables.
            teacher_feed_vars = OrderedDict()
            for name, var in train_feed_vars.items():
                teacher_feed_vars[name] = teacher_program.global_block(
                )._clone_variable(var, force_persistable=False)
            # NOTE: `model` is rebound to the teacher here; the later
            # `model.yolo_head` / `model.backbone` lookups use this object.
            model = create(teacher_arch)
            train_fetches = model.train(teacher_feed_vars)
            teacher_loss = train_fetches['loss']

    exe.run(teacher_startup_program)
    assert FLAGS.teacher_pretrained, "teacher_pretrained should be set"
    checkpoint.load_params(exe, teacher_program, FLAGS.teacher_pretrained)
    teacher_program = teacher_program.clone(for_test=True)

    target_number = len(model.yolo_head.anchor_masks)

    data_name_map = {
        'image': 'image',
        'gt_bbox': 'gt_bbox',
        'gt_class': 'gt_class',
        'gt_score': 'gt_score'
    }
    for i in range(target_number):
        data_name_map['target{}'.format(i)] = 'target{}'.format(i)

    merge(teacher_program, fluid.default_main_program(), data_name_map, place)

    # Hard-coded variable names, one group per output target.
    output_names = [
        [
            'strided_slice_0.tmp_0', 'strided_slice_1.tmp_0',
            'strided_slice_2.tmp_0', 'strided_slice_3.tmp_0',
            'strided_slice_4.tmp_0', 'transpose_0.tmp_0'
        ],
        [
            'strided_slice_5.tmp_0', 'strided_slice_6.tmp_0',
            'strided_slice_7.tmp_0', 'strided_slice_8.tmp_0',
            'strided_slice_9.tmp_0', 'transpose_2.tmp_0'
        ],
        [
            'strided_slice_10.tmp_0', 'strided_slice_11.tmp_0',
            'strided_slice_12.tmp_0', 'strided_slice_13.tmp_0',
            'strided_slice_14.tmp_0', 'transpose_4.tmp_0'
        ],
    ]

    yolo_output_names = []
    for i in range(target_number):
        yolo_output_names.extend(output_names[i])

    assert cfg.use_fine_grained_loss, \
        "Only support use_fine_grained_loss=True, Please set it in config file or '-o use_fine_grained_loss=true'"
    distill_loss = split_distill(yolo_output_names, 1000, target_number)
    loss = distill_loss + loss
    lr_builder = create('LearningRate')
    optim_builder = create('OptimizerBuilder')
    lr = lr_builder()
    opt = optim_builder(lr)
    opt.minimize(loss)

    exe.run(fluid.default_startup_program())
    checkpoint.load_params(exe, fluid.default_main_program(),
                           cfg.pretrain_weights)


    assert FLAGS.pruned_params is not None, \
        "FLAGS.pruned_params is empty!!! Please set it by '--pruned_params' option."
    pruned_params = FLAGS.pruned_params.strip().split(",")
    logger.info("pruned params: {}".format(pruned_params))
    pruned_ratios = [float(n) for n in FLAGS.pruned_ratios.strip().split(",")]
    logger.info("pruned ratios: {}".format(pruned_ratios))
    assert len(pruned_params) == len(pruned_ratios), \
        "The length of pruned params and pruned ratios should be equal."
    # Check every ratio individually: comparing whole lists with < / > is
    # lexicographic and would let out-of-range later elements through.
    assert all(0 < ratio < 1 for ratio in pruned_ratios), \
        "The elements of pruned ratios should be in range (0, 1)."

    assert FLAGS.prune_criterion in ['l1_norm', 'geometry_median'], \
            "unsupported prune criterion {}".format(FLAGS.prune_criterion)
    pruner = Pruner(criterion=FLAGS.prune_criterion)
    distill_prog = pruner.prune(fluid.default_main_program(),
                                fluid.global_scope(),
                                params=pruned_params,
                                ratios=pruned_ratios,
                                place=place,
                                only_graph=False)[0]

    base_flops = flops(eval_prog)
    eval_prog = pruner.prune(eval_prog,
                             fluid.global_scope(),
                             params=pruned_params,
                             ratios=pruned_ratios,
                             place=place,
                             only_graph=True)[0]
    pruned_flops = flops(eval_prog)
    logger.info("FLOPs -{}; total FLOPs: {}; pruned FLOPs: {}".format(
        float(base_flops - pruned_flops) / base_flops, base_flops,
        pruned_flops))

    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_reduce_ops = False
    build_strategy.fuse_all_optimizer_ops = False
    build_strategy.fuse_elewise_add_act_ops = True
    # only enable sync_bn in multi GPU devices
    sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn'
    build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \
        and cfg.use_gpu

    exec_strategy = fluid.ExecutionStrategy()
    # iteration number when CompiledProgram tries to drop local execution scopes.
    # Set it to be 1 to save memory usages, so that unused variables in
    # local execution scopes can be deleted after each iteration.
    exec_strategy.num_iteration_per_drop_scope = 1

    parallel_main = fluid.CompiledProgram(distill_prog).with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)
    compiled_eval_prog = fluid.CompiledProgram(eval_prog)

    # parse eval fetches
    extra_keys = []
    if cfg.metric == 'COCO':
        extra_keys = ['im_info', 'im_id', 'im_shape']
    if cfg.metric == 'VOC':
        extra_keys = ['gt_bbox', 'gt_class', 'is_difficult']
    eval_keys, eval_values, eval_cls = parse_fetches(fetches, eval_prog,
                                                     extra_keys)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False
    if hasattr(model, 'is_bbox_normalized') and \
            callable(model.is_bbox_normalized):
        is_bbox_normalized = model.is_bbox_normalized()
    map_type = cfg.map_type if 'map_type' in cfg else '11point'
    best_box_ap_list = [0.0, 0]  #[map, iter]
    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)

    train_loader.start()
    for step_id in range(start_iter, cfg.max_iters):
        teacher_loss_np, distill_loss_np, loss_np, lr_np = exe.run(
            parallel_main,
            fetch_list=[
                'teacher_' + teacher_loss.name, distill_loss.name, loss.name,
                lr.name
            ])
        if step_id % cfg.log_iter == 0:
            logger.info(
                "step {} lr {:.6f}, loss {:.6f}, distill_loss {:.6f}, teacher_loss {:.6f}"
                .format(step_id, lr_np[0], loss_np[0], distill_loss_np[0],
                        teacher_loss_np[0]))
        # Snapshot + evaluate every snapshot_iter steps (except step 0) and
        # on the very last step.
        if step_id % cfg.snapshot_iter == 0 and step_id != 0 or step_id == cfg.max_iters - 1:
            save_name = str(
                step_id) if step_id != cfg.max_iters - 1 else "model_final"
            checkpoint.save(exe, distill_prog,
                            os.path.join(save_dir, save_name))
            # eval
            results = eval_run(exe, compiled_eval_prog, eval_loader, eval_keys,
                               eval_values, eval_cls, cfg)
            resolution = None
            box_ap_stats = eval_results(results, cfg.metric, cfg.num_classes,
                                        resolution, is_bbox_normalized,
                                        FLAGS.output_eval, map_type,
                                        cfg['EvalReader']['dataset'])

            if box_ap_stats[0] > best_box_ap_list[0]:
                best_box_ap_list[0] = box_ap_stats[0]
                best_box_ap_list[1] = step_id
                # NOTE(review): the best model is written to the current
                # working directory rather than save_dir; confirm intent.
                checkpoint.save(exe, distill_prog,
                                os.path.join("./", "best_model"))
            logger.info("Best test box ap: {}, in step: {}".format(
                best_box_ap_list[0], best_box_ap_list[1]))
    train_loader.reset()
コード例 #24
0
ファイル: run_classifier.py プロジェクト: gentelyang/scripts
    def net(self):
        """Build the BERT classification training program and dump it to disk.

        Reads options from ``self.p_args()``, initialises fleet with a
        user-defined collective role when training is requested, builds the
        model and optimizer inside fresh programs, runs the startup program,
        and writes fleet's original program to ``__model__`` (serialized) and
        ``debug_program`` (text).

        Returns:
            The loss variable returned by ``create_model``.

        Raises:
            KeyError: If ``args.task_name`` is not one of the known tasks.
        """
        args = self.p_args()
        bert_config = BertConfig("uncased_L-24_H-1024_A-16/bert_config.json")
        bert_config.print_config()
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = 1
        if args.do_train:
            my_dist_env = dist_env()
            worker_endpoints_env = my_dist_env["trainer_endpoints"]
            worker_endpoints = worker_endpoints_env.split(",")
            current_endpoint = my_dist_env["current_endpoint"]
            trainer_id = worker_endpoints.index(current_endpoint)
            # new rolemaker here
            print("current_id: ", trainer_id)
            print("worker_endpoints: ", worker_endpoints)
            role = role_maker.UserDefinedCollectiveRoleMaker(
                current_id=trainer_id, worker_endpoints=worker_endpoints)
            # Fleet get role of each worker
            fleet.init(role)
            # worker_endpoints only exists on this branch, so derive the
            # device count here; otherwise the default of 1 above is kept.
            dev_count = len(worker_endpoints)
        exe = fluid.Executor(place)

        # init program
        train_program = fluid.Program()
        startup_prog = fluid.Program()

        if args.random_seed != 0:
            print("set program random seed as: ", args.random_seed)
            startup_prog.random_seed = args.random_seed
            train_program.random_seed = args.random_seed

        task_name = args.task_name.lower()
        processors = {
            'xnli': reader.XnliProcessor,
            'cola': reader.ColaProcessor,
            'mrpc': reader.MrpcProcessor,
            'mnli': reader.MnliProcessor,
        }
        processor = processors[task_name](data_dir=args.data_dir,
                                          vocab_path=args.vocab_path,
                                          max_seq_len=args.max_seq_len,
                                          do_lower_case=args.do_lower_case,
                                          in_tokens=args.in_tokens,
                                          random_seed=args.random_seed)
        num_labels = len(processor.get_labels())

        # we need to keep every trainer of fleet the same shuffle_seed
        print("shuffle_seed: ", args.shuffle_seed)
        self.train_data_generator = processor.data_generator(
            batch_size=args.batch_size,
            phase='train',
            epoch=args.epoch,
            dev_count=dev_count,
            dev_idx=0,
            shuffle=args.shuffle,
            shuffle_seed=args.shuffle_seed)

        # Result is unused below; the call is kept in case it has side
        # effects on the processor.
        num_train_examples = processor.get_num_examples(phase='train')

        # Fixed, tiny schedule: 5 steps with 10% warmup (rounds down to 0).
        max_train_steps = 5
        self.warmup_steps = int(5 * 0.1)

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        dist_strategy = DistributedStrategy()
        dist_strategy.exec_strategy = exec_strategy
        dist_strategy.nccl_comm_num = 3
        dist_strategy.use_hierarchical_allreduce = True
        #dist_strategy.mode = "collective"
        #dist_strategy.collective_mode = "grad_allreduce"

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                self.train_pyreader, self.loss, probs, accuracy, num_seqs, checkpoints = create_model(
                    args, bert_config=bert_config, num_labels=num_labels)
                scheduled_lr = optimization(loss=self.loss,
                                            warmup_steps=self.warmup_steps,
                                            num_train_steps=max_train_steps,
                                            learning_rate=args.learning_rate,
                                            train_program=train_program,
                                            startup_prog=startup_prog,
                                            weight_decay=args.weight_decay,
                                            scheduler=args.lr_scheduler,
                                            use_fp16=False,
                                            loss_scaling=args.loss_scaling,
                                            dist_strategy=dist_strategy)

        exe.run(startup_prog)
        # Dump fleet's pre-transformation program in binary and text form.
        with open("__model__", "wb") as f:
            f.write(fleet._origin_program.desc.serialize_to_string())

        with open("debug_program", "w") as f:
            f.write(str(fleet._origin_program))
        return self.loss
コード例 #25
0
def train(args):
    """Build the train/valid programs for ``args.model_name`` and run training.

    Parses ``args.config``, constructs the train and valid models in separate
    programs that share one startup program, compiles both for data-parallel
    execution, scales the configured batch sizes by the GPU count, wires the
    readers into the dataloaders and delegates the loop to
    ``train_with_dataloader``.

    Args:
        args: Parsed command-line options, read as attributes.

    Raises:
        AssertionError: If ``CUDA_VISIBLE_DEVICES`` lists a different number
            of GPUs than ``TRAIN.num_gpus`` in the config.
    """
    # ---- configuration ----
    cfg = parse_config(args.config)
    cli_overrides = vars(args)
    train_config = merge_configs(cfg, 'train', cli_overrides)
    valid_config = merge_configs(cfg, 'valid', cli_overrides)
    print_configs(train_config, 'Train')
    train_model = models.get_model(args.model_name, train_config, mode='train')
    valid_model = models.get_model(args.model_name, valid_config, mode='valid')

    # ---- training graph ----
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    if args.fix_random_seed:
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000
    with fluid.program_guard(train_prog, startup_prog), \
            fluid.unique_name.guard():
        train_model.build_input(use_dataloader=True)
        train_model.build_model()
        # feeds are laid out as [data1, data2, ..., label]
        train_feeds = train_model.feeds()
        train_fetch_list = train_model.fetches()
        # the first fetch is the loss to optimise
        train_loss = train_fetch_list[0]
        train_model.optimizer().minimize(train_loss)
        train_dataloader = train_model.dataloader()

    # ---- validation graph (shares the startup program) ----
    valid_prog = fluid.Program()
    with fluid.program_guard(valid_prog, startup_prog), \
            fluid.unique_name.guard():
        valid_model.build_input(use_dataloader=True)
        valid_model.build_model()
        valid_feeds = valid_model.feeds()
        valid_fetch_list = valid_model.fetches()
        valid_dataloader = valid_model.dataloader()

    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.pretrain:
        train_model.load_pretrain_params(exe, args.pretrain, train_prog)

    # ---- data-parallel compilation ----
    build_strategy = fluid.BuildStrategy()
    build_strategy.enable_inplace = True

    exec_strategy = fluid.ExecutionStrategy()

    compiled_train_prog = fluid.compiler.CompiledProgram(train_prog)
    compiled_train_prog = compiled_train_prog.with_data_parallel(
        loss_name=train_loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)
    compiled_valid_prog = fluid.compiler.CompiledProgram(valid_prog)
    compiled_valid_prog = compiled_valid_prog.with_data_parallel(
        share_vars_from=compiled_train_prog,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    # ---- per-device batch size ----
    bs_denominator = 1
    if args.use_gpu:
        # When CUDA_VISIBLE_DEVICES is set it must agree with the config.
        visible = os.getenv("CUDA_VISIBLE_DEVICES", "")
        if visible != "":
            num_gpus = len(visible.split(","))
            assert num_gpus == train_config.TRAIN.num_gpus, \
                   "num_gpus({}) set by CUDA_VISIBLE_DEVICES " \
                   "shoud be the same as that " \
                   "set in {}({})".format(
                   num_gpus, args.config, train_config.TRAIN.num_gpus)
        bs_denominator = train_config.TRAIN.num_gpus

    train_config.TRAIN.batch_size = int(
        train_config.TRAIN.batch_size / bs_denominator)
    valid_config.VALID.batch_size = int(
        valid_config.VALID.batch_size / bs_denominator)

    # ---- readers and metrics ----
    model_key = args.model_name.upper()
    train_reader = get_reader(model_key, 'train', train_config)
    valid_reader = get_reader(model_key, 'valid', valid_config)

    train_metrics = get_metrics(model_key, 'train', train_config)
    valid_metrics = get_metrics(model_key, 'valid', valid_config)

    # fall back to the model's own epoch count when none is given
    epochs = args.epoch or train_model.epoch_num()

    if args.use_gpu:
        exe_places = fluid.cuda_places()
    else:
        exe_places = fluid.cpu_places()
    train_dataloader.set_sample_list_generator(train_reader, places=exe_places)
    valid_dataloader.set_sample_list_generator(valid_reader, places=exe_places)

    # ---- run ----
    train_with_dataloader(
        exe,
        train_prog,
        compiled_train_prog,
        train_dataloader,
        train_fetch_list,
        train_metrics,
        epochs=epochs,
        log_interval=args.log_interval,
        valid_interval=args.valid_interval,
        save_dir=args.save_dir,
        save_model_name=args.model_name,
        fix_random_seed=args.fix_random_seed,
        compiled_test_prog=compiled_valid_prog,
        test_dataloader=valid_dataloader,
        test_fetch_list=valid_fetch_list,
        test_metrics=valid_metrics)
コード例 #26
0
ファイル: prune.py プロジェクト: libo-coder/PaddleDetection
def main():
    """Prune a detection model's parameters and fine-tune the pruned model.

    Reads its settings from the module-level ``FLAGS`` and from the config
    file named by ``FLAGS.config``. The function:

    1. builds the training program (and, when ``FLAGS.eval`` is set, an
       evaluation program sharing the same startup program);
    2. loads ``cfg.pretrain_weights`` if given;
    3. prunes the parameters listed in ``FLAGS.pruned_params`` by the matching
       ratios in ``FLAGS.pruned_ratios`` using ``Pruner``;
    4. trains from ``start_iter`` up to ``cfg.max_iters``, saving snapshots
       and (with ``FLAGS.eval``) the best model by box AP.

    When ``FLAGS.print_params`` is set, it only prints the parameter names and
    shapes of the training program and returns.

    Raises:
        AssertionError: if ``FLAGS.pruned_params`` is missing, the numbers of
            pruned params and ratios differ, any ratio is outside ``(0, 1)``,
            or the prune criterion is unsupported.
    """
    env = os.environ
    # Distributed mode is detected from the launcher's environment variables.
    FLAGS.dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env
    if FLAGS.dist:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        import random
        # Give every trainer its own seed.
        local_seed = (99 + trainer_id)
        random.seed(local_seed)
        np.random.seed(local_seed)

    cfg = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    check_config(cfg)
    # check if set use_gpu=True in paddlepaddle cpu version
    check_gpu(cfg.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()

    main_arch = cfg.architecture

    if cfg.use_gpu:
        devices_num = fluid.core.get_cuda_device_count()
    else:
        devices_num = int(os.environ.get('CPU_NUM', 1))

    if 'FLAGS_selected_gpus' in env:
        device_id = int(env['FLAGS_selected_gpus'])
    else:
        device_id = 0
    place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    lr_builder = create('LearningRate')
    optim_builder = create('OptimizerBuilder')

    # build program
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = create(main_arch)
            if FLAGS.fp16:
                assert (getattr(model.backbone, 'norm_type', None)
                        != 'affine_channel'), \
                    '--fp16 currently does not support affine channel, ' \
                    ' please modify backbone settings to use batch norm'

            with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx:
                inputs_def = cfg['TrainReader']['inputs_def']
                feed_vars, train_loader = model.build_inputs(**inputs_def)
                train_fetches = model.train(feed_vars)
                loss = train_fetches['loss']
                # In fp16 mode the loss is scaled before minimize() and
                # scaled back afterwards.
                if FLAGS.fp16:
                    loss *= ctx.get_loss_scale_var()
                lr = lr_builder()
                optimizer = optim_builder(lr)
                optimizer.minimize(loss)
                if FLAGS.fp16:
                    loss /= ctx.get_loss_scale_var()

    # parse train fetches; the learning rate is appended as the last fetch
    train_keys, train_values, _ = parse_fetches(train_fetches)
    train_values.append(lr)

    if FLAGS.print_params:
        param_delimit_str = '-' * 20 + "All parameters in current graph" + '-' * 20
        print(param_delimit_str)
        for block in train_prog.blocks:
            for param in block.all_parameters():
                print("parameter name: {}\tshape: {}".format(
                    param.name, param.shape))
        print('-' * len(param_delimit_str))
        return

    if FLAGS.eval:
        eval_prog = fluid.Program()
        with fluid.program_guard(eval_prog, startup_prog):
            with fluid.unique_name.guard():
                model = create(main_arch)
                inputs_def = cfg['EvalReader']['inputs_def']
                feed_vars, eval_loader = model.build_inputs(**inputs_def)
                fetches = model.eval(feed_vars)
        eval_prog = eval_prog.clone(True)

        eval_reader = create_reader(cfg.EvalReader)
        eval_loader.set_sample_list_generator(eval_reader, place)

        # parse eval fetches
        extra_keys = []
        if cfg.metric == 'COCO':
            extra_keys = ['im_info', 'im_id', 'im_shape']
        if cfg.metric == 'VOC':
            extra_keys = ['gt_bbox', 'gt_class', 'is_difficult']
        if cfg.metric == 'WIDERFACE':
            extra_keys = ['im_id', 'im_shape', 'gt_bbox']
        eval_keys, eval_values, eval_cls = parse_fetches(
            fetches, eval_prog, extra_keys)

    # compile program for multi-devices
    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_optimizer_ops = False
    build_strategy.fuse_elewise_add_act_ops = True
    # only enable sync_bn in multi GPU devices
    sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn'
    build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \
        and cfg.use_gpu

    exec_strategy = fluid.ExecutionStrategy()
    # iteration number when CompiledProgram tries to drop local execution scopes.
    # Set it to be 1 to save memory usages, so that unused variables in
    # local execution scopes can be deleted after each iteration.
    exec_strategy.num_iteration_per_drop_scope = 1
    if FLAGS.dist:
        dist_utils.prepare_for_multi_process(exe, build_strategy, startup_prog,
                                             train_prog)
        exec_strategy.num_threads = 1

    exe.run(startup_prog)

    start_iter = 0
    if cfg.pretrain_weights:
        checkpoint.load_params(exe, train_prog, cfg.pretrain_weights)

    assert FLAGS.pruned_params is not None, \
        "FLAGS.pruned_params is empty!!! Please set it by '--pruned_params' option."
    pruned_params = FLAGS.pruned_params.strip().split(",")
    logger.info("pruned params: {}".format(pruned_params))
    pruned_ratios = [float(n) for n in FLAGS.pruned_ratios.strip().split(",")]
    logger.info("pruned ratios: {}".format(pruned_ratios))
    assert len(pruned_params) == len(pruned_ratios), \
        "The length of pruned params and pruned ratios should be equal."
    # Validate every ratio individually. Comparing two lists with `<` / `>`
    # is lexicographic, so it would only look at the first differing element
    # and let e.g. [0.5, 1.5] through.
    assert all(0 < ratio < 1 for ratio in pruned_ratios), \
        "The elements of pruned ratios should be in range (0, 1)."

    assert FLAGS.prune_criterion in ['l1_norm', 'geometry_median'], \
            "unsupported prune criterion {}".format(FLAGS.prune_criterion)
    pruner = Pruner(criterion=FLAGS.prune_criterion)
    # Training program: prune with only_graph=False.
    train_prog = pruner.prune(train_prog,
                              fluid.global_scope(),
                              params=pruned_params,
                              ratios=pruned_ratios,
                              place=place,
                              only_graph=False)[0]

    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    if FLAGS.eval:

        base_flops = flops(eval_prog)
        # Evaluation program: prune with only_graph=True.
        eval_prog = pruner.prune(eval_prog,
                                 fluid.global_scope(),
                                 params=pruned_params,
                                 ratios=pruned_ratios,
                                 place=place,
                                 only_graph=True)[0]
        pruned_flops = flops(eval_prog)
        logger.info("FLOPs -{}; total FLOPs: {}; pruned FLOPs: {}".format(
            float(base_flops - pruned_flops) / base_flops, base_flops,
            pruned_flops))
        compiled_eval_prog = fluid.CompiledProgram(eval_prog)

    if FLAGS.resume_checkpoint:
        checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint)
        start_iter = checkpoint.global_step()

    train_reader = create_reader(cfg.TrainReader,
                                 (cfg.max_iters - start_iter) * devices_num,
                                 cfg)
    train_loader.set_sample_list_generator(train_reader, place)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False
    if hasattr(model, 'is_bbox_normalized') and \
            callable(model.is_bbox_normalized):
        is_bbox_normalized = model.is_bbox_normalized()

    # if map_type not set, use default 11point, only use in VOC eval
    map_type = cfg.map_type if 'map_type' in cfg else '11point'

    train_stats = TrainingStats(cfg.log_smooth_window, train_keys)
    train_loader.start()
    start_time = time.time()
    end_time = time.time()

    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)
    time_stat = deque(maxlen=cfg.log_smooth_window)
    best_box_ap_list = [0.0, 0]  #[map, iter]

    # use VisualDL to log data
    if FLAGS.use_vdl:
        from visualdl import LogWriter
        vdl_writer = LogWriter(FLAGS.vdl_log_dir)
        vdl_loss_step = 0
        vdl_mAP_step = 0

    if FLAGS.eval:
        resolution = None
        if 'Mask' in cfg.architecture:
            resolution = model.mask_head.resolution
        # evaluate the pruned model once before fine-tuning starts
        results = eval_run(exe,
                           compiled_eval_prog,
                           eval_loader,
                           eval_keys,
                           eval_values,
                           eval_cls,
                           cfg,
                           resolution=resolution)
        dataset = cfg['EvalReader']['dataset']
        box_ap_stats = eval_results(results,
                                    cfg.metric,
                                    cfg.num_classes,
                                    resolution,
                                    is_bbox_normalized,
                                    FLAGS.output_eval,
                                    map_type,
                                    dataset=dataset)

    for it in range(start_iter, cfg.max_iters):
        # Per-iteration wall time, smoothed over the last log_smooth_window
        # iterations, drives the ETA estimate.
        start_time = end_time
        end_time = time.time()
        time_stat.append(end_time - start_time)
        time_cost = np.mean(time_stat)
        eta_sec = (cfg.max_iters - it) * time_cost
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        outs = exe.run(compiled_train_prog, fetch_list=train_values)
        # outs[-1] is the learning rate appended above; the rest map to keys.
        stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])}

        # use VisualDL to log loss
        if FLAGS.use_vdl:
            if it % cfg.log_iter == 0:
                for loss_name, loss_value in stats.items():
                    vdl_writer.add_scalar(loss_name, loss_value, vdl_loss_step)
                vdl_loss_step += 1

        train_stats.update(stats)
        logs = train_stats.log()
        # Only trainer 0 logs in distributed mode.
        if it % cfg.log_iter == 0 and (not FLAGS.dist or trainer_id == 0):
            strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format(
                it, np.mean(outs[-1]), logs, time_cost, eta)
            logger.info(strs)

        # Snapshot every snapshot_iter iterations and at the final iteration
        # (trainer 0 only in distributed mode).
        if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1) \
           and (not FLAGS.dist or trainer_id == 0):
            save_name = str(it) if it != cfg.max_iters - 1 else "model_final"
            checkpoint.save(exe, train_prog, os.path.join(save_dir, save_name))

            if FLAGS.eval:
                # evaluation
                resolution = None
                if 'Mask' in cfg.architecture:
                    resolution = model.mask_head.resolution
                results = eval_run(exe,
                                   compiled_eval_prog,
                                   eval_loader,
                                   eval_keys,
                                   eval_values,
                                   eval_cls,
                                   cfg=cfg,
                                   resolution=resolution)
                box_ap_stats = eval_results(results,
                                            cfg.metric,
                                            cfg.num_classes,
                                            resolution,
                                            is_bbox_normalized,
                                            FLAGS.output_eval,
                                            map_type,
                                            dataset=dataset)

                # use VisualDL to log mAP
                if FLAGS.use_vdl:
                    vdl_writer.add_scalar("mAP", box_ap_stats[0], vdl_mAP_step)
                    vdl_mAP_step += 1

                # Track and persist the best model by box AP.
                if box_ap_stats[0] > best_box_ap_list[0]:
                    best_box_ap_list[0] = box_ap_stats[0]
                    best_box_ap_list[1] = it
                    checkpoint.save(exe, train_prog,
                                    os.path.join(save_dir, "best_model"))
                logger.info("Best test box ap: {}, in iter: {}".format(
                    best_box_ap_list[0], best_box_ap_list[1]))

    train_loader.reset()
コード例 #27
0
ファイル: train_quant.py プロジェクト: zjhellofss/PaddleSeg
def train_quant(cfg):
    """Run quantization-aware training for a segmentation model.

    Builds the training program with ``build_model``, restores a checkpoint
    or pretrained weights when configured, wraps the program with
    ``quant_aware`` (once for training, once with ``for_test=True``), and
    trains from ``begin_epoch`` through ``cfg.SOLVER.NUM_EPOCHS`` inclusive.
    Checkpoints of the ``for_test`` program are saved every
    ``cfg.TRAIN.SNAPSHOT_EPOCH`` epochs, at the last epoch, and once more as
    ``'final'``; with ``args.do_eval`` the best model by mIoU is tracked.

    Besides ``cfg``, the function reads ``args``, which is not a parameter
    and must be defined outside this function.

    Args:
        cfg: configuration object; the fields read here include DATASET,
            DATALOADER, BATCH_SIZE, NUM_TRAINERS, TRAINER_ID, TRAIN and
            SOLVER.

    Raises:
        AssertionError: if ``cfg.BATCH_SIZE`` is not divisible by the device
            count.
        ValueError: if the starting epoch exceeds ``cfg.SOLVER.NUM_EPOCHS``.
    """
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    # Fix the program seeds when enable_ce is set.
    if args.enable_ce:
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000
    drop_last = True

    dataset = SegDataset(file_list=cfg.DATASET.TRAIN_FILE_LIST,
                         mode=ModelPhase.TRAIN,
                         shuffle=True,
                         data_dir=cfg.DATASET.DATA_DIR)

    def data_generator():
        # Yields samples one at a time, but only releases them in groups of
        # BATCH_SIZE // NUM_TRAINERS; the trailing partial group is released
        # only when sync batch norm is off.
        if args.use_mpio:
            data_gen = dataset.multiprocess_generator(
                num_processes=cfg.DATALOADER.NUM_WORKERS,
                max_queue_size=cfg.DATALOADER.BUF_SIZE)
        else:
            data_gen = dataset.generator()

        batch_data = []
        for b in data_gen:
            batch_data.append(b)
            if len(batch_data) == (cfg.BATCH_SIZE // cfg.NUM_TRAINERS):
                for item in batch_data:
                    yield item[0], item[1], item[2]
                batch_data = []
        # If use sync batch norm strategy, drop last batch if number of samples
        # in batch_data is less then cfg.BATCH_SIZE to avoid NCCL hang issues
        if not cfg.TRAIN.SYNC_BATCH_NORM:
            for item in batch_data:
                yield item[0], item[1], item[2]

    # Get device environment
    # places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()
    # place = places[0]
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()

    # Device count: NUM_TRAINERS when running multi-trainer, otherwise the
    # number of visible places.
    dev_count = cfg.NUM_TRAINERS if cfg.NUM_TRAINERS > 1 else len(places)
    print_info("#Device count: {}".format(dev_count))

    # Make sure BATCH_SIZE can divided by GPU cards
    assert cfg.BATCH_SIZE % dev_count == 0, (
        'BATCH_SIZE:{} not divisble by number of GPUs:{}'.format(
            cfg.BATCH_SIZE, dev_count))
    # If use multi-gpu training mode, batch data will allocated to each GPU evenly
    batch_size_per_dev = cfg.BATCH_SIZE // dev_count
    print_info("batch_size_per_dev: {}".format(batch_size_per_dev))

    data_loader, avg_loss, lr, pred, grts, masks = build_model(
        train_prog, startup_prog, phase=ModelPhase.TRAIN)
    data_loader.set_sample_generator(data_generator,
                                     batch_size=batch_size_per_dev,
                                     drop_last=drop_last)

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    exec_strategy = fluid.ExecutionStrategy()
    # On GPU, use one execution thread per CUDA device.
    if args.use_gpu:
        exec_strategy.num_threads = fluid.core.get_cuda_device_count()
    # Clear temporary variables every 100 iterations
    exec_strategy.num_iteration_per_drop_scope = 100
    build_strategy = fluid.BuildStrategy()

    if cfg.NUM_TRAINERS > 1 and args.use_gpu:
        dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog)
        exec_strategy.num_threads = 1

    # Resume training
    begin_epoch = cfg.SOLVER.BEGIN_EPOCH
    if cfg.TRAIN.RESUME_MODEL_DIR:
        begin_epoch = load_checkpoint(exe, train_prog)
    # Load pretrained model
    elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL_DIR):
        load_pretrained_weights(exe, train_prog,
                                cfg.TRAIN.PRETRAINED_MODEL_DIR)
    else:
        print_info(
            'Pretrained model dir {} not exists, training from scratch...'.
            format(cfg.TRAIN.PRETRAINED_MODEL_DIR))

    # Fetch by variable name: avg_loss, lr, pred, grts and masks are rebound
    # to fetched values (or a float accumulator) further down.
    fetch_list = [avg_loss.name, lr.name]
    if args.debug:
        # Fetch more variable info and use streaming confusion matrix to
        # calculate IoU results if in debug mode
        np.set_printoptions(precision=4,
                            suppress=True,
                            linewidth=160,
                            floatmode="fixed")
        fetch_list.extend([pred.name, grts.name, masks.name])
        cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True)

    not_quant_pattern = []
    if args.not_quant_pattern:
        not_quant_pattern = args.not_quant_pattern
    # Quantization settings handed to quant_aware for both programs below.
    config = {
        'weight_quantize_type': 'channel_wise_abs_max',
        'activation_quantize_type': 'moving_average_abs_max',
        'quantize_op_types': ['depthwise_conv2d', 'mul', 'conv2d'],
        'not_quant_pattern': not_quant_pattern
    }
    compiled_train_prog = quant_aware(train_prog,
                                      place,
                                      config,
                                      for_test=False)
    # eval_prog is the program that gets checkpointed below.
    eval_prog = quant_aware(train_prog, place, config, for_test=True)
    build_strategy.fuse_all_reduce_ops = False
    build_strategy.sync_batch_norm = False
    compiled_train_prog = compiled_train_prog.with_data_parallel(
        loss_name=avg_loss.name,
        exec_strategy=exec_strategy,
        build_strategy=build_strategy)

    # trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))
    # num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
    global_step = 0
    # Total number of steps over the remaining epochs, used for the ETA.
    all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE
    if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last != True:
        all_step += 1
    all_step *= (cfg.SOLVER.NUM_EPOCHS - begin_epoch + 1)

    # From here on avg_loss is a running float sum, no longer the loss
    # variable returned by build_model.
    avg_loss = 0.0
    best_mIoU = 0.0

    timer = Timer()
    timer.start()
    if begin_epoch > cfg.SOLVER.NUM_EPOCHS:
        raise ValueError((
            "begin epoch[{}] is larger than cfg.SOLVER.NUM_EPOCHS[{}]").format(
                begin_epoch, cfg.SOLVER.NUM_EPOCHS))

    if args.use_mpio:
        print_info("Use multiprocess reader")
    else:
        print_info("Use multi-thread reader")

    for epoch in range(begin_epoch, cfg.SOLVER.NUM_EPOCHS + 1):
        data_loader.start()
        # Run steps until the loader signals end of data for this epoch.
        while True:
            try:
                if args.debug:
                    # Print category IoU and accuracy to check whether the
                    # training process corresponds to expectation
                    loss, lr, pred, grts, masks = exe.run(
                        program=compiled_train_prog,
                        fetch_list=fetch_list,
                        return_numpy=True)
                    cm.calculate(pred, grts, masks)
                    avg_loss += np.mean(np.array(loss))
                    global_step += 1

                    # NOTE(review): unlike the non-debug branch, this log is
                    # not restricted to cfg.TRAINER_ID == 0 - confirm intended.
                    if global_step % args.log_steps == 0:
                        speed = args.log_steps / timer.elapsed_time()
                        avg_loss /= args.log_steps
                        category_acc, mean_acc = cm.accuracy()
                        category_iou, mean_iou = cm.mean_iou()

                        print_info((
                            "epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, global_step, lr[0], avg_loss, mean_acc,
                                 mean_iou, speed,
                                 calculate_eta(all_step - global_step, speed)))
                        print_info("Category IoU: ", category_iou)
                        print_info("Category Acc: ", category_acc)
                        sys.stdout.flush()
                        # Reset the accumulators for the next logging window.
                        avg_loss = 0.0
                        cm.zero_matrix()
                        timer.restart()
                else:
                    # If not in debug mode, avoid unnecessary logging and
                    # calculation
                    loss, lr = exe.run(program=compiled_train_prog,
                                       fetch_list=fetch_list,
                                       return_numpy=True)
                    avg_loss += np.mean(np.array(loss))
                    global_step += 1

                    if global_step % args.log_steps == 0 and cfg.TRAINER_ID == 0:
                        avg_loss /= args.log_steps
                        speed = args.log_steps / timer.elapsed_time()
                        print((
                            "epoch={} step={} lr={:.5f} loss={:.4f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, global_step, lr[0], avg_loss, speed,
                                 calculate_eta(all_step - global_step, speed)))
                        sys.stdout.flush()
                        avg_loss = 0.0
                        timer.restart()

            except fluid.core.EOFException:
                # End of the epoch's data: reset the loader and leave the loop.
                data_loader.reset()
                break
            # NOTE(review): every other exception is only printed and the loop
            # continues; a persistent error would be printed on every pass
            # without ever leaving the loop - confirm this is intended.
            except Exception as e:
                print(e)

        # Snapshot on the configured epoch interval and at the last epoch,
        # on trainer 0 only.
        if (epoch % cfg.TRAIN.SNAPSHOT_EPOCH == 0
                or epoch == cfg.SOLVER.NUM_EPOCHS) and cfg.TRAINER_ID == 0:
            ckpt_dir = save_checkpoint(exe, eval_prog, epoch)

            if args.do_eval:
                print("Evaluation start")
                _, mean_iou, _, mean_acc = evaluate(
                    cfg=cfg,
                    ckpt_dir=ckpt_dir,
                    use_gpu=args.use_gpu,
                    use_mpio=args.use_mpio,
                    not_quant_pattern=args.not_quant_pattern,
                    convert=False)

                # Keep a copy of the checkpoint with the best mIoU so far.
                if mean_iou > best_mIoU:
                    best_mIoU = mean_iou
                    update_best_model(ckpt_dir)
                    print_info(
                        "Save best model {} to {}, mIoU = {:.4f}".format(
                            ckpt_dir,
                            os.path.join(cfg.TRAIN.MODEL_SAVE_DIR,
                                         'best_model'), mean_iou))

    # save final model
    if cfg.TRAINER_ID == 0:
        save_checkpoint(exe, eval_prog, 'final')
コード例 #28
0
def do_train(args):
    """Train the model built by ``creator.create_model`` until data runs out.

    Builds separate train and test programs sharing one startup program,
    optionally restores ``args.init_checkpoint``, and runs the training
    reader to exhaustion. Metrics are fetched and printed every
    ``args.print_steps`` steps, ``test_process`` runs every
    ``args.validation_steps`` steps, and persistables are saved every
    ``args.save_steps`` steps (except step 0). With ``args.enable_ce`` the
    second-to-last recorded metrics are printed as ``kpis`` lines.

    Args:
        args: parsed options; the fields read here include use_cuda, cpu_num,
            random_seed, base_learning_rate, init_checkpoint, epoch,
            batch_size, train_data, test_data, print_steps, validation_steps,
            save_steps, model_save_dir and enable_ce.
    """
    # init executor
    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = fluid.core.get_cuda_device_count()
    else:
        # Never use more CPU places than the machine has cores.
        dev_count = min(multiprocessing.cpu_count(), args.cpu_num)
        if (dev_count < args.cpu_num):
            print(
                "WARNING: The total CPU NUM in this machine is %d, which is less than cpu_num parameter you set. "
                "Change the cpu_num from %d to %d" %
                (dev_count, args.cpu_num, dev_count))
        os.environ['CPU_NUM'] = str(dev_count)
        place = fluid.CPUPlace()

    train_program = fluid.Program()
    test_program = fluid.Program()
    startup_program = fluid.Program()

    dataset = reader.Dataset(args)
    with fluid.program_guard(train_program, startup_program):
        #train_program.random_seed = args.random_seed
        startup_program.random_seed = args.random_seed

        with fluid.unique_name.guard():
            train_ret = creator.create_model(args,
                                             dataset.vocab_size,
                                             dataset.num_labels,
                                             mode='train')

            optimizer = fluid.optimizer.Adam(
                learning_rate=args.base_learning_rate)
            optimizer.minimize(train_ret["avg_cost"])

    with fluid.program_guard(test_program, startup_program):
        with fluid.unique_name.guard():
            test_ret = creator.create_model(args,
                                            dataset.vocab_size,
                                            dataset.num_labels,
                                            mode='test')

            test_program = test_program.clone(for_test=True)

    exe = fluid.Executor(place)
    exe.run(startup_program)

    if args.init_checkpoint:
        model_utils.init_checkpoint(exe, args.init_checkpoint, train_program)
    if dev_count > 1:
        device = "GPU" if args.use_cuda else "CPU"
        print("%d %s are used to train model" % (dev_count, device))
        # multi cpu/gpu config
        exec_strategy = fluid.ExecutionStrategy()

        build_strategy = fluid.compiler.BuildStrategy()

        compiled_prog = fluid.compiler.CompiledProgram(
            train_program).with_data_parallel(
                loss_name=train_ret['avg_cost'].name,
                build_strategy=build_strategy,
                exec_strategy=exec_strategy)
    else:
        compiled_prog = fluid.compiler.CompiledProgram(train_program)

    # start training
    num_train_examples = dataset.get_num_examples(args.train_data)
    max_train_steps = args.epoch * num_train_examples // args.batch_size
    print("Num train examples: %d" % num_train_examples)
    print("Max train steps: %d" % max_train_steps)

    train_generator = creator.create_lexnet_data_generator(
        args,
        reader=dataset,
        file_name=args.train_data,
        place=place,
        mode='train')
    test_generator = creator.create_lexnet_data_generator(
        args,
        reader=dataset,
        file_name=args.test_data,
        place=place,
        mode='test')

    train_reader, test_reader = train_ret['pyreader'], test_ret['pyreader']
    train_reader.set_batch_generator(train_generator, places=place)
    test_reader.set_batch_generator(test_generator, places=place)

    # ce_info collects [elapsed, cost, P, R, F1] at every validation step.
    ce_info = []
    step = 0
    ce_time = 0
    train_reader.start()
    while True:
        try:
            # this is for minimizing the fetching op, saving the training speed.
            if step % args.print_steps == 0:
                fetch_list = [
                    train_ret["avg_cost"], train_ret["precision"],
                    train_ret["recall"], train_ret["f1_score"],
                    train_ret["crf_avg_cost"], train_ret["teacher_cost"]
                ]
            else:
                fetch_list = []

            start_time = time.time()
            outputs = exe.run(program=compiled_prog, fetch_list=fetch_list)

            end_time = time.time()
            if step % args.print_steps == 0:
                avg_cost, precision, recall, f1_score, crf_avg_cost, teacher_cost = [
                    np.mean(x) for x in outputs
                ]
                print("Data loader queue size: %d " %
                      train_reader.queue.size())
                print(
                    "[train] step = %d, loss = %.5f, P: %.5f, R: %.5f, F1: %.5f, crf_avg_cost: %.5f, teacher_cost: %.5f, elapsed time %.5f"
                    % (step, avg_cost, precision, recall, f1_score,
                       crf_avg_cost, teacher_cost, end_time - start_time))

            if step % args.validation_steps == 0:
                test_process(exe, test_program, test_reader, test_ret)

                # NOTE: the metrics recorded here are the ones from the most
                # recent print step, which may be older than this step when
                # validation_steps is not a multiple of print_steps.
                ce_time += end_time - start_time
                ce_info.append(
                    [ce_time, avg_cost, precision, recall, f1_score])

            # save checkpoints
            if step % args.save_steps == 0 and step != 0:
                save_path = os.path.join(args.model_save_dir,
                                         "step_" + str(step))
                fluid.io.save_persistables(exe, save_path, train_program)
            step += 1
        except fluid.core.EOFException:
            # Training data exhausted: reset the reader and stop.
            train_reader.reset()
            break

    if args.enable_ce:
        card_num = get_cards()
        ce_cost = 0
        ce_f1 = 0
        ce_p = 0
        ce_r = 0
        ce_time = 0
        try:
            # Report the second-to-last validation record.
            ce_time = ce_info[-2][0]
            ce_cost = ce_info[-2][1]
            ce_p = ce_info[-2][2]
            ce_r = ce_info[-2][3]
            ce_f1 = ce_info[-2][4]
        except IndexError:
            # Fewer than two validation records were collected; keep the
            # zero defaults. Catching only IndexError lets KeyboardInterrupt
            # and unrelated errors propagate instead of being hidden.
            print("ce info error")
        print("kpis\teach_step_duration_card%s\t%s" % (card_num, ce_time))
        print("kpis\ttrain_cost_card%s\t%f" % (card_num, ce_cost))
        print("kpis\ttrain_precision_card%s\t%f" % (card_num, ce_p))
        print("kpis\ttrain_recall_card%s\t%f" % (card_num, ce_r))
        print("kpis\ttrain_f1_card%s\t%f" % (card_num, ce_f1))
コード例 #29
0
 def net(self, args=None):
     """
     Assemble the VGG16 classification graph and attach a fleet optimizer.

     Args:
         args: placeholder only; it is replaced by the namespace returned
             from ``parser.parse_args()`` once the extra command-line
             options below have been registered.
     Returns:
         tuple: ``(self.avg_cost, self.loader)``.
     """
     import paddle.distributed.fleet as fleet
     from thirdparty.image_classfication.models.vgg import VGG16
     from thirdparty.image_classfication.train import parser
     from thirdparty.image_classfication.train import optimizer_setting

     # Extra command-line options, registered in this order.
     cli_options = (
         ('--update_method',
          dict(type=str, required=True, choices=['pserver', 'nccl'])),
         ('--role',
          dict(type=str, required=True, choices=['pserver', 'trainer'])),
         ('--endpoints', dict(type=str, required=False, default="")),
         ('--current_id', dict(type=int, required=False, default=0)),
         ('--trainers', dict(type=int, required=False, default=1)),
         ('--run_params', dict(type=str, required=False, default='{}')),
     )
     for flag, spec in cli_options:
         parser.add_argument(flag, **spec)
     args = parser.parse_args()
     # --run_params arrives as a JSON string; decode it in place.
     args.run_params = json.loads(args.run_params)
     print(args.run_params)
     run_cfg = args.run_params

     loss_scale = 1.0
     fluid.io.reader.keep_data_loader_order(False)

     # Inputs and data loader.
     img_var = fluid.data(
         shape=[-1, 3, 224, 224], dtype='float32', name='image')
     lbl_var = fluid.data(shape=[-1, 1], dtype='int64', name='label')
     self.loader = fluid.io.DataLoader.from_generator(
         capacity=16,
         use_double_buffer=True,
         feed_list=[img_var, lbl_var],
         iterable=False)

     # Network and loss.
     vgg = VGG16()
     logits = vgg.net(img_var, 4)
     # The result is unused here; the call is retained because it may still
     # register an op in the program being built.
     softmax_out = fluid.layers.softmax(logits, use_cudnn=False)
     ce_loss, prob = fluid.layers.softmax_with_cross_entropy(
         logits, lbl_var, return_softmax=True)
     self.avg_cost = fluid.layers.mean(ce_loss)
     checkpoints = vgg.checkpoints

     # Optimizer hyper-parameters, filled in from the parsed arguments.
     hyper = vgg.params
     hyper["total_images"] = args.total_images
     hyper["lr"] = 1e-5
     hyper["num_epochs"] = args.num_epochs
     hyper["learning_strategy"]["batch_size"] = args.batch_size
     hyper["learning_strategy"]["name"] = args.lr_strategy
     hyper["l2_decay"] = args.l2_decay
     hyper["momentum_rate"] = args.momentum_rate
     optimizer = optimizer_setting(hyper)
     optimizer._global_learning_rate().persistable = True

     build_strategy = paddle.fluid.BuildStrategy()
     build_strategy.enable_inplace = run_cfg['enable_inplace']
     exec_strategy = fluid.ExecutionStrategy()
     exec_strategy.num_threads = 1
     exec_strategy.num_iteration_per_drop_scope = 3

     dist_strategy = fleet.DistributedStrategy()
     dist_strategy.execution_strategy = exec_strategy
     # (strategy attribute, run_params key) pairs copied over verbatim.
     direct_flags = (
         ('dgc', 'dgc'),
         ('lars', 'lars'),
         ('lamb', 'lamb'),
         ('auto', 'auto'),
         ('amp', 'fp16'),
         ('sync_batch_norm', 'sync_bn'),
         ('nccl_comm_num', 'nccl_comm_num'),
         ('fuse_all_reduce_ops', 'fuse_all_reduce_ops'),
         ('sync_nccl_allreduce', 'sync_nccl_allreduce'),
         ('hierarchical_allreduce_inter_nranks', 'inter_nranks'),
     )
     for attr, key in direct_flags:
         setattr(dist_strategy, attr, run_cfg[key])

     # Boolean features: the flag is always set, the matching *_configs
     # attribute only when the feature is switched on.
     toggles = (
         ('recompute', {"checkpoints": ["fc_1.tmp_1", "fc_1.tmp_2"],
                        "enable_offload": False}),
         ('pipeline', {"micro_batch": 2}),
         ('localsgd', {"k_steps": 2}),
         ('gradient_merge', {"k_steps": 2}),
         ('sharding', {"fuse_broadcast_MB": 32}),
     )
     for feature, feature_cfg in toggles:
         enabled = bool(run_cfg[feature])
         setattr(dist_strategy, feature, enabled)
         if enabled:
             setattr(dist_strategy, feature + '_configs', feature_cfg)

     # Some modes replace the optimizer built above.
     if run_cfg["gradient_merge"]:
         scheduled_lr = self.linear_warmup_decay(
             learning_rate=0.01,
             warmup_steps=16000,
             num_train_steps=1000000)
         optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
     if run_cfg.get("use_dgc"):
         # use dgc must close fuse
         dist_strategy.fuse_all_reduce_ops = False
         optimizer = fluid.optimizer.DGCMomentumOptimizer(
             learning_rate=0.001, momentum=0.9, rampup_begin_step=0)

     dist_optimizer = fleet.distributed_optimizer(
         optimizer, strategy=dist_strategy)
     _, param_grads = dist_optimizer.minimize(self.avg_cost)

     shuffle_seed = 1
     train_reader = reader.train(
         settings=args, data_dir=DATA_DIR, pass_id_as_seed=shuffle_seed)
     batched_reader = paddle.batch(train_reader, batch_size=self.batch_size)
     self.loader.set_sample_list_generator(batched_reader)

     if loss_scale > 1:
         avg_cost = fluid.layers.mean(x=ce_loss) * loss_scale
     return self.avg_cost, self.loader
コード例 #30
0
ファイル: base.py プロジェクト: shlucky401/PaddleSeg
    def train(self,
              num_epochs,
              train_reader,
              train_batch_size=2,
              eval_reader=None,
              eval_best_metric=None,
              save_interval_epochs=1,
              log_interval_steps=2,
              save_dir='output',
              pretrain_weights=None,
              resume_weights=None,
              optimizer=None,
              learning_rate=0.01,
              lr_decay_power=0.9,
              regularization_coeff=4e-5,
              use_vdl=False):
        """Train the model, periodically evaluating and saving checkpoints.

        Args:
            num_epochs: Total number of epochs; the loop runs from
                ``self.begin_epoch`` up to (excluding) this value.
            train_reader: Training dataset. Its ``labels``, ``transforms``
                and ``num_samples`` attributes are read here.
            train_batch_size: Batch size for the training data loader; it is
                also reused as the evaluation batch size.
            eval_reader: Optional evaluation dataset. When given, the model
                is evaluated at every save point.
            eval_best_metric: Key looked up in the dict returned by
                ``self.evaluate`` to decide which checkpoint is the best one.
                It must be a key of that dict whenever ``eval_reader`` is
                given.
            save_interval_epochs: Save (and evaluate) every this many epochs;
                the last epoch is always saved.
            log_interval_steps: Emit a training log line every this many
                steps.
            save_dir: Output directory. Per-epoch checkpoints go to
                ``epoch_<n>``, the best model to ``best_model`` and VisualDL
                logs to ``vdl_log``.
            pretrain_weights: Forwarded to ``self.net_initialize``.
            resume_weights: Forwarded to ``self.net_initialize``.
            optimizer: Optimizer to use. If None, one is built with
                ``self.default_optimizer``.
            learning_rate: Forwarded to ``self.default_optimizer``; only used
                to build the default optimizer.
            lr_decay_power: Forwarded to ``self.default_optimizer``; only
                used to build the default optimizer.
            regularization_coeff: Forwarded to ``self.default_optimizer``;
                only used to build the default optimizer.
            use_vdl: If True, training and evaluation scalars are written
                with ``visualdl.LogWriter``.

        Raises:
            ValueError: If ``self.begin_epoch`` is not smaller than
                ``num_epochs`` after network initialization.
        """
        self.labels = train_reader.labels
        self.train_transforms = train_reader.transforms
        # Snapshot of the call arguments (taken before any other local is
        # defined).
        self.train_init = locals()
        self.begin_epoch = 0

        if optimizer is None:
            num_steps_each_epoch = train_reader.num_samples // train_batch_size
            optimizer = self.default_optimizer(
                learning_rate=learning_rate,
                num_epochs=num_epochs,
                num_steps_each_epoch=num_steps_each_epoch,
                lr_decay_power=lr_decay_power,
                regularization_coeff=regularization_coeff)
        self.optimizer = optimizer
        self.build_program()
        self.net_initialize(startup_prog=fluid.default_startup_program(),
                            pretrain_weights=pretrain_weights,
                            resume_weights=resume_weights)

        if self.begin_epoch >= num_epochs:
            raise ValueError(
                ("begin epoch[{}] is larger than num_epochs[{}]").format(
                    self.begin_epoch, num_epochs))

        if not osp.isdir(save_dir):
            # A non-directory entry with the same name is removed first so
            # that the directory can be created.
            if osp.exists(save_dir):
                os.remove(save_dir)
            os.makedirs(save_dir)

        # add arrange op to transforms
        self.arrange_transform(transforms=train_reader.transforms,
                               mode='train')
        self.build_train_data_loader(dataset=train_reader,
                                     batch_size=train_batch_size)

        if eval_reader is not None:
            self.eval_transforms = eval_reader.transforms
            self.test_transforms = copy.deepcopy(eval_reader.transforms)

        lr = self.optimizer._learning_rate
        # The learning rate may be a plain Python number rather than a
        # Variable; only a Variable has `persistable` and can be fetched.
        if isinstance(lr, fluid.framework.Variable):
            lr.persistable = True
            self.train_outputs['lr'] = lr

        # Multi-card training
        if self.parallel_train_prog is None:
            build_strategy = fluid.compiler.BuildStrategy()
            if self.env_info['place'] != 'cpu' and len(self.places) > 1:
                build_strategy.sync_batch_norm = self.sync_bn
            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.num_iteration_per_drop_scope = 1

            self.parallel_train_prog = fluid.CompiledProgram(
                self.train_prog).with_data_parallel(
                    loss_name=self.train_outputs['loss'].name,
                    build_strategy=build_strategy,
                    exec_strategy=exec_strategy)

        total_num_steps = math.floor(train_reader.num_samples /
                                     train_batch_size)
        num_steps = 0
        # Rolling window holding the durations of the last (up to) 20 steps.
        time_stat = list()
        time_train_one_epoch = None
        time_eval_one_epoch = None

        total_num_steps_eval = 0
        # eval times
        total_eval_times = math.ceil(num_epochs / save_interval_epochs)
        eval_batch_size = train_batch_size
        if eval_reader is not None:
            total_num_steps_eval = math.ceil(eval_reader.num_samples /
                                             eval_batch_size)

        if use_vdl:
            from visualdl import LogWriter
            vdl_logdir = osp.join(save_dir, 'vdl_log')
            log_writer = LogWriter(vdl_logdir)
        best_metric = -1.0
        best_model_epoch = 1
        for i in range(self.begin_epoch, num_epochs):
            records = list()
            step_start_time = time.time()
            epoch_start_time = time.time()
            for step, data in enumerate(self.train_data_loader()):
                outputs = self.exe.run(self.parallel_train_prog,
                                       feed=data,
                                       fetch_list=list(
                                           self.train_outputs.values()))
                outputs_avg = np.mean(np.array(outputs), axis=1)
                records.append(outputs_avg)

                # time estimated to complete the training
                current_time = time.time()
                step_cost_time = current_time - step_start_time
                step_start_time = current_time
                if len(time_stat) < 20:
                    time_stat.append(step_cost_time)
                else:
                    # Overwrite the oldest slot of the rolling window.
                    time_stat[num_steps % 20] = step_cost_time

                num_steps += 1
                if num_steps % log_interval_steps == 0:
                    step_metrics = OrderedDict(
                        zip(list(self.train_outputs.keys()), outputs_avg))

                    if use_vdl:
                        for k, v in step_metrics.items():
                            log_writer.add_scalar(step=num_steps,
                                                  tag='train/{}'.format(k),
                                                  value=v)

                    # Estimate the remaining time
                    avg_step_time = np.mean(time_stat)
                    if time_train_one_epoch is not None:
                        # Use the measured epoch duration for the remaining
                        # full epochs once one epoch has been timed.
                        eta = (num_epochs - i - 1) * time_train_one_epoch + (
                            total_num_steps - step - 1) * avg_step_time
                    else:
                        eta = ((num_epochs - i) * total_num_steps - step -
                               1) * avg_step_time
                    if time_eval_one_epoch is not None:
                        eval_eta = (total_eval_times - i //
                                    save_interval_epochs) * time_eval_one_epoch
                    else:
                        # No evaluation timed yet: approximate an eval step
                        # by the average training step time.
                        eval_eta = (total_eval_times -
                                    i // save_interval_epochs
                                    ) * total_num_steps_eval * avg_step_time
                    eta_str = seconds_to_hms(eta + eval_eta)

                    logging.info(
                        "[TRAIN] Epoch={}/{}, Step={}/{}, {}, time_each_step={}s, eta={}"
                        .format(i + 1, num_epochs, step + 1, total_num_steps,
                                dict2str(step_metrics),
                                round(avg_step_time, 2), eta_str))

            train_metrics = OrderedDict(
                zip(list(self.train_outputs.keys()), np.mean(records, axis=0)))
            logging.info('[TRAIN] Epoch {} finished, {} .'.format(
                i + 1, dict2str(train_metrics)))
            time_train_one_epoch = time.time() - epoch_start_time

            eval_epoch_start_time = time.time()
            if (i + 1) % save_interval_epochs == 0 or i == num_epochs - 1:
                current_save_dir = osp.join(save_dir, "epoch_{}".format(i + 1))
                if not osp.isdir(current_save_dir):
                    os.makedirs(current_save_dir)
                if eval_reader is not None:
                    self.eval_metrics = self.evaluate(
                        eval_reader=eval_reader,
                        batch_size=eval_batch_size,
                        epoch_id=i + 1)
                    # Save the best model
                    current_metric = self.eval_metrics[eval_best_metric]
                    if current_metric > best_metric:
                        best_metric = current_metric
                        best_model_epoch = i + 1
                        best_model_dir = osp.join(save_dir, "best_model")
                        self.save_model(save_dir=best_model_dir)
                    if use_vdl:
                        for k, v in self.eval_metrics.items():
                            # Only scalar-like metrics are logged; lists and
                            # multi-element arrays are skipped.
                            if isinstance(v, list):
                                continue
                            if isinstance(v, np.ndarray):
                                if v.size > 1:
                                    continue
                            log_writer.add_scalar(step=num_steps,
                                                  tag='evaluate/{}'.format(k),
                                                  value=v)
                self.save_model(save_dir=current_save_dir)
                time_eval_one_epoch = time.time() - eval_epoch_start_time
                if eval_reader is not None:
                    logging.info(
                        'Current evaluated best model in validation dataset is epoch_{}, {}={}'
                        .format(best_model_epoch, eval_best_metric,
                                best_metric))