Example #1
def train_with_dataloader(exe, train_prog, compiled_train_prog, train_dataloader, \
                        train_fetch_list, train_metrics, epochs = 10, \
                        log_interval = 0, valid_interval = 0, save_dir = './', \
                        save_model_name = 'model', fix_random_seed = False, \
                        compiled_test_prog = None, test_dataloader = None, \
                        test_fetch_list = None, test_metrics = None, \
                        is_profiler = None, profiler_path = None):
    if not train_dataloader:
        logger.error("[TRAIN] get dataloader failed.")
    epoch_periods = []
    train_loss = 0
    for epoch in range(epochs):
        log_lr_and_step()

        train_iter = 0
        epoch_periods = []

        for data in train_dataloader():
            cur_time = time.time()
            train_outs = exe.run(compiled_train_prog,
                                 fetch_list=train_fetch_list,
                                 feed=data)
            period = time.time() - cur_time
            epoch_periods.append(period)
            if log_interval > 0 and (train_iter % log_interval == 0):
                train_metrics.calculate_and_log_out(train_outs, \
                        info = '[TRAIN] Epoch {}, iter {} '.format(epoch, train_iter))
            train_iter += 1

            # NOTE: profiler tools, used for benchmark
            if is_profiler and epoch == 0 and train_iter == log_interval:
                profiler.start_profiler("All")
            elif is_profiler and epoch == 0 and train_iter == log_interval + 5:
                profiler.stop_profiler("total", profiler_path)
                return

        if len(epoch_periods) < 1:
            logger.info(
                'No iteration was executed, please check the data reader')
            sys.exit(1)

        logger.info(
            '[TRAIN] Epoch {} training finished, average time: {}'.format(
                epoch, np.mean(epoch_periods[1:])))
        save_model(exe, train_prog, save_dir, save_model_name,
                   "_epoch{}".format(epoch))
        if compiled_test_prog and valid_interval > 0 and (
                epoch + 1) % valid_interval == 0:
            test_with_dataloader(exe, compiled_test_prog, test_dataloader,
                                 test_fetch_list, test_metrics, log_interval,
                                 save_model_name)

    save_model(exe, train_prog, save_dir, save_model_name)
    # when fix_random_seed is set for debugging
    if fix_random_seed:
        cards = os.environ.get('CUDA_VISIBLE_DEVICES')
        gpu_num = len(cards.split(","))
        print("kpis\ttrain_cost_card{}\t{}".format(gpu_num, train_loss))
        print("kpis\ttrain_speed_card{}\t{}".format(gpu_num,
                                                    np.mean(epoch_periods)))
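
The profiler calls in this example follow a pattern that recurs throughout these listings: run a few warm-up iterations, call profiler.start_profiler, run a handful more, then call profiler.stop_profiler to write a summary sorted by total time. Below is a minimal, self-contained sketch of that window; run_one_batch is a placeholder for the real exe.run call and is not part of the original code.

import paddle.fluid.profiler as profiler

def run_one_batch():
    # placeholder for exe.run(compiled_train_prog, ...) in the real loop
    pass

def profile_window(num_iters=20, start_iter=5, profiler_path='/tmp/profile'):
    for it in range(num_iters):
        run_one_batch()
        if it == start_iter:
            # profile both CPU and GPU activity
            profiler.start_profiler("All")
        elif it == start_iter + 5:
            # write a report sorted by total time, then stop
            profiler.stop_profiler("total", profiler_path)
            return
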
Example #2
def train_prog(exe, program, loss, node2vec_pyreader, args, train_steps):
    step = 0
    node2vec_pyreader.start()

    profiler.start_profiler("All")
    while True:
        try:
            begin_time = time.time()
            loss_val = exe.run(program, fetch_list=[loss])
            log.info("step %s: loss %.5f speed: %.5f s/step" %
                     (step, np.mean(loss_val), time.time() - begin_time))
            step += 1
        except F.core.EOFException:
            node2vec_pyreader.reset()

        if step % args.steps_per_save == 0 or step == train_steps:
            profiler.stop_profiler("total", "/tmp/profile")
            model_save_dir = args.save_path
            model_path = os.path.join(model_save_dir, str(step))
            if not os.path.exists(model_save_dir):
                os.makedirs(model_save_dir)
            #fleet.save_persistables(exe, model_path)
            F.io.save_params(exe, dirname=model_path, main_program=program)
        if step == train_steps:
            break
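
The loop above relies on fluid signalling data exhaustion with core.EOFException: the reader is started once, each exe.run pulls one batch, and on EOF the reader is reset (and started again for the next pass). Below is a minimal sketch of that cycle under those assumptions; reader and run_step are hypothetical stand-ins for the pyreader and the exe.run call.

import paddle.fluid as fluid

def run_until(reader, run_step, train_steps):
    step = 0
    reader.start()
    while step < train_steps:
        try:
            run_step()
            step += 1
        except fluid.core.EOFException:
            # data source exhausted: rewind the reader and keep training
            reader.reset()
            reader.start()
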
Example #3
def infer(place, save_dirname=None, trans=False):
    if save_dirname is None:
        return

    exe = fluid.Executor(place)

    inference_scope = fluid.core.Scope()
    with fluid.scope_guard(inference_scope):
        # Use fluid.io.load_inference_model to obtain the inference program desc,
        # the feed_target_names (the names of variables that will be fed
        # data using feed operators), and the fetch_targets (variables that
        # we want to obtain data from using fetch operators).
        [inference_program, feed_target_names, fetch_targets
         ] = fluid.io.load_inference_model(save_dirname,
                                           exe,
                                           model_filename='model',
                                           params_filename='params')

        assert feed_target_names[0] == 'data'
        #assert feed_target_names[1] == 'label'

        print(feed_target_names)
        print(fetch_targets)

        if (trans):
            inference_transpiler_program = inference_program.clone()
            t = fluid.transpiler.InferenceTranspiler()
            t.transpile(inference_transpiler_program, place)
            prog = inference_transpiler_program
        else:
            prog = inference_program
        """
        for block in inference_program.blocks:
            for op in block.ops:
                print(op.type)

        print("----------------")

        for block in inference_transpiler_program.blocks:
            for op in block.ops:
                print(op.type)

        print(debugger.pprint_program_codes(inference_program))
        print("----------------")
        print(debugger.pprint_program_codes(inference_transpiler_program))
        exit()
        """

        for i in range(10):
            img_data = np.random.random([1, 3, 224, 224]).astype('float32')
            if (i == 9):
                profiler.start_profiler("All")
            exe.run(prog,
                    feed={feed_target_names[0]: img_data},
                    fetch_list=fetch_targets)
            if (i == 9):
                profiler.stop_profiler("total", "/tmp/profile")
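
Here the first nine runs act as warm-up and only the tenth is profiled. fluid also provides a context-manager form, profiler.profiler(state, sorted_key, profile_path), which wraps the same start/stop pair; the sketch below profiles a single inference run with it and assumes exe, prog, feed_target_names and fetch_targets are set up as in the example above.

import numpy as np
import paddle.fluid.profiler as profiler

def profile_one_run(exe, prog, feed_target_names, fetch_targets):
    img_data = np.random.random([1, 3, 224, 224]).astype('float32')
    # equivalent to start_profiler("All") ... stop_profiler("total", ...)
    with profiler.profiler('All', 'total', '/tmp/profile'):
        exe.run(prog,
                feed={feed_target_names[0]: img_data},
                fetch_list=fetch_targets)
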
Example #4
    def train_loop_pyreader():
        py_reader.start()
        train_stats = TrainingStats(cfg.log_window, keys)
        try:
            start_time = time.time()
            prev_start_time = start_time
            for iter_id in range(cfg.max_iter):
                prev_start_time = start_time
                start_time = time.time()
                outs = train_exe.run(fetch_list=[v.name for v in fetch_list])
                stats = {
                    k: np.array(v).mean()
                    for k, v in zip(keys, outs[:-1])
                }
                train_stats.update(stats)
                logs = train_stats.log()
                strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
                    now_time(), iter_id, np.mean(outs[-1]), logs,
                    start_time - prev_start_time)
                print(strs)
                sys.stdout.flush()
                if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
                    save_model("model_iter{}".format(iter_id))

                # profiler tools, used for benchmarking
                if args.is_profiler and iter_id == 10:
                    profiler.start_profiler("All")
                elif args.is_profiler and iter_id == 15:
                    profiler.stop_profiler("total", args.profiler_path)
                    return

            end_time = time.time()
            total_time = end_time - start_time
            last_loss = np.array(outs[0]).mean()
            if cfg.enable_ce:
                gpu_num = devices_num
                epoch_idx = iter_id + 1
                loss = last_loss
                print("kpis\teach_pass_duration_card%s\t%s" %
                      (gpu_num, total_time / epoch_idx))
                print("kpis\ttrain_loss_card%s\t%s" % (gpu_num, loss))
        except (StopIteration, fluid.core.EOFException):
            py_reader.reset()
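
TrainingStats above smooths the fetched metrics over a window of cfg.log_window iterations before logging. WindowedStat below is a hypothetical stand-in, not the actual implementation, showing the idea of a per-key sliding-window mean.

from collections import deque

import numpy as np

class WindowedStat(object):
    def __init__(self, window, keys):
        # keep the last `window` values for each metric key
        self.deques = {k: deque(maxlen=window) for k in keys}

    def update(self, stats):
        for k, v in stats.items():
            self.deques[k].append(v)

    def log(self):
        # report the running mean of every tracked metric
        return ', '.join('{}: {:.4f}'.format(k, np.mean(d))
                         for k, d in self.deques.items())
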
Example #5
def train(args):
    """Train model
    
    Args:
        args: all arguments.    
    """
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    train_out = build_program(
        is_train=True,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)
    train_data_loader = train_out[-1]
    if args.use_ema:
        train_fetch_vars = train_out[:-2]
        ema = train_out[-2]
    else:
        train_fetch_vars = train_out[:-1]

    train_fetch_list = [var.name for var in train_fetch_vars]

    if args.validate:
        test_prog = fluid.Program()
        test_out = build_program(
            is_train=False,
            main_prog=test_prog,
            startup_prog=startup_prog,
            args=args)
        test_data_loader = test_out[-1]
        test_fetch_vars = test_out[:-1]

        test_fetch_list = [var.name for var in test_fetch_vars]

        #Create test_prog and set layers' is_test params to True
        test_prog = test_prog.clone(for_test=True)

    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))

    # init model from checkpoint or pretrained model.
    init_model(exe, args, train_prog)
    num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
    if args.use_dali:
        import dali
        train_iter = dali.train(settings=args)
        if trainer_id == 0:
            test_iter = dali.val(settings=args)
    else:
        imagenet_reader = reader.ImageNetReader(0 if num_trainers > 1 else None)
        train_reader = imagenet_reader.train(settings=args)
        if args.use_gpu:
            if num_trainers <= 1:
                places = fluid.framework.cuda_places()
            else:
                places = place
        else:
            if num_trainers <= 1:
                places = fluid.framework.cpu_places()
            else:
                places = place

        train_data_loader.set_sample_list_generator(train_reader, places)

        if args.validate:
            test_reader = imagenet_reader.val(settings=args)
            test_data_loader.set_sample_list_generator(test_reader, places)

    compiled_train_prog = best_strategy_compiled(args, train_prog,
                                                 train_fetch_vars[0], exe)
    # NOTE: this is for benchmark
    total_batch_num = 0
    for pass_id in range(args.num_epochs):
        if num_trainers > 1 and not args.use_dali:
            imagenet_reader.set_shuffle_seed(pass_id + (
                args.random_seed if args.random_seed else 0))
        train_batch_id = 0
        train_batch_time_record = []
        train_batch_metrics_record = []

        if not args.use_dali:
            train_iter = train_data_loader()
            if args.validate:
                test_iter = test_data_loader()

        t1 = time.time()
        for batch in train_iter:
            #NOTE: this is for benchmark
            if args.max_iter and total_batch_num == args.max_iter:
                return
            train_batch_metrics = exe.run(compiled_train_prog,
                                          feed=batch,
                                          fetch_list=train_fetch_list)
            t2 = time.time()
            train_batch_elapse = t2 - t1
            train_batch_time_record.append(train_batch_elapse)

            train_batch_metrics_avg = np.mean(
                np.array(train_batch_metrics), axis=1)
            train_batch_metrics_record.append(train_batch_metrics_avg)
            if trainer_id == 0:
                print_info("batch", train_batch_metrics_avg, train_batch_elapse,
                           pass_id, train_batch_id, args.print_step)
                sys.stdout.flush()
            train_batch_id += 1
            t1 = time.time()
            # NOTE: this is for the benchmark profiler
            total_batch_num = total_batch_num + 1
            if args.is_profiler and pass_id == 0 and train_batch_id == args.print_step:
                profiler.start_profiler("All")
            elif args.is_profiler and pass_id == 0 and train_batch_id == args.print_step + 5:
                profiler.stop_profiler("total", args.profiler_path)
                return

        if args.use_dali:
            train_iter.reset()

        if trainer_id == 0 and args.validate:
            if args.use_ema:
                logger.info('ExponentialMovingAverage validate start...')
                with ema.apply(exe):
                    validate(args, test_iter, exe, test_prog, test_fetch_list,
                             pass_id, train_batch_metrics_record,
                             compiled_train_prog)
                logger.info('ExponentialMovingAverage validate over!')

            validate(args, test_iter, exe, test_prog, test_fetch_list, pass_id,
                     train_batch_metrics_record, train_batch_time_record,
                     compiled_train_prog)

            if args.use_dali:
                test_iter.reset()

        if pass_id % args.save_step == 0:
            save_model(args, exe, train_prog, pass_id)
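
The device-placement branches above reduce to one rule: a single-trainer job hands the DataLoader all local places so it can feed every device, while a multi-trainer job keeps one place per process. select_places below is an illustrative helper, not part of the original code.

import paddle.fluid as fluid

def select_places(use_gpu, num_trainers, place):
    if num_trainers <= 1:
        # single trainer: fan out over every local device
        return fluid.framework.cuda_places() if use_gpu else fluid.framework.cpu_places()
    # multiple trainers: each process keeps its own single place
    return place
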
Example #6
def train():

    # check whether use_gpu=True is set in a CPU-only build of PaddlePaddle
    check_gpu(cfg.use_gpu)

    if cfg.debug or args.enable_ce:
        fluid.default_startup_program().random_seed = 1000
        fluid.default_main_program().random_seed = 1000
        random.seed(0)
        np.random.seed(0)

    if not os.path.exists(cfg.model_save_dir):
        os.makedirs(cfg.model_save_dir)

    model = YOLOv3()
    model.build_model()
    input_size = cfg.input_size
    loss = model.loss()
    loss.persistable = True

    devices_num = get_device_num() if cfg.use_gpu else 1
    print("Found {} CUDA/CPU devices.".format(devices_num))

    learning_rate = cfg.learning_rate
    boundaries = cfg.lr_steps
    gamma = cfg.lr_gamma
    step_num = len(cfg.lr_steps)
    values = [learning_rate * (gamma**i) for i in range(step_num + 1)]

    optimizer = fluid.optimizer.Momentum(
        learning_rate=exponential_with_warmup_decay(
            learning_rate=learning_rate,
            boundaries=boundaries,
            values=values,
            warmup_iter=cfg.warm_up_iter,
            warmup_factor=cfg.warm_up_factor),
        regularization=fluid.regularizer.L2Decay(cfg.weight_decay),
        momentum=cfg.momentum)
    optimizer.minimize(loss)

    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if cfg.pretrain:
        if not os.path.exists(cfg.pretrain):
            print("Pretrain weights not found: {}".format(cfg.pretrain))

        def if_exist(var):
            return os.path.exists(os.path.join(cfg.pretrain, var.name))

        fluid.io.load_vars(exe, cfg.pretrain, predicate=if_exist)

    build_strategy = fluid.BuildStrategy()
    build_strategy.memory_optimize = False  #gc and memory optimize may conflict
    syncbn = cfg.syncbn
    if (syncbn and devices_num <= 1) or num_trainers > 1:
        print("Disable syncbn in single device")
        syncbn = False
    build_strategy.sync_batch_norm = syncbn

    exec_strategy = fluid.ExecutionStrategy()
    if cfg.use_gpu and num_trainers > 1:
        dist_utils.prepare_for_multi_process(exe, build_strategy,
                                             fluid.default_main_program())
        exec_strategy.num_threads = 1

    compile_program = fluid.compiler.CompiledProgram(
        fluid.default_main_program()).with_data_parallel(
            loss_name=loss.name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)

    random_sizes = [cfg.input_size]
    if cfg.random_shape:
        random_sizes = [32 * i for i in range(10, 20)]

    total_iter = cfg.max_iter - cfg.start_iter
    mixup_iter = total_iter - cfg.no_mixup_iter

    shuffle = True
    if args.enable_ce:
        shuffle = False
    shuffle_seed = None
    # NOTE: yolov3 is a special model; if num_trainers > 1, each process
    # trains on the complete dataset.
    # if num_trainers > 1: shuffle_seed  = 1
    train_reader = reader.train(
        input_size,
        batch_size=cfg.batch_size,
        shuffle=shuffle,
        shuffle_seed=shuffle_seed,
        total_iter=total_iter * devices_num,
        mixup_iter=mixup_iter * devices_num,
        random_sizes=random_sizes,
        use_multiprocess_reader=cfg.use_multiprocess_reader,
        num_workers=cfg.worker_num)
    py_reader = model.py_reader
    py_reader.decorate_paddle_reader(train_reader)

    def save_model(postfix):
        model_path = os.path.join(cfg.model_save_dir, postfix)
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)
        fluid.io.save_persistables(exe, model_path)

    fetch_list = [loss]

    py_reader.start()
    smoothed_loss = SmoothedValue()
    try:
        start_time = time.time()
        prev_start_time = start_time
        snapshot_loss = 0
        snapshot_time = 0
        for iter_id in range(cfg.start_iter, cfg.max_iter):
            prev_start_time = start_time
            start_time = time.time()
            losses = exe.run(compile_program,
                             fetch_list=[v.name for v in fetch_list])
            smoothed_loss.add_value(np.mean(np.array(losses[0])))
            snapshot_loss += np.mean(np.array(losses[0]))
            snapshot_time += start_time - prev_start_time
            lr = np.array(
                fluid.global_scope().find_var('learning_rate').get_tensor())
            print("Iter {:d}, lr {:.6f}, loss {:.6f}, time {:.5f}".format(
                iter_id, lr[0], smoothed_loss.get_mean_value(),
                start_time - prev_start_time))
            sys.stdout.flush()
            #add profiler tools
            if args.is_profiler and iter_id == 5:
                profiler.start_profiler("All")
            elif args.is_profiler and iter_id == 10:
                profiler.stop_profiler("total", args.profiler_path)
                return

            if (iter_id + 1) % cfg.snapshot_iter == 0:
                save_model("model_iter{}".format(iter_id))
                print("Snapshot {} saved, average loss: {}, \
                      average time: {}".format(
                    iter_id + 1, snapshot_loss / float(cfg.snapshot_iter),
                    snapshot_time / float(cfg.snapshot_iter)))
                if args.enable_ce and iter_id == cfg.max_iter - 1:
                    if devices_num == 1:
                        print("kpis\ttrain_cost_1card\t%f" %
                              (snapshot_loss / float(cfg.snapshot_iter)))
                        print("kpis\ttrain_duration_1card\t%f" %
                              (snapshot_time / float(cfg.snapshot_iter)))
                    else:
                        print("kpis\ttrain_cost_8card\t%f" %
                              (snapshot_loss / float(cfg.snapshot_iter)))
                        print("kpis\ttrain_duration_8card\t%f" %
                              (snapshot_time / float(cfg.snapshot_iter)))

                snapshot_loss = 0
                snapshot_time = 0
    except fluid.core.EOFException:
        py_reader.reset()

    save_model('model_final')
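
The piecewise learning-rate values above are derived from lr_steps and lr_gamma: one value per interval, each a factor of gamma smaller than the previous one. A small numeric illustration with assumed settings (not taken from the actual config):

learning_rate = 0.001
boundaries = [400000, 450000]   # cfg.lr_steps (assumed)
gamma = 0.1                     # cfg.lr_gamma (assumed)
values = [learning_rate * (gamma ** i) for i in range(len(boundaries) + 1)]
# values == [0.001, 0.0001, 1e-05], i.e.
#   iter <  400000           -> 0.001
#   400000 <= iter < 450000  -> 0.0001
#   iter >= 450000           -> 1e-05
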
Example #7
def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
                   batch_acc, args, train_prog, startup_prog, nccl_id_var,
                   num_trainers, trainer_id):
    feed_var_list = [
        var for var in train_prog.global_block().vars.itervalues()
        if var.is_data
    ]
    # generate fake data:
    if args.use_fake_data:
        for var in feed_var_list:
            v = startup_prog.global_block().clone_variable(var)
            var.persistable = True
            v.persistable = True

            real_shape = list(var.shape)
            real_shape[0] = args.batch_size / args.gpus
            startup_prog.global_block().append_op(
                outputs={"Out": v},
                type="fill_constant",
                attrs={"shape": real_shape,
                       "value": 1.0,
                       "dtype": var.dtype})

    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
    if nccl_id_var and trainer_id == 0:
        # FIXME(wuyi): wait for other trainers to start listening
        time.sleep(30)

    startup_exe = fluid.Executor(place)
    startup_exe.run(startup_prog)
    strategy = fluid.ExecutionStrategy()
    strategy.num_threads = 1
    strategy.allow_op_delay = False
    exe = fluid.ParallelExecutor(
        True,
        avg_loss.name,
        exec_strategy=strategy,
        num_trainers=num_trainers,
        trainer_id=trainer_id)

    feeder = fluid.DataFeeder(feed_var_list, place)
    for pass_id in range(args.pass_num):
        num_samples = 0
        iters = 0
        start_time = time.time()
        for batch_id, data in enumerate(train_reader()):
            if args.profile and pass_id == 0 and batch_id == 5:
                profiler.start_profiler("All")
            elif args.profile and pass_id == 0 and batch_id == 10:
                profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id)

            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
            if iters == args.iterations:
                break
            if args.use_fake_data:
                loss, = exe.run([avg_loss.name])
            else:
                loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
            if args.update_method == "pserver":
                exe.bcast_params()
            num_samples += len(data)
            iters += 1
            if batch_id % 1 == 0:
                print("Pass %d, batch %d, loss %s" %
                      (pass_id, batch_id, np.array(loss)))
        train_elapsed = time.time() - start_time
        examples_per_sec = num_samples / train_elapsed
        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
              (num_samples, train_elapsed, examples_per_sec))
        if not args.no_test and batch_acc != None:
            test_acc = test(startup_exe, infer_prog, test_reader, feeder,
                            batch_acc)
            print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc))
        exit(0)
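Example #8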
def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
                   batch_acc, batch_size_tensor, args, train_prog,
                   startup_prog, nccl_id_var, num_trainers, trainer_id):
    feed_var_list = [
        var for var in train_prog.global_block().vars.itervalues()
        if var.is_data
    ]
    # generate fake data:
    if args.use_fake_data:
        for var in feed_var_list:
            v = startup_prog.global_block().clone_variable(var)
            var.persistable = True
            v.persistable = True

            real_shape = list(var.shape)
            real_shape[0] = args.batch_size / args.gpus
            startup_prog.global_block().append_op(outputs={"Out": v},
                                                  type="fill_constant",
                                                  attrs={
                                                      "shape": real_shape,
                                                      "value": 1.0,
                                                      "dtype": var.dtype
                                                  })

    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
    if nccl_id_var and trainer_id == 0:
        # FIXME(wuyi): wait for other trainers to start listening
        time.sleep(30)

    startup_exe = fluid.Executor(place)
    startup_exe.run(startup_prog)
    strategy = fluid.ExecutionStrategy()
    strategy.num_threads = 1
    strategy.allow_op_delay = False
    exe = fluid.ParallelExecutor(True,
                                 avg_loss.name,
                                 exec_strategy=strategy,
                                 num_trainers=num_trainers,
                                 trainer_id=trainer_id)

    feeder = fluid.DataFeeder(feed_var_list, place)
    acc_4passes = None
    converge_speed = None
    accuracy_evaluator = fluid.metrics.Accuracy()
    fetch_list = [avg_loss.name]
    if batch_acc is not None:
        fetch_list.append(batch_acc.name)
    start_time = time.time()

    for pass_id in range(args.pass_num):
        num_samples = 0
        iters = 0
        pass_start_time = time.time()
        accuracy_evaluator.reset()
        for batch_id, data in enumerate(train_reader()):
            if args.profile and pass_id == 0 and batch_id == 5:
                profiler.start_profiler("All")
            elif args.profile and pass_id == 0 and batch_id == 10:
                profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id)

            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
            if iters == args.iterations:
                break
            if args.use_fake_data:
                outs = exe.run(fetch_list)
            else:
                outs = exe.run(fetch_list, feed=feeder.feed(data))

            if args.update_method == "pserver":
                exe.bcast_params()
            num_samples += len(data)
            iters += 1

            if batch_acc is not None:
                acc = np.mean(outs[1]).item()
                accuracy_evaluator.update(value=acc, weight=len(data))
            else:
                acc = None

            if batch_id % 1 == 0:
                print("Pass %d, batch %d, loss %s, acc %s" %
                      (pass_id, batch_id, np.mean(outs[0]), str(acc)))
            if converge_speed is None and args.acc_target and acc >= args.acc_target:
                converge_speed = time.time() - start_time
                print("converge_speed set with %f" % converge_speed)

        pass_elapsed = time.time() - pass_start_time
        examples_per_sec = num_samples / pass_elapsed
        if batch_acc is not None:
            pass_train_acc = accuracy_evaluator.eval()
        else:
            pass_train_acc = None

        if pass_id == 4 and batch_acc is not None:
            print("acc_4passes set with %f" % pass_train_acc)
            acc_4passes = float(pass_train_acc)

        output_metric_data(pass_id, examples_per_sec, pass_train_acc,
                           acc_4passes, converge_speed)

        if not args.no_test and batch_acc != None:
            test_acc = test(startup_exe, infer_prog, test_reader, feeder,
                            batch_acc)
            print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc))
        exit(0)
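
Both train_parallel variants exclude the first skip_batch_num batches from the timing so that warm-up does not distort the reported examples/sec. Below is a minimal sketch of that accounting; run_batch and batch_sizes are placeholders for the real step and the per-batch sample counts.

import time

def measure_throughput(run_batch, batch_sizes, skip_batch_num):
    num_samples = 0
    start_time = time.time()
    for i, batch_size in enumerate(batch_sizes):
        if i == skip_batch_num:
            # restart the clock once warm-up is done
            start_time = time.time()
            num_samples = 0
        run_batch()
        num_samples += batch_size
    elapsed = time.time() - start_time
    return num_samples / elapsed
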
Example #9
def train(cfg):
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    if args.enable_ce:
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000
    drop_last = True

    dataset = SegDataset(
        file_list=cfg.DATASET.TRAIN_FILE_LIST,
        mode=ModelPhase.TRAIN,
        shuffle=True,
        data_dir=cfg.DATASET.DATA_DIR)

    def data_generator():
        if args.use_mpio:
            data_gen = dataset.multiprocess_generator(
                num_processes=cfg.DATALOADER.NUM_WORKERS,
                max_queue_size=cfg.DATALOADER.BUF_SIZE)
        else:
            data_gen = dataset.generator()

        batch_data = []
        for b in data_gen:
            batch_data.append(b)
            if len(batch_data) == (cfg.BATCH_SIZE // cfg.NUM_TRAINERS):
                for item in batch_data:
                    yield item[0], item[1], item[2]
                batch_data = []
        # If using the sync batch norm strategy, drop the last batch if the number
        # of samples in batch_data is less than cfg.BATCH_SIZE, to avoid NCCL hangs
        if not cfg.TRAIN.SYNC_BATCH_NORM:
            for item in batch_data:
                yield item[0], item[1], item[2]

    # Get device environment
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()

    # Get number of GPUs
    dev_count = cfg.NUM_TRAINERS if cfg.NUM_TRAINERS > 1 else len(places)
    print_info("#Device count: {}".format(dev_count))

    # Make sure BATCH_SIZE is divisible by the number of GPU cards
    assert cfg.BATCH_SIZE % dev_count == 0, (
        'BATCH_SIZE:{} not divisible by number of GPUs:{}'.format(
            cfg.BATCH_SIZE, dev_count))
    # In multi-GPU training mode, batch data is allocated evenly to each GPU
    batch_size_per_dev = cfg.BATCH_SIZE // dev_count
    print_info("batch_size_per_dev: {}".format(batch_size_per_dev))

    data_loader, avg_loss, lr, pred, grts, masks = build_model(
        train_prog, startup_prog, phase=ModelPhase.TRAIN)
    build_model(test_prog, fluid.Program(), phase=ModelPhase.EVAL)
    data_loader.set_sample_generator(
        data_generator, batch_size=batch_size_per_dev, drop_last=drop_last)

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    exec_strategy = fluid.ExecutionStrategy()
    # Clear temporary variables every 100 iterations
    if args.use_gpu:
        exec_strategy.num_threads = fluid.core.get_cuda_device_count()
    exec_strategy.num_iteration_per_drop_scope = 100
    build_strategy = fluid.BuildStrategy()

    if cfg.NUM_TRAINERS > 1 and args.use_gpu:
        dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog)
        exec_strategy.num_threads = 1

    if cfg.TRAIN.SYNC_BATCH_NORM and args.use_gpu:
        if dev_count > 1:
            # Apply sync batch norm strategy
            print_info("Sync BatchNorm strategy is effective.")
            build_strategy.sync_batch_norm = True
        else:
            print_info(
                "Sync BatchNorm strategy will not be effective if GPU device"
                " count <= 1")
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=avg_loss.name,
        exec_strategy=exec_strategy,
        build_strategy=build_strategy)

    # Resume training
    begin_epoch = cfg.SOLVER.BEGIN_EPOCH
    if cfg.TRAIN.RESUME_MODEL_DIR:
        begin_epoch = load_checkpoint(exe, train_prog)
    # Load pretrained model
    elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL_DIR):
        load_pretrained_weights(exe, train_prog, cfg.TRAIN.PRETRAINED_MODEL_DIR)
    else:
        print_info(
            'Pretrained model dir {} does not exist, training from scratch...'.
            format(cfg.TRAIN.PRETRAINED_MODEL_DIR))

    fetch_list = [avg_loss.name, lr.name]
    if args.debug:
        # Fetch more variable info and use streaming confusion matrix to
        # calculate IoU results if in debug mode
        np.set_printoptions(
            precision=4, suppress=True, linewidth=160, floatmode="fixed")
        fetch_list.extend([pred.name, grts.name, masks.name])
        cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True)

    if args.use_vdl:
        if not args.vdl_log_dir:
            print_info("Please specify the log directory by --vdl_log_dir.")
            exit(1)

        from visualdl import LogWriter
        log_writer = LogWriter(args.vdl_log_dir)

    # trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))
    # num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
    step = 0
    all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE
    if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last != True:
        all_step += 1
    all_step *= (cfg.SOLVER.NUM_EPOCHS - begin_epoch + 1)

    avg_loss = 0.0
    best_mIoU = 0.0

    timer = Timer()
    timer.start()
    if begin_epoch > cfg.SOLVER.NUM_EPOCHS:
        raise ValueError(
            ("begin epoch[{}] is larger than cfg.SOLVER.NUM_EPOCHS[{}]").format(
                begin_epoch, cfg.SOLVER.NUM_EPOCHS))

    if args.use_mpio:
        print_info("Use multiprocess reader")
    else:
        print_info("Use multi-thread reader")

    for epoch in range(begin_epoch, cfg.SOLVER.NUM_EPOCHS + 1):
        data_loader.start()
        while True:
            try:
                if args.debug:
                    # Print category IoU and accuracy to check whether the
                    # training process matches expectations
                    loss, lr, pred, grts, masks = exe.run(
                        program=compiled_train_prog,
                        fetch_list=fetch_list,
                        return_numpy=True)
                    cm.calculate(pred, grts, masks)
                    avg_loss += np.mean(np.array(loss))
                    step += 1

                    if step % args.log_steps == 0:
                        speed = args.log_steps / timer.elapsed_time()
                        avg_loss /= args.log_steps
                        category_acc, mean_acc = cm.accuracy()
                        category_iou, mean_iou = cm.mean_iou()

                        print_info((
                            "epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, step, lr[0], avg_loss, mean_acc,
                                 mean_iou, speed,
                                 calculate_eta(all_step - step, speed)))
                        print_info("Category IoU: ", category_iou)
                        print_info("Category Acc: ", category_acc)
                        if args.use_vdl:
                            log_writer.add_scalar('Train/mean_iou', mean_iou,
                                                  step)
                            log_writer.add_scalar('Train/mean_acc', mean_acc,
                                                  step)
                            log_writer.add_scalar('Train/loss', avg_loss, step)
                            log_writer.add_scalar('Train/lr', lr[0], step)
                            log_writer.add_scalar('Train/step/sec', speed, step)
                        sys.stdout.flush()
                        avg_loss = 0.0
                        cm.zero_matrix()
                        timer.restart()
                else:
                    # If not in debug mode, avoid unnecessary logging and calculation
                    loss, lr = exe.run(
                        program=compiled_train_prog,
                        fetch_list=fetch_list,
                        return_numpy=True)
                    avg_loss += np.mean(np.array(loss))
                    step += 1

                    if step % args.log_steps == 0 and cfg.TRAINER_ID == 0:
                        avg_loss /= args.log_steps
                        speed = args.log_steps / timer.elapsed_time()
                        print((
                            "epoch={} step={} lr={:.5f} loss={:.4f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, step, lr[0], avg_loss, speed,
                                 calculate_eta(all_step - step, speed)))
                        if args.use_vdl:
                            log_writer.add_scalar('Train/loss', avg_loss, step)
                            log_writer.add_scalar('Train/lr', lr[0], step)
                            log_writer.add_scalar('Train/speed', speed, step)
                        sys.stdout.flush()
                        avg_loss = 0.0
                        timer.restart()

                    # NOTE : used for benchmark, profiler tools
                    if args.is_profiler and epoch == 1 and step == args.log_steps:
                        profiler.start_profiler("All")
                    elif args.is_profiler and epoch == 1 and step == args.log_steps + 5:
                        profiler.stop_profiler("total", args.profiler_path)
                        return

            except fluid.core.EOFException:
                data_loader.reset()
                break
            except Exception as e:
                print(e)

        if (epoch % cfg.TRAIN.SNAPSHOT_EPOCH == 0
                or epoch == cfg.SOLVER.NUM_EPOCHS) and cfg.TRAINER_ID == 0:
            ckpt_dir = save_checkpoint(train_prog, epoch)
            save_infer_program(test_prog, ckpt_dir)

            if args.do_eval:
                print("Evaluation start")
                _, mean_iou, _, mean_acc = evaluate(
                    cfg=cfg,
                    ckpt_dir=ckpt_dir,
                    use_gpu=args.use_gpu,
                    use_mpio=args.use_mpio)
                if args.use_vdl:
                    log_writer.add_scalar('Evaluate/mean_iou', mean_iou, step)
                    log_writer.add_scalar('Evaluate/mean_acc', mean_acc, step)

                if mean_iou > best_mIoU:
                    best_mIoU = mean_iou
                    update_best_model(ckpt_dir)
                    print_info("Save best model {} to {}, mIoU = {:.4f}".format(
                        ckpt_dir,
                        os.path.join(cfg.TRAIN.MODEL_SAVE_DIR, 'best_model'),
                        mean_iou))

            # Use VisualDL to visualize results
            if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None:
                visualize(
                    cfg=cfg,
                    use_gpu=args.use_gpu,
                    vis_file_list=cfg.DATASET.VIS_FILE_LIST,
                    vis_dir="visual",
                    ckpt_dir=ckpt_dir,
                    log_writer=log_writer)

    # save final model
    if cfg.TRAINER_ID == 0:
        ckpt_dir = save_checkpoint(train_prog, 'final')
        save_infer_program(test_prog, ckpt_dir)
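
The log lines above report an ETA derived from the remaining steps and the measured step/sec. calculate_eta is an external helper; the version below is a hypothetical implementation consistent with how it is called here.

def calculate_eta(remaining_steps, speed):
    # `speed` is steps per second; return a human-readable H:MM:SS string
    remaining_time = int(remaining_steps / max(speed, 1e-12))
    hours = remaining_time // 3600
    minutes = (remaining_time % 3600) // 60
    seconds = remaining_time % 60
    return "{}:{:02d}:{:02d}".format(hours, minutes, seconds)
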
Example #10
def train(args):
    # parameters from arguments
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    with_memory_optimization = args.with_mem_opt
    model_save_dir = args.model_save_dir

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()

    if args.enable_ce:
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000

    #train_py_reader, train_cost, train_acc1, train_acc5 = build_program(
    infer_prog, train_out, train_cost, train_acc1, train_acc5 = build_program(
        is_train=True,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)
    #test_py_reader, test_cost, test_acc1, test_acc5 = build_program(
    test_cost, test_acc1, test_acc5 = build_program(
        is_train=False,
        main_prog=test_prog,
        startup_prog=startup_prog,
        args=args)
    test_prog = test_prog.clone(for_test=True)

    if with_memory_optimization:
        fluid.memory_optimize(train_prog)
        fluid.memory_optimize(test_prog)

    """
    print("-------------------------------------")
    for block in train_prog.blocks:
        for op in block.ops:
            print("op_train: ", op.type)
    print("-------------------------------------")
    for block in test_prog.blocks:
        for op in block.ops:
            print("op_infer: ", op.type)
    exit()
    """

    #place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    #place = fluid.XSIMPlace()
    #place = fluid.XCPUPlace()
    if args.place == "cuda":
        place = fluid.CUDAPlace(0)
    elif args.place == "xsim":
        place = fluid.XSIMPlace()
    elif args.place == "xpu":
        place = fluid.XPUPlace()
    else:
        print("Unsupported place!")
        exit()

    exe = fluid.Executor(place)

    print("Run startup...")
    exe.run(startup_prog)

    train_fetch_list = [train_cost.name, train_acc1.name, train_acc5.name]

    if (args.run_mode == "train"):
        prog = train_prog
    elif (args.run_mode == "infer"):
        prog = test_prog
    elif (args.run_mode == "fused_infer"):
        print("Transpiling...")
        inference_transpiler_program = test_prog.clone()
        t = fluid.transpiler.InferenceXPUTranspiler()
        config = {
                "use_fake_max": True,
                "conv_weight_type": args.precision,
                "fc_weight_type": args.precision,
                "fc_pretrans_a": False,
                "fc_pretrans_b": True,
                "batch_size": args.batch_size
                }
        t.transpile_xpu(inference_transpiler_program, place, config)
        prog = inference_transpiler_program
    else:
        print("bad run_mode: ", args.run_mode)
        exit()


    print("Running...")
    img_data = np.random.random([args.batch_size, 3, 224, 224]).astype('float32')
    y_data = np.random.random([args.batch_size, 1]).astype('int64')

    if args.place == "cuda":
        # warm up
        loss, acc1, acc5 = exe.run(prog,
                feed={"data": img_data, "label": y_data},
                fetch_list=train_fetch_list)

        profiler.start_profiler("All")

    loss, acc1, acc5 = exe.run(prog,
            feed={"data": img_data, "label": y_data},
            fetch_list=train_fetch_list)

    if args.place == "cuda":
        profiler.stop_profiler("total", "/tmp/profile")
Example #11
def do_train(args):
    # Initialize the paddle and paddle fleet execution environment
    paddle.enable_static()
    place = paddle.set_device(args.select_device)
    fleet.init(is_collective=True)
    # paddle.distributed.init_parallel_env()

    worker_num = fleet.worker_num()
    worker_index = fleet.worker_index()

    # Create the random seed for the worker
    set_seed(args.seed)
    # worker_init = WorkerInitObj(args.seed + worker_index)
    worker_init = WorkerInitObj(args.seed)
    tracker = get_rng_state_tracker()
    tracker.add('global_seed', args.seed)
    tracker.add('local_seed', args.seed + worker_index + 2021)

    # Define the input data in the static mode
    main_program = paddle.static.default_main_program()
    startup_program = paddle.static.default_startup_program()
    data_holders = create_data_holder(args)

    [
        input_ids, segment_ids, input_mask, masked_lm_positions,
        masked_lm_labels, next_sentence_labels, masked_lm_scale
    ] = data_holders

    # Define the model structure in static mode
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    config = model_class.pretrained_init_configuration[args.model_name_or_path]
    if config["vocab_size"] % 8 != 0:
        config["vocab_size"] += 8 - (config["vocab_size"] % 8)
    config['num_partitions'] = args.num_partitions
    model = BertForPretraining(BertModel(**config), args.num_partitions)
    criterion = BertPretrainingCriterion(model.bert.config["vocab_size"])
    prediction_scores, seq_relationship_score = model(
        input_ids=input_ids,
        token_type_ids=segment_ids,
        attention_mask=input_mask,
        masked_positions=masked_lm_positions)
    loss = criterion(prediction_scores, seq_relationship_score,
                     masked_lm_labels, next_sentence_labels, masked_lm_scale)

    # Define the dynamic learning_rate scheduler and optimizer
    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
        args.learning_rate,
        lambda current_step, num_warmup_steps=args.warmup_steps,
        num_training_steps=args.max_steps if args.max_steps > 0 else
        (len(train_data_loader) * args.num_train_epochs): float(
            current_step) / float(max(1, num_warmup_steps))
        if current_step < num_warmup_steps else max(
            0.0,
            float(num_training_steps - current_step) / float(
                max(1, num_training_steps - num_warmup_steps))))

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])
    # if worker_num == 1 and args.use_amp:
    #     amp_list = paddle.fluid.contrib.mixed_precision.AutoMixedPrecisionLists(
    #         custom_white_list=['softmax', 'layer_norm', 'gelu'])
    #     optimizer = paddle.fluid.contrib.mixed_precision.decorate(
    #         optimizer,
    #         amp_list,
    #         init_loss_scaling=args.scale_loss,
    #         use_dynamic_loss_scaling=True)

    if fleet.worker_num() > 1:
        # Use the fleet api to compile the distributed optimizer
        optimizer = dist_optimizer(args, optimizer)
    optimizer.minimize(loss)

    # Define the Executor for running the static model
    exe = paddle.static.Executor(place)
    exe.run(startup_program)
    # state_dict = model.state_dict()

    # Use the state dict to update the parameter
    # reset_state_dict = reset_program_state_dict(model, state_dict)
    # paddle.static.set_program_state(main_program, reset_state_dict)

    # if worker_num == 1:
    #     # Construct the compiled program
    #     main_program = build_compiled_program(main_program, loss)
    main_program._graph = None

    if fleet.worker_index() == 0:
        with open('startup_%d' % fleet.worker_num(), 'w') as f:
            f.writelines(str(startup_program))
        with open('main_%d' % fleet.worker_num(), 'w') as f:
            f.writelines(str(main_program))
    pool = ThreadPoolExecutor(1)
    global_step = 0
    tic_train = time.time()
    epoch = 0
    while True:
        files = [
            os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
            if os.path.isfile(os.path.join(args.input_dir, f))
            and "training" in f
        ]
        files.sort()
        num_files = len(files)
        random.Random(args.seed + epoch).shuffle(files)
        f_start_id = 0

        # Select one file for each worker and create the DataLoader for the file
        data_file = select_dataset_file_for_each_worker(
            files, f_start_id, 1, 0)
        #files, f_start_id, worker_num, worker_index)
        train_data_loader, _ = create_pretraining_dataset(
            data_file, args.max_predictions_per_seq, args, data_holders,
            worker_init, paddle.static.cuda_places())

        for f_id in range(f_start_id + 1, len(files)):
            data_file = select_dataset_file_for_each_worker(files, f_id, 1, 0)
            # files, f_id, worker_num, worker_index)
            dataset_future = pool.submit(create_pretraining_dataset, data_file,
                                         args.max_predictions_per_seq, args,
                                         data_holders, worker_init,
                                         paddle.static.cuda_places())

            for step, batch in enumerate(train_data_loader):
                global_step += 1
                if step == 10 and worker_index == 0:
                    profiler.start_profiler("All")
                if step == 20 and worker_index == 0:
                    profiler.stop_profiler("total", "/tmp/profile")

                loss_return = exe.run(main_program,
                                      feed=batch,
                                      fetch_list=[loss])
                # In the new 2.0 API, this function must be called to update the learning_rate
                lr_scheduler.step()
                if global_step % args.logging_steps == 0:
                    time_cost = time.time() - tic_train
                    print(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s, ips: %.2f sequences/s"
                        % (global_step, epoch, step, loss_return[0],
                           args.logging_steps / time_cost,
                           args.logging_steps * args.batch_size / time_cost))
                    tic_train = time.time()
                if global_step % args.save_steps == 0:
                    if worker_index == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # TODO(fangzeyang): Update the save_params to paddle.static
                        paddle.fluid.io.save_params(exe, output_dir)
                        tokenizer.save_pretrained(output_dir)
                if global_step >= args.max_steps:
                    del train_data_loader
                    return
            del train_data_loader
            train_data_loader, data_file = dataset_future.result(timeout=None)
        epoch += 1
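
The LambdaDecay above encodes a linear warmup followed by a linear decay to zero at num_training_steps. The same schedule written as a plain function, plus the scheduler construction with assumed step counts and base learning rate:

import paddle

def warmup_then_linear_decay(step, num_warmup_steps, num_training_steps):
    # ramp from 0 to 1 over the warmup steps, then decay linearly to 0
    if step < num_warmup_steps:
        return float(step) / float(max(1, num_warmup_steps))
    return max(0.0, float(num_training_steps - step) /
               float(max(1, num_training_steps - num_warmup_steps)))

# assumed values, for illustration only
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
    1e-4, lambda step: warmup_then_linear_decay(step, 1000, 10000))
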
Example #12
def train(args):
    # parameters from arguments
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    model_save_dir = args.model_save_dir
    use_mixup = args.use_mixup
    use_ngraph = os.getenv('FLAGS_use_ngraph')

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = args.num_threads
    exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

    dist_strategy = DistributedStrategy()
    dist_strategy.exec_strategy = exec_strategy
    dist_strategy.enable_inplace = args.with_inplace
    if not args.fuse:
        dist_strategy.fuse_all_reduce_ops = False
    dist_strategy.nccl_comm_num = args.nccl_comm_num
    dist_strategy.fuse_elewise_add_act_ops=args.fuse_elewise_add_act_ops

    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    b_out = build_program(
                     is_train=True,
                     main_prog=train_prog,
                     startup_prog=startup_prog,
                     args=args,
                     dist_strategy=dist_strategy,
                     data_layout=args.data_format)
    if use_mixup:
        train_data_loader, train_cost, global_lr = b_out[0], b_out[1], b_out[2]
        train_fetch_vars = [train_cost, global_lr]
        train_fetch_list = []
        for var in train_fetch_vars:
            var.persistable=True
            train_fetch_list.append(var.name)

    else:
        train_data_loader, train_cost, train_acc1, train_acc5, global_lr = b_out[0],b_out[1],b_out[2],b_out[3],b_out[4]
        train_fetch_vars = [train_cost, train_acc1, train_acc5, global_lr]
        train_fetch_list = []
        for var in train_fetch_vars:
            var.persistable=True
            train_fetch_list.append(var.name)

    train_prog = fleet.main_program

    b_out_test = build_program(
                     is_train=False,
                     main_prog=test_prog,
                     startup_prog=startup_prog,
                     args=args,
                     dist_strategy=dist_strategy,
                     data_layout=args.data_format)
    test_data_loader, test_cost, test_acc1, test_acc5 = b_out_test[0],b_out_test[1],b_out_test[2],b_out_test[3]

    test_prog = test_prog.clone(for_test=True)
    test_prog = compiler.CompiledProgram(test_prog).with_data_parallel(loss_name=test_cost.name, exec_strategy=exec_strategy)

    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if checkpoint is not None:
        fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)

    if pretrained_model:
        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(
            exe, pretrained_model, main_program=train_prog, predicate=if_exist)

    if args.use_gpu:
        device_num = get_device_num()
    else:
        device_num = 1

    train_batch_size = args.batch_size
    print("train_batch_size: %d device_num:%d" % (train_batch_size, device_num))

    test_batch_size = args.batch_size
    # NOTE: the order of batch data generated by batch_reader
    # must be the same in the respective processes.
    shuffle_seed = 1 if num_trainers > 1 else None

    if args.use_dali:
        import dali
        train_iter = dali.train(settings=args, trainer_id=trainer_id, trainers_num=num_trainers,
                                gpu_id=gpu_id, data_layout=args.data_format)
    else:
        train_reader = reader.train(settings=args, data_dir=args.data_dir,
                                    pass_id_as_seed=shuffle_seed, data_layout=args.data_format, threads=10)
        train_batch_reader=paddle.batch(train_reader, batch_size=train_batch_size)

        test_reader = reader.val(settings=args, data_dir=args.data_dir, data_layout=args.data_format, threads=10)
        test_batch_reader=paddle.batch(test_reader, batch_size=test_batch_size)

        places = place
        if num_trainers <= 1 and args.use_gpu:
            places = fluid.framework.cuda_places()

        train_data_loader.set_sample_list_generator(train_batch_reader, places)
        test_data_loader.set_sample_list_generator(test_batch_reader, place)

    test_fetch_vars = [test_cost, test_acc1, test_acc5]
    test_fetch_list = []
    for var in test_fetch_vars:
        var.persistable=True
        test_fetch_list.append(var.name)

    train_exe = exe

    params = models.__dict__[args.model]().params

    train_speed_list = []
    acc1_logs = []
    acc5_logs = []
    for pass_id in range(params["num_epochs"]):
        train_info = [[], [], []]
        test_info = [[], [], []]
        train_begin=time.time()
        batch_id = 0
        time_record=[]

        if not args.use_dali:
            train_iter = train_data_loader()

        for data in train_iter:
            t1 = time.time()

            if batch_id % args.fetch_steps != 0:
                train_exe.run(train_prog, feed=data)
            else:
                if use_mixup:
                    loss, lr = train_exe.run(train_prog, feed=data, fetch_list=train_fetch_list)
                else:
                    loss, acc1, acc5, lr = train_exe.run(train_prog,  feed=data,  fetch_list=train_fetch_list)
                    acc1 = np.mean(np.array(acc1))
                    acc5 = np.mean(np.array(acc5))
                    train_info[1].append(acc1)
                    train_info[2].append(acc5)

            t2 = time.time()
            period = t2 - t1
            time_record.append(period)

            if args.profile and batch_id == 100:
                print("begin profiler")
                if trainer_id == 0:
                    profiler.start_profiler("All")
            elif args.profile and batch_id == 105:
                print("begin to end profiler")
                if trainer_id == 0:
                    profiler.stop_profiler("total", "./profile_pass_%d" % (pass_id))
                print("end profiler break!")
                args.profile=False

            if batch_id % args.fetch_steps == 0:
                loss = np.mean(np.array(loss))
                train_info[0].append(loss)
                lr = np.mean(np.array(lr))
                period = np.mean(time_record)
                speed = args.batch_size * 1.0 / period
                time_record=[]
                if use_mixup:
                    print("Pass {0}, trainbatch {1}, loss {2}, lr {3}, time {4}, speed {5}"
                          .format(pass_id, batch_id, "%.5f"%loss, "%.5f" %lr, "%2.4f sec" % period, "%.2f" % speed))
                else:
                    print("Pass {0}, trainbatch {1}, loss {2}, \
                        acc1 {3}, acc5 {4}, lr {5}, time {6}, speed {7}"
                          .format(pass_id, batch_id, "%.5f"%loss, "%.5f"%acc1, "%.5f"%acc5, "%.5f" %
                                  lr, "%2.4f sec" % period, "%.2f" % speed))
                sys.stdout.flush()
            batch_id += 1

        if args.use_dali:
            train_iter.reset()

        train_loss = np.array(train_info[0]).mean()
        if not use_mixup:
            train_acc1 = np.array(train_info[1]).mean()
            train_acc5 = np.array(train_info[2]).mean()
        train_end=time.time()
        train_speed = (batch_id * train_batch_size) / (train_end - train_begin)
        train_speed_list.append(train_speed)

        if trainer_id == 0 and (args.do_test or (pass_id + 1) == params["num_epochs"]):
            if args.use_dali:
                test_iter = dali.val(settings=args, trainer_id=trainer_id, trainers_num=num_trainers,
                                 gpu_id=gpu_id, data_layout=args.data_format)
            else:
                test_iter = test_data_loader()

            test_batch_id = 0
            for data in test_iter:
                t1 = time.time()
                loss, acc1, acc5 = exe.run(program=test_prog,
                                           feed=data,
                                           fetch_list=test_fetch_list)
                t2 = time.time()
                period = t2 - t1
                loss = np.mean(loss)
                acc1 = np.mean(acc1)
                acc5 = np.mean(acc5)
                test_info[0].append(loss)
                test_info[1].append(acc1)
                test_info[2].append(acc5)

                if test_batch_id % 10 == 0:
                    test_speed = test_batch_size * 1.0 / period
                    print("Pass {0}, testbatch {1}, loss {2}, "
                          "acc1 {3}, acc5 {4}, time {5}, speed {6}"
                          .format(pass_id, test_batch_id, "%.5f" % loss, "%.5f" % acc1, "%.5f" % acc5,
                                  "%2.2f sec" % period, "%.2f" % test_speed))
                    sys.stdout.flush()
                test_batch_id += 1

            if args.use_dali:
                test_iter.reset()
                del test_iter

            test_loss = np.array(test_info[0]).mean()
            test_acc1 = np.array(test_info[1]).mean()
            test_acc5 = np.array(test_info[2]).mean()

            acc1_logs.append(test_acc1)
            acc5_logs.append(test_acc5)

            if use_mixup:
                print("End pass {0}, train_loss {1}, test_loss {2}, test_acc1 {3}, test_acc5 {4}, speed {5}".format(
                      pass_id, "%.5f"%train_loss, "%.5f"%test_loss, "%.5f"%test_acc1, "%.5f"%test_acc5,
                      "%.2f" % train_speed))
            else:
                print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, "
                  "test_loss {4}, test_acc1 {5}, test_acc5 {6}, speed {7}".format(
                      pass_id, "%.5f"%train_loss, "%.5f"%train_acc1, "%.5f"%train_acc5, "%.5f"%test_loss,
                      "%.5f"%test_acc1, "%.5f"%test_acc5, "%.2f" % train_speed))
        else:
            if use_mixup:
                print("End pass {0}, train_loss {1}, speed {2}".format(pass_id, "%.5f"%train_loss, "%.2f" % train_speed))
            else:
                print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, ""speed {4}".format(
                    pass_id, "%.5f"%train_loss, "%.5f"%train_acc1, "%.5f"%train_acc5, "%.2f" % train_speed))

        sys.stdout.flush()

    # save in last epoch
    if trainer_id == 0:
        model_path = os.path.join(model_save_dir, model_name, str(pass_id))
        if not os.path.isdir(model_path):
            os.makedirs(model_path)

        fluid.io.save_persistables(exe, model_path, main_program=fleet._origin_program)
        if args.benchmark_test:
            if not os.path.isdir("./benchmark_logs/"):
                os.makedirs("./benchmark_logs/")
            with open("./benchmark_logs/log_%d" % trainer_id, 'w') as f:
                result = dict()
                result['0'] = dict()
                result['0']['acc1'] = test_acc1
                result['0']['acc5'] = test_acc5
                result['0']['result_log'] = dict()
                result['0']['result_log']['acc1'] = acc1_logs
                result['0']['result_log']['acc5'] = acc5_logs
                # maximum speed of all epochs
                result['1'] = max(train_speed_list) * num_trainers
                result['14'] = args.batch_size

                print(str(result))
                f.write(str(result))
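
The benchmark block above writes the Python dict repr of `result` to ./benchmark_logs/log_<trainer_id>. A minimal sketch (not part of the original code, and assuming the file holds exactly one such dict literal) for reading the log back with the standard library:

import ast

def load_benchmark_log(path):
    # the file contains a single Python dict literal produced by str(result)
    with open(path) as f:
        return ast.literal_eval(f.read())

# e.g. result = load_benchmark_log("./benchmark_logs/log_0")
# result['0']['acc1'], result['1'] (max speed * num_trainers), result['14'] (batch size)
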
def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
                   batch_acc, args, train_prog, startup_prog, nccl_id_var,
                   num_trainers, trainer_id):
    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
    if not args.use_reader_op:
        feed_var_list = [
            var for var in train_prog.global_block().vars.values()
            if var.is_data
        ]
        feeder = fluid.DataFeeder(feed_var_list, place)

    # generate fake:
    if args.use_fake_data:
        for var in feed_var_list:
            v = startup_prog.global_block()._clone_variable(var)
            var.persistable = True
            v.persistable = True

            real_shape = list(var.shape)
            real_shape[0] = args.batch_size // args.gpus
            startup_prog.global_block().append_op(outputs={"Out": v},
                                                  type="fill_constant",
                                                  attrs={
                                                      "shape": real_shape,
                                                      "value": 1.0,
                                                      "dtype": var.dtype
                                                  })

    if nccl_id_var and trainer_id == 0:
        #FIXME(wuyi): wait other trainer to start listening
        time.sleep(30)

    startup_exe = fluid.Executor(place)
    startup_exe.run(startup_prog)
    strategy = fluid.ExecutionStrategy()
    strategy.num_threads = 1
    strategy.allow_op_delay = False
    exe = fluid.ParallelExecutor(True,
                                 avg_loss.name,
                                 exec_strategy=strategy,
                                 num_trainers=num_trainers,
                                 trainer_id=trainer_id)

    for pass_id in range(args.pass_num):
        num_samples = 0
        iters = 0
        start_time = time.time()
        if not args.use_reader_op:
            reader_generator = train_reader()
        batch_id = 0
        data = None
        while True:
            if not args.use_reader_op:
                data = next(reader_generator, None)
                if data is None:
                    break
            if iters == args.iterations:
                break
            if args.profile and pass_id == 0 and batch_id == 5:
                profiler.start_profiler("All")
            elif args.profile and pass_id == 0 and batch_id == 10:
                profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id)

            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
            if args.use_fake_data or args.use_reader_op:
                try:
                    loss, = exe.run([avg_loss.name])
                except fluid.core.EnforceNotMet as ex:
                    break
            else:
                loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
            if args.use_reader_op:
                num_samples += args.batch_size * args.gpus
            else:
                num_samples += len(data)
            iters += 1
            if batch_id % 1 == 0:
                print("Pass %d, batch %d, loss %s" %
                      (pass_id, batch_id, np.array(loss)))
            batch_id += 1

        print_train_time(start_time, time.time(), num_samples)
        print("current activate thread num: ", threading.active_count())
        if not args.no_test and batch_acc and not args.use_reader_op:
            # we have not implemented recordio reading for test,
            # so skip test when args.use_reader_op is used
            test_acc = test(startup_exe, infer_prog, test_reader, feeder,
                            batch_acc)
            print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc))
            print_test_acc(pass_id, test_acc)
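
print_train_time() and print_test_acc() are called above but defined elsewhere; a minimal sketch (an assumption, inferred only from the call sites) might look like:

def print_train_time(start_time, end_time, num_samples):
    # report throughput for the timed window of the pass
    elapsed = end_time - start_time
    examples_per_sec = num_samples / elapsed if elapsed > 0 else 0.0
    print("Total examples: %d, total time: %.5f s, %.5f examples/sec" %
          (num_samples, elapsed, examples_per_sec))

def print_test_acc(pass_id, test_acc):
    print("Test pass: %d, acc: %f" % (pass_id, test_acc))
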
Ejemplo n.º 14
0
def train_parallel(train_args, test_args, args, train_prog, test_prog,
                   startup_prog, nccl_id_var, num_trainers, trainer_id):
    over_all_start = time.time()
    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
    feeder = None
    if not args.use_reader_op:
        feed_var_list = [
            var for var in train_prog.global_block().vars.values()
            if var.is_data
        ]
        feeder = fluid.DataFeeder(feed_var_list, place)
    # generate fake:
    if args.use_fake_data:
        for var in feed_var_list:
            v = startup_prog.global_block()._clone_variable(var)
            var.persistable = True
            v.persistable = True

            real_shape = list(var.shape)
            real_shape[0] = args.batch_size // args.gpus
            startup_prog.global_block().append_op(outputs={"Out": v},
                                                  type="fill_constant",
                                                  attrs={
                                                      "shape": real_shape,
                                                      "value": 1.0,
                                                      "dtype": var.dtype
                                                  })

    if nccl_id_var and trainer_id == 0:
        #FIXME(wuyi): wait other trainer to start listening
        time.sleep(30)

    startup_exe = fluid.Executor(place)
    startup_exe.run(startup_prog)
    strategy = fluid.ExecutionStrategy()
    strategy.num_threads = args.cpus
    strategy.allow_op_delay = False
    build_strategy = fluid.BuildStrategy()
    if args.reduce_strategy == "reduce":
        build_strategy.reduce_strategy = fluid.BuildStrategy(
        ).ReduceStrategy.Reduce
    else:
        build_strategy.reduce_strategy = fluid.BuildStrategy(
        ).ReduceStrategy.AllReduce
    build_strategy.fuse_broadcast_op = args.fuse_broadcast_op

    avg_loss = train_args[0]

    if args.update_method == "pserver":
        # In parameter-server distributed training, gradients are merged
        # on the local server, so do not initialize ParallelExecutor
        # in multi-server all-reduce mode.
        num_trainers = 1
        trainer_id = 0

    exe = fluid.ParallelExecutor(True,
                                 avg_loss.name,
                                 main_program=train_prog,
                                 exec_strategy=strategy,
                                 build_strategy=build_strategy,
                                 num_trainers=num_trainers,
                                 trainer_id=trainer_id)

    if not args.no_test:
        if args.update_method == "pserver":
            test_scope = None
        else:
            # NOTE: use an empty scope to avoid test exe using NCCLID
            test_scope = fluid.Scope()
        test_exe = fluid.ParallelExecutor(True,
                                          main_program=test_prog,
                                          share_vars_from=exe)

    for pass_id in range(args.pass_num):
        num_samples = 0
        iters = 0
        start_time = time.time()
        if not args.use_reader_op:
            reader_generator = train_args[3]()  #train_reader
        batch_id = 0
        data = None
        if args.use_reader_op:
            train_args[4].start()
        while True:
            if not args.use_reader_op:
                data = next(reader_generator, None)
                if data is None:
                    break
            if args.profile and batch_id == 5:
                profiler.start_profiler("All")
                profiler.reset_profiler()
            elif args.profile and batch_id == 10:
                print("profiling total time: ", time.time() - start_time)
                profiler.stop_profiler(
                    "total", "/tmp/profile_%d_pass%d" % (trainer_id, pass_id))
            if iters == args.iterations:
                reader_generator.close()
                break

            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
            fetch_list = [avg_loss.name]
            acc_name_list = [v.name for v in train_args[2]]
            fetch_list.extend(acc_name_list)

            if args.use_fake_data or args.use_reader_op:
                try:
                    fetch_ret = exe.run(fetch_list)
                except fluid.core.EOFException as eof:
                    break
                except fluid.core.EnforceNotMet as ex:
                    traceback.print_exc()
                    break
            else:
                fetch_ret = exe.run(fetch_list, feed=feeder.feed(data))
            if args.use_reader_op:
                num_samples += args.batch_size * args.gpus
            else:
                num_samples += len(data)

            iters += 1
            if batch_id % 1 == 0:
                fetched_data = [np.mean(np.array(d)) for d in fetch_ret]
                print("Pass %d, batch %d, loss %s, accuracies: %s" %
                      (pass_id, batch_id, fetched_data[0], fetched_data[1:]))
            batch_id += 1

        print_train_time(start_time, time.time(), num_samples)
        if args.use_reader_op:
            train_args[4].reset()  # reset reader handle
        else:
            del reader_generator

        if not args.no_test and test_args[2]:
            test_feeder = None
            if not args.use_reader_op:
                test_feed_var_list = [
                    var for var in test_prog.global_block().vars.values()
                    if var.is_data
                ]
                test_feeder = fluid.DataFeeder(test_feed_var_list, place)
            test_ret = test_parallel(test_exe, test_args, args, test_prog,
                                     test_feeder)
            print("Pass: %d, Test Accuracy: %s\n" %
                  (pass_id, [np.mean(np.array(v)) for v in test_ret]))

    print("total train time: ", time.time() - over_all_start)
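
Several of these examples repeat the same pattern: start the profiler at one batch and dump results a few batches later. A small helper (an assumption, not part of the original code) that factors out that pattern, using the same fluid.profiler calls the snippets already rely on:

from paddle.fluid import profiler

def profile_window(batch_id, start_batch=5, stop_batch=10,
                   profile_path="/tmp/profile"):
    # profile only the batches in [start_batch, stop_batch)
    if batch_id == start_batch:
        profiler.start_profiler("All")
    elif batch_id == stop_batch:
        profiler.stop_profiler("total", profile_path)

# inside the training loop, guarded by a flag such as args.profile:
#     if args.profile: profile_window(batch_id)
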
Ejemplo n.º 15
0
def main(args):
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = get_device_num()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    task_name = args.task_name.lower()
    processors = {
        'xnli': reader.XnliProcessor,
        'cola': reader.ColaProcessor,
        'mrpc': reader.MrpcProcessor,
        'mnli': reader.MnliProcessor,
    }

    processor = processors[task_name](data_dir=args.data_dir,
                                      vocab_path=args.vocab_path,
                                      max_seq_len=args.max_seq_len,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=args.in_tokens,
                                      random_seed=args.random_seed)
    num_labels = len(processor.get_labels())

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("At least one of `do_train`, `do_val` and `do_test` "
                         "must be True.")

    train_program = fluid.Program()
    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed
        train_program.random_seed = args.random_seed

    if args.do_train:
        # NOTE: If num_trainers > 1, the shuffle_seed must be set, because
        # the order of batch data generated by reader
        # must be the same in the respective processes.
        shuffle_seed = 1 if num_trainers > 1 else None
        train_data_generator = processor.data_generator(
            batch_size=args.batch_size,
            phase='train',
            epoch=args.epoch,
            dev_count=dev_count,
            shuffle=args.shuffle,
            shuffle_seed=shuffle_seed)

        num_train_examples = processor.get_num_examples(phase='train')

        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_data_loader, loss, probs, accuracy, num_seqs = create_model(
                    args, bert_config=bert_config, num_labels=num_labels)
                scheduled_lr, loss_scaling = optimization(
                    loss=loss,
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio)

    if args.do_val:
        dev_prog = fluid.Program()
        with fluid.program_guard(dev_prog, startup_prog):
            with fluid.unique_name.guard():
                dev_data_loader, loss, probs, accuracy, num_seqs = create_model(
                    args, bert_config=bert_config, num_labels=num_labels)

        dev_prog = dev_prog.clone(for_test=True)
        dev_data_loader.set_batch_generator(
            processor.data_generator(batch_size=args.batch_size,
                                     phase='dev',
                                     epoch=1,
                                     dev_count=1,
                                     shuffle=False), place)

    if args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_data_loader, loss, probs, accuracy, num_seqs = create_model(
                    args, bert_config=bert_config, num_labels=num_labels)

        test_prog = test_prog.clone(for_test=True)
        test_data_loader.set_batch_generator(
            processor.data_generator(batch_size=args.batch_size,
                                     phase='test',
                                     epoch=1,
                                     dev_count=1,
                                     shuffle=False), place)

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "are both set! Only 'init_checkpoint' will take effect.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
        build_strategy = fluid.BuildStrategy()

        if args.use_cuda and num_trainers > 1:
            assert shuffle_seed is not None
            dist_utils.prepare_for_multi_process(exe, build_strategy,
                                                 train_program)
            train_data_generator = fluid.contrib.reader.distributed_batch_reader(
                train_data_generator)

        train_compiled_program = fluid.CompiledProgram(
            train_program).with_data_parallel(loss_name=loss.name,
                                              build_strategy=build_strategy)

        train_data_loader.set_batch_generator(train_data_generator, place)

    if args.do_train:
        train_data_loader.start()
        steps = 0
        total_cost, total_acc, total_num_seqs = [], [], []
        time_begin = time.time()
        throughput = []
        ce_info = []

        total_batch_num = 0  # used for benchmark

        while True:
            try:
                steps += 1

                total_batch_num += 1  # used for benchmark
                if args.max_iter and total_batch_num == args.max_iter:  # used for benchmark
                    return

                if steps % args.skip_steps == 0:
                    if args.use_fp16:
                        fetch_list = [
                            loss.name, accuracy.name, scheduled_lr.name,
                            num_seqs.name, loss_scaling.name
                        ]
                    else:
                        fetch_list = [
                            loss.name, accuracy.name, scheduled_lr.name,
                            num_seqs.name
                        ]
                else:
                    fetch_list = []

                outputs = exe.run(train_compiled_program,
                                  fetch_list=fetch_list)

                if steps % args.skip_steps == 0:
                    if args.use_fp16:
                        np_loss, np_acc, np_lr, np_num_seqs, np_scaling = outputs
                    else:
                        np_loss, np_acc, np_lr, np_num_seqs = outputs

                    total_cost.extend(np_loss * np_num_seqs)
                    total_acc.extend(np_acc * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train data_loader queue size: %d, " % train_data_loader.queue.size()
                        verbose += "learning rate: %f" % np_lr[0]
                        if args.use_fp16:
                            verbose += ", loss scaling: %f" % np_scaling[0]
                        print(verbose)

                    current_example, current_epoch = processor.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin

                    # profiler tools
                    if args.is_profiler and current_epoch == 0 and steps == args.skip_steps:
                        profiler.start_profiler("All")
                    elif args.is_profiler and current_epoch == 0 and steps == args.skip_steps * 2:
                        profiler.stop_profiler("total", args.profiler_path)
                        return

                    log_record = "epoch: {}, progress: {}/{}, step: {}, ave loss: {}, ave acc: {}".format(
                        current_epoch, current_example, num_train_examples,
                        steps,
                        np.sum(total_cost) / np.sum(total_num_seqs),
                        np.sum(total_acc) / np.sum(total_num_seqs))
                    ce_info.append([
                        np.sum(total_cost) / np.sum(total_num_seqs),
                        np.sum(total_acc) / np.sum(total_num_seqs), used_time
                    ])
                    if steps > 0:
                        throughput.append(args.skip_steps / used_time)
                        log_record = log_record + ", speed: %f steps/s" % (
                            args.skip_steps / used_time)
                        print(log_record)
                    else:
                        print(log_record)
                    total_cost, total_acc, total_num_seqs = [], [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.save(program=train_program, model_path=save_path)

                if steps % args.validation_steps == 0:
                    print("Average throughput: %s" % (np.average(throughput)))
                    throughput = []
                    # evaluate dev set
                    if args.do_val:
                        evaluate(exe, dev_prog, dev_data_loader,
                                 [loss.name, accuracy.name, num_seqs.name],
                                 "dev")
                    # evaluate test set
                    if args.do_test:
                        evaluate(exe, test_prog, test_data_loader,
                                 [loss.name, accuracy.name, num_seqs.name],
                                 "test")
            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.save(program=train_program, model_path=save_path)
                train_data_loader.reset()
                break
        if args.enable_ce:
            card_num = get_cards()
            ce_cost = 0
            ce_acc = 0
            ce_time = 0
            try:
                ce_cost = ce_info[-2][0]
                ce_acc = ce_info[-2][1]
                ce_time = ce_info[-2][2]
            except Exception:
                print("ce info error")
            print("kpis\ttrain_duration_%s_card%s\t%s" %
                  (args.task_name, card_num, ce_time))
            print("kpis\ttrain_cost_%s_card%s\t%f" %
                  (args.task_name, card_num, ce_cost))
            print("kpis\ttrain_acc_%s_card%s\t%f" %
                  (args.task_name, card_num, ce_acc))

    # final eval on dev set
    if args.do_val:
        print("Final validation result:")
        evaluate(exe, dev_prog, dev_data_loader,
                 [loss.name, accuracy.name, num_seqs.name], "dev")

    # final eval on test set
    if args.do_test:
        print("Final test result:")
        evaluate(exe, test_prog, test_data_loader,
                 [loss.name, accuracy.name, num_seqs.name], "test")
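
get_cards() is referenced in the enable_ce branch above but not defined in this snippet; a minimal sketch (an assumption) that mirrors the CUDA_VISIBLE_DEVICES handling used in the other examples:

import os

def get_cards():
    # count the GPUs visible to this process; fall back to 1 if unset
    cards = os.environ.get('CUDA_VISIBLE_DEVICES', '')
    return len(cards.split(',')) if cards else 1
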
Ejemplo n.º 16
0
def train_with_dataloader(exe,
                          train_prog,
                          compiled_train_prog,
                          train_dataloader,
                          train_fetch_list,
                          train_metrics,
                          train_batch_size=None,
                          epochs=10,
                          log_interval=0,
                          valid_interval=0,
                          save_dir='./',
                          num_trainers=1,
                          trainer_id=0,
                          save_model_name='model',
                          fix_random_seed=False,
                          compiled_test_prog=None,
                          test_dataloader=None,
                          test_fetch_list=None,
                          test_metrics=None,
                          is_profiler=None,
                          profiler_path=None):
    if not train_dataloader:
        logger.error("[TRAIN] get dataloader failed.")

    train_loss = 0

    epoch_periods = []
    reader_cost_averager = TimeAverager()
    batch_cost_averager = TimeAverager()
    for epoch in range(epochs):
        log_lr_and_step()

        train_iter = 0
        epoch_periods = []

        batch_start = time.time()
        for data in train_dataloader():
            reader_cost_averager.record(time.time() - batch_start)

            train_outs = exe.run(compiled_train_prog,
                                 fetch_list=train_fetch_list,
                                 feed=data)

            batch_cost = time.time() - batch_start
            epoch_periods.append(batch_cost)
            batch_cost_averager.record(batch_cost, num_samples=train_batch_size)

            local_time = time.localtime(time.time())
            str_time = time.strftime("%Y-%m-%d %H:%M:%S", local_time)
            if log_interval > 0 and (train_iter % log_interval == 0):
                time_info_str = "batch_cost: {:.5f} sec, reader_cost: {:.5f} sec".format(
                    batch_cost_averager.get_average(),
                    reader_cost_averager.get_average())
                if train_batch_size:
                    time_info_str += ", ips: {:.5f} samples/sec".format(
                        batch_cost_averager.get_ips_average())
                train_metrics.calculate_and_log_out(
                    train_outs,
                    info='[TRAIN {}] Epoch {}, iter {}, {}'.format(
                        str_time, epoch, train_iter, time_info_str))
                reader_cost_averager.reset()
                batch_cost_averager.reset()

            train_iter += 1
            batch_start = time.time()

            # NOTE: profiler tools, used for benchmark
            if is_profiler and epoch == 0 and train_iter == log_interval:
                profiler.start_profiler("All")
            elif is_profiler and epoch == 0 and train_iter == log_interval + 5:
                profiler.stop_profiler("total", profiler_path)
                return

        if len(epoch_periods) < 1:
            logger.info(
                'No iteration was executed, please check the data reader')
            sys.exit(1)

        logger.info(
            '[TRAIN] Epoch {} training finished, average time: {:.5f} sec'.
            format(epoch, np.mean(epoch_periods[1:])))

        if trainer_id == 0:
            save_model(exe, train_prog, save_dir, save_model_name,
                       "_epoch{}".format(epoch))
        if compiled_test_prog and valid_interval > 0 and (
                epoch + 1) % valid_interval == 0:
            test_with_dataloader(exe, compiled_test_prog, test_dataloader,
                                 test_fetch_list, test_metrics, log_interval,
                                 save_model_name)

    if trainer_id == 0:
        save_model(exe, train_prog, save_dir, save_model_name)
    # when fixing the random seed for debugging
    if fix_random_seed:
        cards = os.environ.get('CUDA_VISIBLE_DEVICES')
        gpu_num = len(cards.split(","))
        print("kpis\ttrain_cost_card{}\t{}".format(gpu_num, train_loss))
        print("kpis\ttrain_speed_card{}\t{}".format(gpu_num,
                                                    np.mean(epoch_periods)))
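
TimeAverager is used above but not defined in this snippet; a minimal sketch (an assumption) consistent with the calls record(), get_average(), get_ips_average() and reset():

class TimeAverager(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self._total_time = 0.0
        self._total_samples = 0
        self._count = 0

    def record(self, elapsed, num_samples=None):
        # accumulate one measurement (and, optionally, the samples it covered)
        self._total_time += elapsed
        self._count += 1
        if num_samples:
            self._total_samples += num_samples

    def get_average(self):
        return self._total_time / self._count if self._count else 0.0

    def get_ips_average(self):
        # instances (samples) per second over the recorded window
        return self._total_samples / self._total_time if self._total_time else 0.0
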
Ejemplo n.º 17
0
def do_train(args):
    if args.use_cuda:
        if num_trainers > 1:  # for multi-process gpu training
            dev_count = 1
        else:
            dev_count = fluid.core.get_cuda_device_count()
        gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
        place = fluid.CUDAPlace(gpu_id)
    else:
        dev_count = int(os.environ.get('CPU_NUM', 1))
        place = fluid.CPUPlace()

    # define the data generator
    processor = reader.DataProcessor(fpattern=args.training_file,
                                     src_vocab_fpath=args.src_vocab_fpath,
                                     trg_vocab_fpath=args.trg_vocab_fpath,
                                     token_delimiter=args.token_delimiter,
                                     use_token_batch=args.use_token_batch,
                                     batch_size=args.batch_size,
                                     device_count=dev_count,
                                     pool_size=args.pool_size,
                                     sort_type=args.sort_type,
                                     shuffle=args.shuffle,
                                     shuffle_batch=args.shuffle_batch,
                                     start_mark=args.special_token[0],
                                     end_mark=args.special_token[1],
                                     unk_mark=args.special_token[2],
                                     max_length=args.max_length,
                                     n_head=args.n_head)
    batch_generator = processor.data_generator(phase="train")
    if num_trainers > 1:  # for multi-process gpu training
        batch_generator = fluid.contrib.reader.distributed_batch_reader(
            batch_generator)
    args.src_vocab_size, args.trg_vocab_size, args.bos_idx, args.eos_idx, \
        args.unk_idx = processor.get_vocab_summary()

    train_prog = fluid.default_main_program()
    startup_prog = fluid.default_startup_program()
    random_seed = eval(str(args.random_seed))
    if random_seed is not None:
        train_prog.random_seed = random_seed
        startup_prog.random_seed = random_seed

    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():

            # define input and reader

            input_field_names = desc.encoder_data_input_fields + \
                    desc.decoder_data_input_fields[:-1] + desc.label_data_input_fields
            input_descs = desc.get_input_descs(args.args)
            input_slots = [{
                "name": name,
                "shape": input_descs[name][0],
                "dtype": input_descs[name][1]
            } for name in input_field_names]

            input_field = InputField(input_slots)
            input_field.build(build_pyreader=True)

            # define the network

            sum_cost, avg_cost, token_num = create_net(is_training=True,
                                                       model_input=input_field,
                                                       args=args)

            # define the optimizer

            with fluid.default_main_program()._lr_schedule_guard():
                learning_rate = fluid.layers.learning_rate_scheduler.noam_decay(
                    args.d_model, args.warmup_steps) * args.learning_rate

            optimizer = fluid.optimizer.Adam(learning_rate=learning_rate,
                                             beta1=args.beta1,
                                             beta2=args.beta2,
                                             epsilon=float(args.eps))
            optimizer.minimize(avg_cost)

    # prepare training

    ## decorate the pyreader with batch_generator
    input_field.loader.set_batch_generator(batch_generator)

    ## define the executor and program for training

    exe = fluid.Executor(place)

    exe.run(startup_prog)
    # init position_encoding
    for pos_enc_param_name in desc.pos_enc_param_names:
        pos_enc_param = fluid.global_scope().find_var(
            pos_enc_param_name).get_tensor()

        pos_enc_param.set(
            position_encoding_init(args.max_length + 1, args.d_model), place)

    assert (args.init_from_checkpoint == "") or (args.init_from_pretrain_model
                                                 == "")

    ## init from some checkpoint, to resume the previous training
    if args.init_from_checkpoint:
        load(train_prog, os.path.join(args.init_from_checkpoint,
                                      "transformer"), exe)
        print("finished initializing model from checkpoint %s" %
              args.init_from_checkpoint)

    ## init from some pretrain models, to better solve the current task
    if args.init_from_pretrain_model:
        load(train_prog,
             os.path.join(args.init_from_pretrain_model, "transformer"), exe)
        print("finished initializing model from pretrained params at %s" %
              args.init_from_pretrain_model)

    build_strategy = fluid.compiler.BuildStrategy()
    build_strategy.enable_inplace = True
    exec_strategy = fluid.ExecutionStrategy()
    if num_trainers > 1:
        dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog)
        exec_strategy.num_threads = 1

    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=avg_cost.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    # the best cross-entropy value with label smoothing
    loss_normalizer = -(
        (1. - args.label_smooth_eps) * np.log((1. - args.label_smooth_eps)) +
        args.label_smooth_eps * np.log(args.label_smooth_eps /
                                       (args.trg_vocab_size - 1) + 1e-20))
    # start training

    step_idx = 0
    total_batch_num = 0  # this is for benchmark
    total_batch_token_num = 0  # this is for benchmark word count
    for pass_id in range(args.epoch):
        pass_start_time = time.time()
        input_field.loader.start()

        batch_id = 0
        while True:
            if args.max_iter and total_batch_num == args.max_iter:  # this is for benchmark
                return
            try:
                outs = exe.run(compiled_train_prog,
                               fetch_list=[sum_cost.name, token_num.name])

                total_batch_token_num += np.asarray(outs[1]).sum()
                if step_idx % args.print_step == 0:
                    sum_cost_val, token_num_val = np.asarray(
                        outs[0]), np.asarray(outs[1])
                    # sum the cost from multi-devices
                    total_sum_cost = sum_cost_val.sum()
                    total_token_num = token_num_val.sum()
                    total_avg_cost = total_sum_cost / total_token_num

                    if step_idx == 0:
                        logging.info(
                            "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
                            "normalized loss: %f, ppl: %f" %
                            (step_idx, pass_id, batch_id, total_avg_cost,
                             total_avg_cost - loss_normalizer,
                             np.exp([min(total_avg_cost, 100)])))
                        avg_batch_time = time.time()
                    else:
                        logging.info(
                            "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
                            "normalized loss: %f, ppl: %f, batch speed: %.2f steps/s, ips: %.2f words/sec"
                            % (step_idx, pass_id, batch_id, total_avg_cost,
                               total_avg_cost - loss_normalizer,
                               np.exp([min(total_avg_cost, 100)
                                       ]), args.print_step /
                               (time.time() - avg_batch_time),
                               total_batch_token_num /
                               (time.time() - avg_batch_time)))
                        avg_batch_time = time.time()

                    total_batch_token_num = 0

                if step_idx % args.save_step == 0 and step_idx != 0:
                    if args.save_model_path:
                        model_path = os.path.join(args.save_model_path,
                                                  "step_" + str(step_idx),
                                                  "transformer")
                        fluid.save(train_prog, model_path)

                batch_id += 1
                step_idx += 1
                total_batch_num = total_batch_num + 1  # this is for benchmark

                # profiler tools for benchmark
                if args.is_profiler and pass_id == 0 and batch_id == args.print_step:
                    profiler.start_profiler("All")
                elif args.is_profiler and pass_id == 0 and batch_id == args.print_step + 5:
                    profiler.stop_profiler("total", args.profiler_path)
                    return

            except fluid.core.EOFException:
                input_field.loader.reset()
                break

        time_consumed = time.time() - pass_start_time

    if args.save_model_path:
        model_path = os.path.join(args.save_model_path, "step_final",
                                  "transformer")
        fluid.save(train_prog, model_path)

    if args.enable_ce:  # For CE
        print("kpis\ttrain_cost_card%d\t%f" % (dev_count, total_avg_cost))
        print("kpis\ttrain_duration_card%d\t%f" % (dev_count, time_consumed))
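
The loss_normalizer above is the cross-entropy of the smoothed label distribution itself, i.e. the best value the training loss can reach under label smoothing. A small standalone check (the eps and vocabulary size below are illustrative, not taken from the original config):

import numpy as np

label_smooth_eps, trg_vocab_size = 0.1, 32000
loss_normalizer = -((1. - label_smooth_eps) * np.log(1. - label_smooth_eps) +
                    label_smooth_eps * np.log(label_smooth_eps /
                                              (trg_vocab_size - 1) + 1e-20))
print(loss_normalizer)  # ~1.36 for these values
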
Ejemplo n.º 18
0
def main():
    args = parse_args()
    model_type = args.model_type
    rnn_model = args.rnn_model

    logger = logging.getLogger("lm")
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    if args.log_path:
        file_handler = logging.FileHandler(args.log_path)
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    else:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    logger.info('Running with args : {}'.format(args))

    vocab_size = 10000
    if model_type == "test":
        num_layers = 1
        batch_size = 2
        hidden_size = 10
        num_steps = 3
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 1
        max_epoch = 1
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "small":
        num_layers = 2
        batch_size = 20
        hidden_size = 200
        num_steps = 20
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 4
        max_epoch = 13
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "medium":
        num_layers = 2
        batch_size = 20
        hidden_size = 650
        num_steps = 35
        init_scale = 0.05
        max_grad_norm = 5.0
        epoch_start_decay = 6
        max_epoch = 39
        dropout = 0.5
        lr_decay = 0.8
        base_learning_rate = 1.0
    elif model_type == "large":
        num_layers = 2
        batch_size = 20
        hidden_size = 1500
        num_steps = 35
        init_scale = 0.04
        max_grad_norm = 10.0
        epoch_start_decay = 14
        max_epoch = 55
        dropout = 0.65
        lr_decay = 1.0 / 1.15
        base_learning_rate = 1.0
    else:
        print("model type not supported")
        return

    if not args.save_model_dir:
        save_model_dir = model_type + "_models"
        if args.use_gpu:
            save_model_dir = "gpu_" + save_model_dir
        else:
            save_model_dir = "cpu_" + save_model_dir
        if args.inference_only:
            save_model_dir = "infer_" + save_model_dir
        else:
            save_model_dir = "train_" + save_model_dir
    else:
        save_model_dir = args.save_model_dir

    if args.batch_size > 0:
        batch_size = args.batch_size

    if args.max_epoch > 0:
        max_epoch = args.max_epoch

    if args.profile:
        print(
            "\nProfiler is enabled, so only 1 epoch will be run (max_epoch is set to 1).\n"
        )
        max_epoch = 1

    main_program = fluid.Program()
    startup_program = fluid.Program()
    if args.enable_ce:
        startup_program.random_seed = SEED

    with fluid.program_guard(main_program, startup_program):
        # Training process
        loss, last_hidden, last_cell, feed_order = lm_model.lm_model(
            hidden_size,
            vocab_size,
            batch_size,
            num_layers=num_layers,
            num_steps=num_steps,
            init_scale=init_scale,
            dropout=dropout,
            rnn_model=rnn_model)

        # clone from default main program and use it as the validation program
        inference_program = fluid.default_main_program().clone(for_test=True)

        #print(inference_program)

        fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(
            clip_norm=max_grad_norm))

        learning_rate = fluid.layers.create_global_var(name="learning_rate",
                                                       shape=[1],
                                                       value=1.0,
                                                       dtype='float32',
                                                       persistable=True)

        optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
        optimizer.minimize(loss)

    place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
    exe = Executor(place)
    exe.run(startup_program)

    device_count = fluid.core.get_cuda_device_count()

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = device_count
    exec_strategy.use_experimental_executor = False
    exec_strategy.num_iteration_per_drop_scope = 100

    build_strategy = fluid.BuildStrategy()
    build_strategy.enable_inplace = True
    build_strategy.memory_optimize = True

    build_strategy.remove_unnecessary_lock = True
    build_strategy.enable_sequential_execution = False
    build_strategy.cache_runtime_context = True
    build_strategy.cache_expected_kernel = True
    build_strategy.fuse_all_optimizer_ops = True

    if args.parallel:
        train_program = fluid.compiler.CompiledProgram(
            main_program).with_data_parallel(loss_name=loss.name,
                                             build_strategy=build_strategy,
                                             exec_strategy=exec_strategy)
    else:
        train_program = fluid.compiler.CompiledProgram(main_program)

    data_path = args.data_path
    print("begin to load data")
    raw_data = reader.ptb_raw_data(data_path)
    print("finished load data")
    train_data, valid_data, test_data, _ = raw_data

    def prepare_input(batch,
                      init_hidden,
                      init_cell,
                      epoch_id=0,
                      with_lr=True,
                      device_count=1):
        x, y = batch
        new_lr = base_learning_rate * (lr_decay**max(
            epoch_id + 1 - epoch_start_decay, 0.0))
        res = {}
        if device_count > 1 and args.parallel:
            lr = np.ones((device_count), dtype='float32') * new_lr
            x = x.reshape((-1, num_steps, 1))
            y = y.reshape((-1, 1))
        else:
            lr = np.ones((1), dtype='float32') * new_lr
            x = x.reshape((-1, num_steps, 1))
            y = y.reshape((-1, 1))

        res['x'] = x
        res['y'] = y
        res['init_hidden'] = init_hidden
        res['init_cell'] = init_cell
        if with_lr:
            res['learning_rate'] = lr

        return res

    def eval(data):
        if args.inference_only and args.init_params_path:
            dirname = args.init_params_path
            filename = None
            if not os.path.isdir(args.init_params_path):
                dirname = os.path.dirname(args.init_params_path)
                filename = os.path.basename(args.init_params_path)
            fluid.io.load_persistables(exe,
                                       dirname,
                                       main_program=main_program,
                                       filename=filename)
            print("Load parameters from: %s." % args.init_params_path)

        batch_times = []
        start_time = time.time()
        # when eval the batch_size set to 1
        eval_data_iter = reader.get_data_iter(data, batch_size, num_steps)
        total_loss = 0.0
        iters = 0
        init_hidden = np.zeros((num_layers, batch_size, hidden_size),
                               dtype='float32')
        init_cell = np.zeros((num_layers, batch_size, hidden_size),
                             dtype='float32')
        for batch_id, batch in enumerate(eval_data_iter):
            input_data_feed = prepare_input(batch,
                                            init_hidden,
                                            init_cell,
                                            epoch_id=0,
                                            with_lr=False)

            batch_start_time = time.time()
            # eval must not run the grad ops or update the parameters,
            # so use the plain Executor for evaluation
            fetch_outs = exe.run(
                program=inference_program,
                feed=input_data_feed,
                fetch_list=[loss.name, last_hidden.name, last_cell.name],
                use_program_cache=True)
            batch_times.append(time.time() - batch_start_time)

            cost_train = np.array(fetch_outs[0])
            init_hidden = np.array(fetch_outs[1])
            init_cell = np.array(fetch_outs[2])

            total_loss += cost_train
            iters += num_steps

        ppl = np.exp(total_loss / iters)

        eval_time_total = time.time() - start_time
        eval_time_run = np.sum(batch_times)

        # Benchmark
        if args.inference_only:
            print("\n======== Benchmark Result ========")
            print(
                "Eval batch_size: %d; Time (total): %.5f s; Time (only run): %.5f s; ppl: %.5f"
                % (batch_size, eval_time_total, eval_time_run, ppl[0]))
            print("")

            # Save the inference model for C++ inference purpose
            fluid.io.save_inference_model(save_model_dir,
                                          feed_order,
                                          [loss, last_hidden, last_cell],
                                          exe,
                                          main_program=inference_program,
                                          model_filename="model",
                                          params_filename="params")
            print("Save inference model to: %s." % save_model_dir)

        return ppl

    def train_an_epoch(epoch_id, batch_times):
        # get train epoch size
        num_batches = len(train_data) // batch_size
        epoch_size = (num_batches - 1) // num_steps
        if args.profile:
            log_interval = 1
        else:
            log_interval = max(1, epoch_size // 10)

        data_iter_size = batch_size
        if device_count > 1 and args.parallel:
            data_iter_size = batch_size * device_count
        train_data_iter = reader.get_data_iter(train_data, data_iter_size,
                                               num_steps)

        total_loss = 0
        iters = 0
        if device_count > 1 and args.parallel:
            init_hidden = np.zeros(
                (num_layers * device_count, batch_size, hidden_size),
                dtype='float32')
            init_cell = np.zeros(
                (num_layers * device_count, batch_size, hidden_size),
                dtype='float32')
        else:
            init_hidden = np.zeros((num_layers, batch_size, hidden_size),
                                   dtype='float32')
            init_cell = np.zeros((num_layers, batch_size, hidden_size),
                                 dtype='float32')
        for batch_id, batch in enumerate(train_data_iter):
            input_data_feed = prepare_input(batch,
                                            init_hidden,
                                            init_cell,
                                            epoch_id=epoch_id,
                                            device_count=device_count)

            batch_start_time = time.time()
            fetch_outs = exe.run(train_program,
                                 feed=input_data_feed,
                                 fetch_list=[
                                     loss.name, last_hidden.name,
                                     last_cell.name, "learning_rate"
                                 ],
                                 use_program_cache=True)
            batch_time = time.time() - batch_start_time
            batch_times.append(batch_time)

            cost_train = np.array(fetch_outs[0])
            init_hidden = np.array(fetch_outs[1])
            init_cell = np.array(fetch_outs[2])

            lr = np.array(fetch_outs[3])

            total_loss += cost_train
            iters += num_steps
            if batch_id > 0 and batch_id % log_interval == 0:
                ppl = np.exp(total_loss / iters)
                print(
                    "-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
                    % (epoch_id, batch_id, batch_time, ppl[0], lr[0]))

            if args.profile:
                if batch_id == 1:
                    profiler.reset_profiler()
                elif batch_id >= 11:
                    break

        ppl = np.exp(total_loss / iters)
        return ppl

    def train():
        total_time = 0.0
        for epoch_id in range(max_epoch):
            batch_times = []
            epoch_start_time = time.time()
            train_ppl = train_an_epoch(epoch_id, batch_times)
            epoch_time = time.time() - epoch_start_time
            total_time += epoch_time
            print(
                "\nTrain epoch:[%d]; epoch Time: %.5f; ppl: %.5f; speed: %.5f steps/s \n"
                % (epoch_id, epoch_time, train_ppl[0],
                   len(batch_times) / sum(batch_times)))

            # FIXME(zjl): ppl[0] increases as batch_size increases.
            # We should find a better way to calculate ppl by normalizing batch_size.
            if device_count == 1 and batch_size <= 20 and epoch_id == 0 and train_ppl[
                    0] > 1000:
                # for bad init, after first epoch, the loss is over 1000
                # no more need to continue
                print(
                    "The random parameter initialization was poor this time: the loss is still over 1000 after the first epoch."
                )
                print("Abort this training process and please start again.")
                return

            if epoch_id == max_epoch - 1 and args.enable_ce:
                # kpis
                print("ptblm\tlstm_language_model_duration\t%s" %
                      (total_time / max_epoch))
                print("ptblm\tlstm_language_model_loss\t%s" % train_ppl[0])

            if not args.profile:
                # NOTE(zjl): sometimes there is not enough data for eval if batch_size is large, e.g., 2100
                # Just skip it to avoid an error
                def is_valid_data(data, batch_size, num_steps):
                    data_len = len(data)
                    batch_len = data_len // batch_size
                    epoch_size = (batch_len - 1) // num_steps
                    return epoch_size >= 1

                valid_data_valid = is_valid_data(valid_data, batch_size,
                                                 num_steps)

                test_data_valid = is_valid_data(test_data, batch_size,
                                                num_steps)

                if valid_data_valid and test_data_valid:
                    valid_ppl = eval(valid_data)
                    print("Valid ppl: %.5f" % valid_ppl[0])

                    test_ppl = eval(test_data)
                    print("Test ppl: %.5f" % test_ppl[0])
                else:
                    if not valid_data_valid:
                        print(
                            'WARNING: length of valid_data is {}, which is not enough for batch_size {} and num_steps {}'
                            .format(len(valid_data), batch_size, num_steps))

                    if not test_data_valid:
                        print(
                            'WARNING: length of test_data is {}, which is not enough for batch_size {} and num_steps {}'
                            .format(len(test_data), batch_size, num_steps))

                filename = "params_%05d" % epoch_id
                fluid.io.save_persistables(executor=exe,
                                           dirname=save_model_dir,
                                           main_program=main_program,
                                           filename=filename)
                print("Saved model to: %s/%s.\n" % (save_model_dir, filename))

    if args.profile:
        if args.use_gpu:
            profiler.start_profiler("All")
            if not args.inference_only:
                profile_filename = "train_padding_rnn.gpu.profile"
                train()
            else:
                profile_filename = "infer_padding_rnn.gpu.profile"
                eval(test_data)
            profiler.stop_profiler("total", profile_filename)
        else:
            profiler.start_profiler("CPU")
            if not args.inference_only:
                profile_filename = "train_padding_rnn.cpu.profile"
                train()
            else:
                profile_filename = "infer_padding_rnn.cpu.profile"
                eval(test_data)
            profiler.stop_profiler("total", profile_filename)
    else:
        if not args.inference_only:
            train()
        else:
            eval(test_data)
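
prepare_input() above feeds the learning rate in as data, decaying it by lr_decay once epoch_id reaches epoch_start_decay. A standalone sketch of that schedule (the constants below are the "medium" configuration from this example):

base_learning_rate, lr_decay, epoch_start_decay = 1.0, 0.8, 6

def lr_for_epoch(epoch_id):
    # same formula used inside prepare_input()
    return base_learning_rate * lr_decay ** max(epoch_id + 1 - epoch_start_decay, 0.0)

print([round(lr_for_epoch(e), 4) for e in range(8)])
# -> [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.64]
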
Ejemplo n.º 19
0
def train(args):
    # get the GPU place for the current process
    place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id)
    print(place)

    with fluid.dygraph.guard(place):
        # prepare the multi-card (data-parallel) context
        strategy = fluid.dygraph.parallel.prepare_context()
        print('strategy', strategy)

        # parse config
        config = parse_config(args.config)
        train_config = merge_configs(config, 'train', vars(args))
        valid_config = merge_configs(config, 'valid', vars(args))
        print_configs(train_config, 'Train')
        print(train_config)

        # if args.fix_random_seed:
        #     startup.random_seed = 1000
        #     train_prog.random_seed = 1000

        train_model = Tpn_Model(
            None, cfg=train_config, mode='train'
        )  # models.get_model(args.model_name, train_config, mode='train')

        valid_model = Tpn_Model(
            None
        )  # models.get_model(args.model_name, valid_config, mode='valid')
        train_model.build_input()
        train_dataloader = train_model.dataloader()
        opt = train_model.optimizer()

        # load weights
        weight, _ = fluid.load_dygraph('./ckpt/k400_tpn_r50f32s2')
        model_weights = train_model.state_dict()
        model_weights.update(
            {k: v
             for k, v in weight.items() if k in model_weights})
        train_model.load_dict(model_weights)
        print('load model success')

        # wrap the model for data-parallel multi-GPU training
        train_model = fluid.dygraph.parallel.DataParallel(
            train_model, strategy)

        log_interval = args.log_interval
        is_profiler = args.is_profiler
        profiler_path = args.profiler_path
        trainer_id = 0
        fix_random_seed = args.fix_random_seed
        save_dir = args.save_dir
        save_model_name = args.model_name

        # if args.resume:
        #     # if resume weights is given, load resume weights directly
        #     assert os.path.exists(args.resume + '.pdparams'), \
        #         "Given resume weight dir {}.pdparams not exist.".format(args.resume)
        #     fluid.load(train_prog, model_path=args.resume, executor=exe)
        # else:
        #     # if not in resume mode, load pretrain weights
        #     if args.pretrain:
        #         assert os.path.exists(args.pretrain), \
        #             "Given pretrain weight dir {} not exist.".format(args.pretrain)
        #     pretrain = args.pretrain or train_model.get_pretrain_weights()
        #     if pretrain:
        #         train_model.load_pretrain_params(exe, pretrain, train_prog, place)

        # get reader
        bs_denominator = 1
        if args.use_gpu:
            # check number of GPUs
            gpus = os.getenv("CUDA_VISIBLE_DEVICES", "")
            if gpus == "":
                pass
            else:
                gpus = gpus.split(",")
                num_gpus = len(gpus)
                assert num_gpus == train_config.TRAIN.num_gpus, \
                    "num_gpus({}) set by CUDA_VISIBLE_DEVICES " \
                    "shoud be the same as that " \
                    "set in {}({})".format(
                        num_gpus, args.config, train_config.TRAIN.num_gpus)
            bs_denominator = train_config.TRAIN.num_gpus

        train_config.TRAIN.batch_size = int(train_config.TRAIN.batch_size /
                                            bs_denominator)
        valid_config.VALID.batch_size = int(valid_config.VALID.batch_size /
                                            bs_denominator)
        train_reader = get_reader(args.model_name.upper(), 'train',
                                  train_config)
        valid_reader = get_reader(args.model_name.upper(), 'valid',
                                  valid_config)

        # get metrics
        train_metrics = get_metrics(args.model_name.upper(), 'train',
                                    train_config)
        valid_metrics = get_metrics(args.model_name.upper(), 'valid',
                                    valid_config)

        epochs = args.epoch  #or train_model.epoch_num()

        print()

        train_dataloader.set_sample_list_generator(train_reader, places=place)
        # valid_dataloader.set_sample_list_generator(valid_reader, places=exe_places)

        ## multi-GPU data loading: each process must read a different shard of the data
        train_dataloader = fluid.contrib.reader.distributed_batch_reader(
            train_dataloader)

        train_model.train()

        for epoch in range(epochs):
            log_lr_and_step()
            train_iter = 0
            epoch_periods = []
            cur_time = time.time()
            for data in train_dataloader():
                train_outs = train_model(data)
                losses, _, _ = train_outs
                log_vars = OrderedDict()
                for loss_name, loss_value in losses.items():
                    # print(loss_name, ':', loss_value.numpy())
                    log_vars[loss_name] = fluid.layers.reduce_mean(loss_value)
                    # print(loss_name, ':', log_vars[loss_name].numpy())

                loss = sum(_value for _key, _value in log_vars.items()
                           if 'loss' in _key)
                # print('total loss', loss.numpy())

                train_outs = [
                    loss.numpy(), train_outs[1].numpy(), train_outs[2].numpy()
                ]

                # print(train_outs[0])
                # print(train_outs[1].shape)
                # print(train_outs[2].shape)

                # # classification results (top-5 predictions)
                # prob = softmax(train_outs[1].squeeze())
                #
                # idx = np.argsort(-prob)
                # #print('idx', idx)
                # for i in range(0, 5):
                #     print('{:.3f} -> {}'.format(prob[idx[i]], [idx[i]]),train_outs[2])

                avg_loss = loss
                # multi-GPU training: scale the loss and aggregate parameter gradients across devices
                #avg_loss = train_model.scale_loss(avg_loss)

                avg_loss.backward()
                # multi-GPU gradient aggregation
                #train_model.apply_collective_grads()

                opt.minimize(avg_loss)
                train_model.clear_gradients()
                period = time.time() - cur_time
                epoch_periods.append(period)
                timeStamp = time.time()
                localTime = time.localtime(timeStamp)
                strTime = time.strftime("%Y-%m-%d %H:%M:%S", localTime)

                if log_interval > 0 and (train_iter % log_interval == 0):
                    train_metrics.calculate_and_log_out(train_outs, \
                                                        info='[TRAIN {}] Epoch {}, iter {}, time {}, '.format(strTime,
                                                                                                              epoch,
                                                                                                              train_iter,
                                                                                                              period))

                    # print('[TRAIN {}] Epoch {}, iter {}, time {}, total_loss {}, loss_cls {},loss_aux {}'.
                    #       format(strTime, epoch, train_iter, period, loss.numpy(),
                    #              log_vars['loss_cls'].numpy(), log_vars['loss_aux'].numpy()
                    #              ))
                train_iter += 1
                cur_time = time.time()

                # NOTE: profiler tools, used for benchmark
                if is_profiler and epoch == 0 and train_iter == log_interval:
                    profiler.start_profiler("All")
                elif is_profiler and epoch == 0 and train_iter == log_interval + 5:
                    profiler.stop_profiler("total", profiler_path)
                    return
            if len(epoch_periods) < 1:
                logger.info(
                    'No iteration was executed, please check the data reader')
                sys.exit(1)

            logger.info(
                '[TRAIN] Epoch {} training finished, average time: {}'.format(
                    epoch, np.mean(epoch_periods[1:])))

            # if trainer_id == 0:
            #     save_model(exe, train_prog, save_dir, save_model_name,
            #                "_epoch{}".format(epoch))
            # if compiled_test_prog and valid_interval > 0 and (
            #         epoch + 1) % valid_interval == 0:
            #     test_with_dataloader(exe, compiled_test_prog, test_dataloader,
            #                          test_fetch_list, test_metrics, log_interval,
            #                          save_model_name)

        if trainer_id == 0:
            # save_model(exe, train_prog, save_dir, save_model_name)
            fluid.save_dygraph(train_model.state_dict(),
                               "{}/{}".format(save_dir, save_model_name))
            fluid.save_dygraph(opt.state_dict(),
                               "{}/{}}".format(save_dir, save_model_name))
        # when the random seed is fixed for debugging
        if fix_random_seed:
            cards = os.environ.get('CUDA_VISIBLE_DEVICES')
            gpu_num = len(cards.split(","))
            print("kpis\ttrain_cost_card{}\t{}".format(gpu_num, loss))
            print("kpis\ttrain_speed_card{}\t{}".format(
                gpu_num, np.mean(epoch_periods)))
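
Example 19 wraps the model in DataParallel but leaves the loss scaling and collective gradient aggregation commented out, so each card updates its parameters independently. The snippet below is a minimal sketch of the full dygraph multi-GPU update step; it assumes model is already wrapped in fluid.dygraph.parallel.DataParallel and opt is its optimizer.

def parallel_update_step(model, opt, loss):
    # Rescale the loss by the trainer count so gradients average correctly.
    avg_loss = model.scale_loss(loss)
    avg_loss.backward()
    # All-reduce the gradients across devices before the optimizer step.
    model.apply_collective_grads()
    opt.minimize(avg_loss)
    model.clear_gradients()
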
Ejemplo n.º 20
0
def main():
    env = os.environ
    FLAGS.dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env
    if FLAGS.dist:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        local_seed = (99 + trainer_id)
        random.seed(local_seed)
        np.random.seed(local_seed)

    if FLAGS.enable_ce:
        random.seed(0)
        np.random.seed(0)

    cfg = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    check_config(cfg)
    # check that use_gpu=True is not set when running the CPU-only paddlepaddle build
    check_gpu(cfg.use_gpu)
    # check that the installed paddlepaddle version meets the requirement
    check_version()

    save_only = getattr(cfg, 'save_prediction_only', False)
    if save_only:
        raise NotImplementedError('The config file only supports prediction; '
                                  'the training stage is not implemented yet')
    main_arch = cfg.architecture

    if cfg.use_gpu:
        devices_num = fluid.core.get_cuda_device_count()
    else:
        devices_num = int(os.environ.get('CPU_NUM', 1))

    if 'FLAGS_selected_gpus' in env:
        device_id = int(env['FLAGS_selected_gpus'])
    else:
        device_id = 0
    place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    lr_builder = create('LearningRate')
    optim_builder = create('OptimizerBuilder')

    # build program
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    if FLAGS.enable_ce:
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = create(main_arch)
            if FLAGS.fp16:
                assert (getattr(model.backbone, 'norm_type', None)
                        != 'affine_channel'), \
                    '--fp16 currently does not support affine channel, ' \
                    ' please modify backbone settings to use batch norm'

            with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx:
                inputs_def = cfg['TrainReader']['inputs_def']
                feed_vars, train_loader = model.build_inputs(**inputs_def)
                train_fetches = model.train(feed_vars)
                loss = train_fetches['loss']
                if FLAGS.fp16:
                    loss *= ctx.get_loss_scale_var()
                lr = lr_builder()
                optimizer = optim_builder(lr)
                optimizer.minimize(loss)

                if FLAGS.fp16:
                    loss /= ctx.get_loss_scale_var()

            if 'use_ema' in cfg and cfg['use_ema']:
                global_steps = _decay_step_counter()
                ema = ExponentialMovingAverage(
                    cfg['ema_decay'], thres_steps=global_steps)
                ema.update()

    # parse train fetches
    train_keys, train_values, _ = parse_fetches(train_fetches)
    train_values.append(lr)

    if FLAGS.eval:
        eval_prog = fluid.Program()
        with fluid.program_guard(eval_prog, startup_prog):
            with fluid.unique_name.guard():
                model = create(main_arch)
                inputs_def = cfg['EvalReader']['inputs_def']
                feed_vars, eval_loader = model.build_inputs(**inputs_def)
                fetches = model.eval(feed_vars)
        eval_prog = eval_prog.clone(True)

        eval_reader = create_reader(cfg.EvalReader, devices_num=1)
        eval_loader.set_sample_list_generator(eval_reader, place)

        # parse eval fetches
        extra_keys = []
        if cfg.metric == 'COCO':
            extra_keys = ['im_info', 'im_id', 'im_shape']
        if cfg.metric == 'VOC':
            extra_keys = ['gt_bbox', 'gt_class', 'is_difficult']
        if cfg.metric == 'WIDERFACE':
            extra_keys = ['im_id', 'im_shape', 'gt_bbox']
        eval_keys, eval_values, eval_cls = parse_fetches(fetches, eval_prog,
                                                         extra_keys)

    # compile program for multi-devices
    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_optimizer_ops = False
    # only enable sync_bn in multi GPU devices
    sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn'
    build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \
        and cfg.use_gpu

    exec_strategy = fluid.ExecutionStrategy()
    # Iteration count at which CompiledProgram drops local execution scopes.
    # Set it to 1 to save memory, so that unused variables in local execution
    # scopes can be deleted after each iteration.
    exec_strategy.num_iteration_per_drop_scope = 1
    if FLAGS.dist:
        dist_utils.prepare_for_multi_process(exe, build_strategy, startup_prog,
                                             train_prog)
        exec_strategy.num_threads = 1

    exe.run(startup_prog)
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    if FLAGS.eval:
        compiled_eval_prog = fluid.CompiledProgram(eval_prog)

    fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel'

    ignore_params = cfg.finetune_exclude_pretrained_params \
                 if 'finetune_exclude_pretrained_params' in cfg else []

    start_iter = 0
    if FLAGS.resume_checkpoint:
        checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint)
        start_iter = checkpoint.global_step()
    elif cfg.pretrain_weights and fuse_bn and not ignore_params:
        checkpoint.load_and_fusebn(exe, train_prog, cfg.pretrain_weights)
    elif cfg.pretrain_weights:
        checkpoint.load_params(
            exe, train_prog, cfg.pretrain_weights, ignore_params=ignore_params)

    train_reader = create_reader(
        cfg.TrainReader, (cfg.max_iters - start_iter) * devices_num,
        cfg,
        devices_num=devices_num)
    train_loader.set_sample_list_generator(train_reader, place)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False
    if hasattr(model, 'is_bbox_normalized') and \
            callable(model.is_bbox_normalized):
        is_bbox_normalized = model.is_bbox_normalized()

    # if map_type is not set, use the default '11point'; only used in VOC evaluation
    map_type = cfg.map_type if 'map_type' in cfg else '11point'

    train_stats = TrainingStats(cfg.log_smooth_window, train_keys)
    train_loader.start()
    start_time = time.time()
    end_time = time.time()

    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)
    time_stat = deque(maxlen=cfg.log_smooth_window)
    best_box_ap_list = [0.0, 0]  # [mAP, iter]

    # use VisualDL to log data
    if FLAGS.use_vdl:
        from visualdl import LogWriter
        vdl_writer = LogWriter(FLAGS.vdl_log_dir)
        vdl_loss_step = 0
        vdl_mAP_step = 0

    for it in range(start_iter, cfg.max_iters):
        start_time = end_time
        end_time = time.time()
        time_stat.append(end_time - start_time)
        time_cost = np.mean(time_stat)
        eta_sec = (cfg.max_iters - it) * time_cost
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        outs = exe.run(compiled_train_prog, fetch_list=train_values)
        stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])}

        # use vdl-paddle to log loss
        if FLAGS.use_vdl:
            if it % cfg.log_iter == 0:
                for loss_name, loss_value in stats.items():
                    vdl_writer.add_scalar(loss_name, loss_value, vdl_loss_step)
                vdl_loss_step += 1

        train_stats.update(stats)
        logs = train_stats.log()
        if it % cfg.log_iter == 0 and (not FLAGS.dist or trainer_id == 0):
            strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format(
                it, np.mean(outs[-1]), logs, time_cost, eta)
            logger.info(strs)

        # NOTE: profiler tool, used for benchmarking
        if FLAGS.is_profiler and it == 5:
            profiler.start_profiler("All")
        elif FLAGS.is_profiler and it == 10:
            profiler.stop_profiler("total", FLAGS.profiler_path)
            return


        if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1) \
           and (not FLAGS.dist or trainer_id == 0):
            save_name = str(it) if it != cfg.max_iters - 1 else "model_final"
            if 'use_ema' in cfg and cfg['use_ema']:
                exe.run(ema.apply_program)
            checkpoint.save(exe, train_prog, os.path.join(save_dir, save_name))

            if FLAGS.eval:
                # evaluation
                resolution = None
                if 'Mask' in cfg.architecture:
                    resolution = model.mask_head.resolution
                results = eval_run(
                    exe,
                    compiled_eval_prog,
                    eval_loader,
                    eval_keys,
                    eval_values,
                    eval_cls,
                    cfg,
                    resolution=resolution)
                box_ap_stats = eval_results(
                    results, cfg.metric, cfg.num_classes, resolution,
                    is_bbox_normalized, FLAGS.output_eval, map_type,
                    cfg['EvalReader']['dataset'])

                # use vdl_paddle to log mAP
                if FLAGS.use_vdl:
                    vdl_writer.add_scalar("mAP", box_ap_stats[0], vdl_mAP_step)
                    vdl_mAP_step += 1

                if box_ap_stats[0] > best_box_ap_list[0]:
                    best_box_ap_list[0] = box_ap_stats[0]
                    best_box_ap_list[1] = it
                    checkpoint.save(exe, train_prog,
                                    os.path.join(save_dir, "best_model"))
                logger.info("Best test box ap: {}, in iter: {}".format(
                    best_box_ap_list[0], best_box_ap_list[1]))

            if 'use_ema' in cfg and cfg['use_ema']:
                exe.run(ema.restore_program)

    train_loader.reset()
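
Example 20 only shows the EMA weights being swapped in around checkpointing. The condensed sketch below pieces the lifecycle together under the same fluid static-graph API; the decay value, save_path, and the helper name are illustrative assumptions, and checkpoint refers to the same utility module used above.

def build_and_checkpoint_with_ema(exe, train_prog, startup_prog, save_path, decay=0.9998):
    # Append EMA tracking ops to the train program (after optimizer.minimize).
    with fluid.program_guard(train_prog, startup_prog):
        global_steps = _decay_step_counter()
        ema = ExponentialMovingAverage(decay, thres_steps=global_steps)
        ema.update()  # runs once per training iteration

    # ... training iterations with exe.run(train_prog) happen here ...

    exe.run(ema.apply_program)    # temporarily overwrite params with their EMA values
    checkpoint.save(exe, train_prog, save_path)
    exe.run(ema.restore_program)  # restore the raw training parameters
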
Ejemplo n.º 21
0
def train(num_pass=300, use_cuda=False, mem_opt=False):
    dict_size = 100000
    hash_size = 100000
    print_iter = 100
    eval_iter = 6000
    batch_size = 1280
    cpu_num = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    debug = False

    fluid.default_startup_program().random_seed = 1
    fluid.default_main_program().random_seed = 1
    np.random.seed(1)

    # construct network
    loss, pos_sim, train_program, test_program = net(hash_size=hash_size,
                                                     dict_size=dict_size)

    #  optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
    #  optimizer = fluid.optimizer.SGD(learning_rate=1e-4)
    #  optimizer.minimize(loss)

    # memory optimize
    if mem_opt:
        fluid.memory_optimize(fluid.default_main_program())

    for var in train_program.blocks[0].vars:
        #  if "GRAD" not in var and not train_program.blocks[0].var(var).is_data:
        if not train_program.blocks[0].var(var).is_data:
            train_program.blocks[0].var(var).persistable = True
            print(var, train_program.blocks[0].var(var).persistable,
                  train_program.blocks[0].var(var).shape)

    # initialize
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    print('startup_program', fluid.default_startup_program())
    print('train_program', train_program)
    #  print('test_program', test_program)

    if debug:
        var_name_list = (
            "cos_sim_1.tmp_0@GRAD", "fc_2.tmp_1@GRAD", "fc_2.tmp_0@GRAD",
            "softsign_2.tmp_0@GRAD", "reduce_sum_2.tmp_0@GRAD",
            "stack_2.tmp_0@GRAD", "sequence_pool_23.tmp_0@GRAD",
            "sequence_pool_23.tmp_0@GRAD", "embedding_23.tmp_0@GRAD",
            "PyramidHash_emb_0@GRAD@RENAME@0",
            "PyramidHash_emb_0@GRAD@RENAME@1",
            "PyramidHash_emb_0@GRAD@RENAME@2",
            "PyramidHash_emb_0@GRAD@RENAME@3",
            "PairwiseMarginLoss_0.tmp_0@GRAD", "cos_sim_1.tmp_0",
            "cos_sim_1.tmp_0@GRAD", "fc_2.tmp_1@GRAD", "fc_2.tmp_0@GRAD",
            "softsign_2.tmp_0@GRAD", "reduce_sum_2.tmp_0@GRAD",
            "stack_2.tmp_0@GRAD", "sequence_pool_23.tmp_0@GRAD",
            "embedding_23.tmp_0@GRAD", "PyramidHash_emb_0@GRAD", "FC_1@GRAD",
            "EmbeddingWithVSum_emb_0@GRAD", "fc_0.w_0@GRAD",
            "PairwiseMarginLoss_0.tmp_0", "PairwiseMarginLoss_0.tmp_1")
        #  var_name_list = ("sequence_pool_23.tmp_0@GRAD", "embedding_23.tmp_0@GRAD", "PyramidHash_emb_0@GRAD@RENAME@0", "PyramidHash_emb_0@GRAD", "FC_1@GRAD", "EmbeddingWithVSum_emb_0@GRAD", "fc_0.w_0@GRAD", "PairwiseMarginLoss_0.tmp_0", "PairwiseMarginLoss_0.tmp_1")
        for name in var_name_list:
            train_program.blocks[0].var(name).persistable = True
            print('find var', name,
                  train_program.blocks[0].var(name).persistable)

    # ParallelExecutor setup
    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.use_cuda = use_cuda
    exec_strategy.allow_op_delay = True
    exec_strategy.num_threads = 1
    #  exec_strategy.num_threads = int(os.environ.get('THREAD_NUM', 1)) * cpu_num - 1
    #  exec_strategy.num_threads = 25
    exec_strategy.use_experimental_executor = True
    build_strategy = fluid.BuildStrategy()
    build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
    #  build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
    #  build_strategy.optimize_strategy = fluid.BuildStrategy.OptimizeStrategy.NoLock
    #  pass_builder = build_strategy._create_passes_from_strategy()
    #  pass_builder.insert_pass(0, "lock_free_optimize_pass")
    train_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
                                       loss_name=None,
                                       main_program=train_program,
                                       build_strategy=build_strategy,
                                       exec_strategy=exec_strategy)

    test_exe = fluid.ParallelExecutor(
        use_cuda=use_cuda,
        main_program=test_program,
        share_vars_from=train_exe,
    )

    # DataFeeder
    feed_var_names = [
        'query_basic', 'query_phrase', 'pos_title_basic', 'pos_title_phrase',
        'neg_title_basic', 'neg_title_phrase', 'label'
    ]
    feed_list = [
        train_program.global_block().var(var_name)
        for var_name in feed_var_names
    ]
    feeder = fluid.DataFeeder(feed_list, place)
    #  batch_train_reader = feeder.decorate_reader(
    #  paddle.batch(reader.train_reader, batch_size=batch_size // cpu_num),
    #  multi_devices=true)
    batch_train_reader = feeder.decorate_reader(paddle.batch(
        reader.train_reader, batch_size=1280),
                                                multi_devices=True)

    test_feed_var_names = [
        'query_basic', 'query_phrase', 'pos_title_basic', 'pos_title_phrase',
        'neg_title_basic', 'neg_title_phrase'
    ]
    test_feed_list = [
        train_program.global_block().var(var_name)
        for var_name in test_feed_var_names
    ]
    test_feeder = fluid.DataFeeder(test_feed_list, place)

    # train
    for epoch in six.moves.xrange(num_pass):
        count = 0
        total_loss = .0
        total_time = .0

        read_data_start = time.time()
        for train_data in batch_train_reader():
            read_data_end = time.time()
            #  print('read data: ', read_data_end - read_data_start)

            if count == 1 and epoch >= 1:
                #  if count % eval_iter == 0:
                print('start eval')
                t2 = time.time()
                #  with open('./eval_log/train_mini_data_' + str(epoch) + '_' + str(count) + '_' + str(time.time()), 'w') as f:
                with open(
                        './eval_res/z_' + paddle.version.commit +
                        'sgd_nolock_result_' + str(epoch) + '_' +
                        str(time.time()), 'w') as f:
                    test_batch_reader = paddle.batch(
                        reader.test_reader,
                        #  batch_size=cpu_num * 128)
                        batch_size=1280)
                    for test_data in test_batch_reader():
                        qids = []
                        labels = []
                        data_list = []
                        for one_data in test_data:
                            qids.append(one_data[0])
                            labels.append(int(one_data[-1][0]))
                            data_list.append((one_data[1:-1]))
                        predicts = test_exe.run(
                            feed=test_feeder.feed(data_list),
                            fetch_list=[pos_sim.name])
                        scores = np.array(predicts[0])

                        for qid, label, score in six.moves.zip(
                                qids, labels, scores):
                            f.write(
                                str(qid) + '\t' + str(score[0]) + '\t' +
                                str(label) + '\n')

                print('end eval', time.time() - t2)

                start = time.time()

            if epoch == 0 and count == 5:
                profiler.start_profiler("CPU")
            elif epoch == 0 and count == 10:
                profiler.stop_profiler("total",
                                       "/paddle/Pyramid_DNN/fluid/profile")

            t1 = time.time()

            cost = train_exe.run(feed=train_data, fetch_list=[])

            total_time += time.time() - t1
            #  total_loss += np.array(cost[0]).mean()
            count += 1

            if debug:
                for name in var_name_list:
                    var = np.array(
                        fluid.executor._fetch_var(name, return_numpy=False))
                    if name == "PyramidHash_emb_0@GRAD@RENAME@0":
                        print('fetch var', name, var)
                        print('check not zero', name, np.count_nonzero(var))

                    print('fetch var', name, var)
                    print('check nan var', name, np.isnan(var).any())
                    print('check inf var', name, np.isinf(var).any())

            if count % print_iter == 0:
                print('epoch: %d, batch_id: %d, avg_cost: %s, avg_time: %f' %
                      (epoch, count, total_loss / print_iter,
                       float(total_time) / print_iter))
                import sys
                sys.stdout.flush()
                total_time = .0
                total_loss = .0

            read_data_start = time.time()