Example #1
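Both examples below are `train()` entry points written against PaddlePaddle's legacy `fluid` API and are excerpted from a larger script, so they assume module-level setup that is not shown. A minimal sketch of the assumed imports and logger (the `reader` and `network` modules, `parse_args`, and Example #2's `get_cards` are project-local helpers, inferred from usage):

import logging
import os
import time

import paddle.fluid as fluid

import network  # project-local (assumed): builds the model graph
import reader   # project-local (assumed): config parsing and batch readers

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)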
def train():
    args = parse_args()

    config_path = args.config_path
    train_path = args.train_dir
    epoch_num = args.epoch_num
    use_cuda = bool(args.use_cuda)
    use_parallel = bool(args.parallel)

    logger.info("reading data begins")
    user_count, item_count, cat_count = reader.config_read(config_path)
    #data_reader, max_len = reader.prepare_reader(train_path, args.batch_size)
    logger.info("reading data completes")

    avg_cost, pred = network.network(item_count, cat_count, 433)  # 433: presumably the max sequence length (cf. max_len from reader.prepare_reader)
    #fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0))
    base_lr = args.base_lr
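    # piecewise schedule: lr stays at base_lr until step 410000, then drops to 0.2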
    boundaries = [410000]
    values = [base_lr, 0.2]
    sgd_optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.piecewise_decay(boundaries=boundaries,
                                                   values=values))
    sgd_optimizer.minimize(avg_cost)

    def train_loop(main_program):
        data_reader, max_len = reader.prepare_reader(train_path,
                                                     args.batch_size)
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
        feeder = fluid.DataFeeder(feed_list=[
            "hist_item_seq", "hist_cat_seq", "target_item", "target_cat",
            "label", "mask", "target_item_seq", "target_cat_seq"
        ],
                                  place=place)
        if use_parallel:
            train_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
                                               loss_name=avg_cost.name,
                                               main_program=main_program)
        else:
            train_exe = exe
        logger.info("train begins")
        global_step = 0
        PRINT_STEP = 1000

        start_time = time.time()
        loss_sum = 0.0
        for epoch_id in range(epoch_num):
            epoch = epoch_id + 1
            for data in data_reader():
                global_step += 1
                if use_parallel:
                    # ParallelExecutor.run() takes fetch_list first and no
                    # program argument (the program was bound at construction)
                    results = train_exe.run(feed=feeder.feed(data),
                                            fetch_list=[avg_cost.name, pred.name],
                                            return_numpy=True)
                else:
                    results = train_exe.run(main_program,
                                            feed=feeder.feed(data),
                                            fetch_list=[avg_cost.name, pred.name],
                                            return_numpy=True)
                loss_sum += results[0].mean()

                if global_step % PRINT_STEP == 0:
                    logger.info(
                        "epoch: %d\tglobal_step: %d\ttrain_loss: %.4f\t\ttime: %.2f"
                        % (epoch, global_step, loss_sum / PRINT_STEP,
                           time.time() - start_time))
                    start_time = time.time()
                    loss_sum = 0.0

                    if (global_step > 400000 and global_step % PRINT_STEP
                            == 0) or (global_step < 400000
                                      and global_step % 50000 == 0):
                        save_dir = args.model_dir + "/global_step_" + str(
                            global_step)
                        feed_var_name = [
                            "hist_item_seq", "hist_cat_seq", "target_item",
                            "target_cat", "label", "mask", "target_item_seq",
                            "target_cat_seq"
                        ]
                        fetch_vars = [avg_cost, pred]
                        fluid.io.save_inference_model(save_dir, feed_var_name,
                                                      fetch_vars, exe)
        train_exe.close()

    t = fluid.DistributeTranspiler()
    t.transpile(args.trainer_id,
                pservers=args.endpoints,
                trainers=args.trainers)
    if args.role == "pserver":
        logger.info("run psever")
        prog, startup = t.get_pserver_programs(args.current_endpoint)
        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(startup)
        exe.run(prog)
    elif args.role == "trainer":
        logger.info("run trainer")
        train_loop(t.get_trainer_program())
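Every hyperparameter above comes from `parse_args()`, whose implementation is not shown. The following `argparse` sketch is a hypothetical reconstruction covering exactly the attributes the two examples access; flag names mirror those attributes and all defaults are assumptions:

import argparse

def parse_args():
    parser = argparse.ArgumentParser("din train (sketch)")
    parser.add_argument("--config_path", type=str, default="data/config.txt")
    parser.add_argument("--train_dir", type=str, default="data/train")
    parser.add_argument("--model_dir", type=str, default="models")
    parser.add_argument("--epoch_num", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--base_lr", type=float, default=0.85)
    parser.add_argument("--use_cuda", type=int, default=0)
    parser.add_argument("--parallel", type=int, default=0)
    # distributed flags used by Example #1
    parser.add_argument("--role", type=str, default="trainer")
    parser.add_argument("--endpoints", type=str, default="127.0.0.1:6000")
    parser.add_argument("--current_endpoint", type=str, default="127.0.0.1:6000")
    parser.add_argument("--trainer_id", type=int, default=0)
    parser.add_argument("--trainers", type=int, default=1)
    # continuous-evaluation flags used by Example #2
    parser.add_argument("--enable_ce", action="store_true")
    parser.add_argument("--batch_num", type=int, default=100000)
    parser.add_argument("--num_devices", type=int, default=1)
    return parser.parse_args()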
Example #2
def train():
    args = parse_args()

    if args.enable_ce:
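        # fix random seeds so continuous-evaluation (CE) runs are reproducible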
        SEED = 102
        fluid.default_main_program().random_seed = SEED
        fluid.default_startup_program().random_seed = SEED

    config_path = args.config_path
    train_path = args.train_dir
    epoch_num = args.epoch_num
    use_cuda = bool(args.use_cuda)
    use_parallel = bool(args.parallel)

    logger.info("reading data begins")
    user_count, item_count, cat_count = reader.config_read(config_path)
    data_reader, max_len = reader.prepare_reader(
        train_path, args.batch_size * args.num_devices)
    logger.info("reading data completes")

    avg_cost, pred, feed_list = network.network(item_count, cat_count)
    fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(
        clip_norm=5.0))
    base_lr = args.base_lr
    boundaries = [410000]
    values = [base_lr, 0.2]
    sgd_optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.piecewise_decay(boundaries=boundaries,
                                                   values=values))
    sgd_optimizer.minimize(avg_cost)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    # iterable DataLoader: prefetches batches asynchronously into a bounded
    # queue (capacity=10000) and yields feed-ready data per device
    loader = fluid.io.DataLoader.from_generator(feed_list=feed_list,
                                                capacity=10000,
                                                iterable=True)
    loader.set_sample_list_generator(data_reader, places=place)
    if use_parallel:
        train_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
                                           loss_name=avg_cost.name)
    else:
        train_exe = exe

    logger.info("train begins")

    global_step = 0
    PRINT_STEP = 1000

    total_time = []
    ce_info = []
    start_time = time.time()
    loss_sum = 0.0
    for epoch_id in range(epoch_num):
        epoch = epoch_id + 1
        for data in loader():
            global_step += 1
            results = train_exe.run(feed=data,
                                    fetch_list=[avg_cost.name, pred.name],
                                    return_numpy=True)
            loss_sum += results[0].mean()

            if global_step % PRINT_STEP == 0:
                ce_info.append(loss_sum / PRINT_STEP)
                total_time.append(time.time() - start_time)
                logger.info(
                    "epoch: %d\tglobal_step: %d\ttrain_loss: %.4f\t\ttime: %.2f"
                    % (epoch, global_step, loss_sum / PRINT_STEP,
                       time.time() - start_time))
                start_time = time.time()
                loss_sum = 0.0

                if (global_step > 400000 and global_step % PRINT_STEP
                        == 0) or (global_step <= 400000
                                  and global_step % 50000 == 0):
                    save_dir = os.path.join(args.model_dir,
                                            "global_step_" + str(global_step))
                    feed_var_name = [
                        "hist_item_seq", "hist_cat_seq", "target_item",
                        "target_cat", "label", "mask", "target_item_seq",
                        "target_cat_seq"
                    ]
                    fetch_vars = [avg_cost, pred]
                    fluid.io.save_inference_model(save_dir, feed_var_name,
                                                  fetch_vars, exe)
                    logger.info("model saved in " + save_dir)
            if args.enable_ce and global_step >= args.batch_num:
                break
    # only for continuous evaluation (CE)
    if args.enable_ce:
        gpu_num = get_cards(args)
        ce_loss = 0
        ce_time = 0
        try:
            ce_loss = ce_info[-1]
            ce_time = total_time[-1]
        except IndexError:  # ce_info/total_time are empty if no step was logged
            print("ce info error")
        print("kpis\teach_pass_duration_card%s\t%s" % (gpu_num, ce_time))
        print("kpis\ttrain_loss_card%s\t%s" % (gpu_num, ce_loss))