Ejemplo n.º 1
0
def train():
    """Train the CTR DeepFM model on CPU, saving persistables after each epoch.

    Configuration is read from ``parse_args()``. The network is built by
    ``ctr_deepfm_model`` and optimized with SGD plus L2 decay (``args.reg``).
    Every entry of ``args.train_data_dir`` is used as a training file and is
    streamed to the dataset through the ``criteo_reader.py`` pipe command.
    After each epoch the persistable variables are written to
    ``<args.model_output_dir>/epoch_<n>``, where ``n`` starts at 1.

    Raises:
        OSError: if ``args.model_output_dir`` does not exist and cannot be
            created.
    """
    args = parse_args()
    # add ce
    if args.enable_ce:
        # Pin the random seed of both default programs to a fixed value.
        # NOTE(review): "ce" presumably means continuous evaluation -- confirm.
        seed = 102
        fluid.default_main_program().random_seed = seed
        fluid.default_startup_program().random_seed = seed

    print('---------- Configuration Arguments ----------')
    for key, value in args.__dict__.items():
        print(key + ':' + str(value))

    # makedirs() also creates missing parent directories. Tolerating an
    # already-existing directory removes the check-then-create race that
    # isdir() followed by mkdir() has.
    try:
        os.makedirs(args.model_output_dir)
    except OSError:
        if not os.path.isdir(args.model_output_dir):
            raise

    # auc_states is returned by the model builder but not needed for training.
    loss, auc, data_list, _ = ctr_deepfm_model(
        args.embedding_size, args.num_field, args.num_feat, args.layer_sizes,
        args.act, args.reg)
    optimizer = fluid.optimizer.SGD(
        learning_rate=args.lr,
        regularization=fluid.regularizer.L2DecayRegularizer(args.reg))
    optimizer.minimize(loss)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())

    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_use_var(data_list)
    # Raw files are parsed by an external reader process driven by the
    # feature dictionary path.
    pipe_command = 'python criteo_reader.py {}'.format(args.feat_dict)
    dataset.set_pipe_command(pipe_command)
    dataset.set_batch_size(args.batch_size)
    dataset.set_thread(args.num_thread)
    train_filelist = [
        os.path.join(args.train_data_dir, x)
        for x in os.listdir(args.train_data_dir)
    ]

    print('---------------------------------------------')
    for epoch_id in range(args.num_epoch):
        start = time.time()
        dataset.set_filelist(train_filelist)
        exe.train_from_dataset(
            program=fluid.default_main_program(),
            dataset=dataset,
            fetch_list=[loss, auc],
            fetch_info=['epoch %d batch loss' % (epoch_id + 1), "auc"],
            print_period=1000,
            debug=False)
        model_dir = os.path.join(args.model_output_dir,
                                 'epoch_' + str(epoch_id + 1))
        # The reported time covers training only, not the save below.
        sys.stderr.write('epoch%d is finished and takes %f s\n' %
                         ((epoch_id + 1), time.time() - start))
        fluid.io.save_persistables(executor=exe,
                                   dirname=model_dir,
                                   main_program=fluid.default_main_program())
Ejemplo n.º 2
0
def infer():
    """Evaluate a saved DeepFM checkpoint on the test files using CPU.

    Settings come from ``parse_args()``. The model is rebuilt inside a fresh
    scope and program pair, persistables are loaded from
    ``<args.model_output_dir>/epoch_<args.test_epoch>``, and the AUC state
    variables are zeroed before evaluation. For every test batch the
    cumulative loss divided by the number of instances seen so far is logged
    together with the fetched AUC value.
    """
    args = parse_args()

    cpu_place = fluid.CPUPlace()
    scope = fluid.Scope()

    # Every entry of the test directory is treated as a data file.
    data_dir = args.test_data_dir
    data_files = [
        os.path.join(data_dir, name) for name in os.listdir(data_dir)
    ]
    reader_source = CriteoDataset()
    reader_source.setup(args.feat_dict)
    batched_reader = paddle.batch(
        reader_source.test(data_files), batch_size=args.batch_size)

    startup_prog = fluid.framework.Program()
    eval_prog = fluid.framework.Program()
    checkpoint_dir = os.path.join(args.model_output_dir,
                                  'epoch_' + args.test_epoch)

    with fluid.scope_guard(scope), \
            fluid.framework.program_guard(eval_prog, startup_prog):
        loss, auc, data_list, auc_states = ctr_deepfm_model(
            args.embedding_size, args.num_field, args.num_feat,
            args.layer_sizes, args.act, args.reg)

        exe = fluid.Executor(cpu_place)
        feeder = fluid.DataFeeder(feed_list=data_list, place=cpu_place)
        fluid.io.load_persistables(
            executor=exe,
            dirname=checkpoint_dir,
            main_program=fluid.default_main_program())

        # Reset the AUC states after loading the checkpoint.
        for state_var in auc_states:
            set_zero(state_var.name, scope=scope, place=cpu_place)

        loss_name = loss.name
        auc_name = auc.name
        total_loss = 0
        seen_instances = 0
        for batch_id, batch in enumerate(batched_reader()):
            fetched = exe.run(eval_prog,
                              feed=feeder.feed(batch),
                              fetch_list=[loss_name, auc_name])
            loss_val, auc_val = fetched
            seen_instances += len(batch)
            total_loss += loss_val
            running_loss = total_loss / seen_instances
            logger.info('TEST --> batch: {} loss: {} auc_val: {}'.format(
                batch_id, running_loss, auc_val))

        print(
            'The last log info is the total Logloss and AUC for all test data. '
        )
Ejemplo n.º 3
0
def train():
    """Train the CTR DeepFM model locally or as part of a distributed job.

    Configuration is read from ``parse_args()``. The network is built by
    ``ctr_deepfm_model`` and optimized with SGD plus L2 decay (``args.reg``).

    * ``args.is_local`` true: trains the default main program in-process.
    * otherwise: the program is transpiled with ``DistributeTranspiler`` and
      the process acts according to ``args.role``: ``"pserver"`` runs the
      parameter-server programs, ``"trainer"`` runs the training loop on the
      trainer program. Any other role value does nothing further.

    Only the process with ``args.trainer_id == 0`` creates
    ``args.model_output_dir`` and saves the model after each epoch to
    ``<args.model_output_dir>/epoch_<n>``, where ``n`` starts at 1.

    Raises:
        OSError: if the output directory does not exist and cannot be created.
    """
    args = parse_args()
    print(args)

    if args.trainer_id == 0:
        # makedirs() also creates missing parent directories. Tolerating an
        # already-existing directory avoids failing when another process
        # that also has trainer_id == 0 creates it between an isdir() check
        # and the create call.
        try:
            os.makedirs(args.model_output_dir)
        except OSError:
            if not os.path.isdir(args.model_output_dir):
                raise

    # auc_states is returned by the model builder but not needed for training.
    loss, auc, data_list, _ = ctr_deepfm_model(
        args.embedding_size, args.num_field, args.num_feat, args.layer_sizes,
        args.act, args.reg, args.is_sparse)
    optimizer = fluid.optimizer.SGD(
        learning_rate=args.lr,
        regularization=fluid.regularizer.L2DecayRegularizer(args.reg))
    optimizer.minimize(loss)

    def train_loop(main_program):
        """Run ``args.num_epoch`` epochs of ``main_program`` over the data."""
        start_time = time.time()
        dataset = fluid.DatasetFactory().create_dataset()
        dataset.set_use_var(data_list)
        # Raw files are parsed by an external reader process driven by the
        # feature dictionary path.
        pipe_command = 'python criteo_reader.py {}'.format(args.feat_dict)
        dataset.set_pipe_command(pipe_command)
        dataset.set_batch_size(args.batch_size)
        train_filelist = [
            os.path.join(args.train_data_dir, x)
            for x in os.listdir(args.train_data_dir)
        ]

        # The thread count is set exactly once, by whichever device branch
        # is taken: a single thread on GPU, args.num_thread on CPU.
        if args.use_gpu == 1:
            exe = fluid.Executor(fluid.CUDAPlace(0))
            dataset.set_thread(1)
        else:
            exe = fluid.Executor(fluid.CPUPlace())
            dataset.set_thread(args.num_thread)
        exe.run(fluid.default_startup_program())

        for epoch_id in range(args.num_epoch):
            start = time.time()
            sys.stderr.write('\nepoch%d start ...\n' % (epoch_id + 1))
            dataset.set_filelist(train_filelist)
            exe.train_from_dataset(
                program=main_program,
                dataset=dataset,
                fetch_list=[loss, auc],
                fetch_info=['epoch %d batch loss' % (epoch_id + 1), "auc"],
                print_period=5,
                debug=False)
            model_dir = os.path.join(args.model_output_dir,
                                     'epoch_' + str(epoch_id + 1))
            # The reported time covers training only, not the save below.
            sys.stderr.write('epoch%d is finished and takes %f s\n' % (
                (epoch_id + 1), time.time() - start))
            if args.trainer_id == 0:  # only trainer 0 save model
                print("save model in {}".format(model_dir))
                fluid.save(main_program, model_dir)

        print("train time cost {:.4f}".format(time.time() - start_time))
        print("finish training")

    if args.is_local:
        print("run local training")
        train_loop(fluid.default_main_program())
    else:
        print("run distribute training")
        t = fluid.DistributeTranspiler()
        t.transpile(
            args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
        if args.role == "pserver":
            print("run psever")
            pserver_prog, pserver_startup = t.get_pserver_programs(
                args.current_endpoint)

            exe = fluid.Executor(fluid.CPUPlace())
            exe.run(pserver_startup)
            exe.run(pserver_prog)
        elif args.role == "trainer":
            print("run trainer")
            train_loop(t.get_trainer_program())
        # NOTE(review): any other args.role value falls through silently.