Beispiel #1
0
        def save_inference_model():
            """Export an inference model for the current epoch when one is due.

            Settings are read through ``envs.get_global_env`` under the
            ``save.inference.*`` keys. The function returns without saving when
            ``need_save`` declines this epoch, or when either the feed or the
            fetch variable names are not configured. Otherwise the model is
            written to ``<dirname>/<epoch_id>`` and ``(epoch_id, path)`` is
            appended to ``self.inference_models``.
            """
            interval = envs.get_global_env("save.inference.epoch_interval", -1,
                                           namespace)
            if not need_save(epoch_id, interval, False):
                return

            feed_varnames = envs.get_global_env(
                "save.inference.feed_varnames", None, namespace)
            fetch_varnames = envs.get_global_env(
                "save.inference.fetch_varnames", None, namespace)
            # Both name lists are required; a missing one silently skips export.
            if feed_varnames is None:
                return
            if fetch_varnames is None:
                return

            # Resolve each fetch name to its variable in the main program's
            # global block.
            fetch_vars = []
            for name in fetch_varnames:
                block_vars = fluid.default_main_program().global_block().vars
                fetch_vars.append(block_vars[name])

            base_dir = envs.get_global_env("save.inference.dirname", None,
                                           namespace)
            assert base_dir is not None
            epoch_dir = os.path.join(base_dir, str(epoch_id))

            # The fleet API takes the executor first, fluid.io takes it last.
            if not is_fleet:
                fluid.io.save_inference_model(epoch_dir, feed_varnames,
                                              fetch_vars, self._exe)
            else:
                fleet.save_inference_model(self._exe, epoch_dir, feed_varnames,
                                           fetch_vars)
            self.inference_models.append((epoch_id, epoch_dir))
Beispiel #2
0
        def save_inference_model():
            """Report that inference-model export is unavailable in this trainer.

            When ``need_save`` says an export is due for the current epoch, a
            notice is printed and nothing is written; otherwise the function
            returns silently. ``self.inference_models`` is never modified.
            """
            save_interval = envs.get_global_env(
                "save.inference.epoch_interval", -1, namespace)

            if not need_save(epoch_id, save_interval, False):
                return

            # Export is intentionally disabled here. The code that previously
            # followed this message sat behind an unconditional ``return`` and
            # could never execute, so it has been removed.
            print("save inference model is not supported now.")
Beispiel #3
0
    def save_model(self, FLAGS, net_output, global_step):
        """
            Write a checkpoint for ``global_step`` from the first worker.

            Nothing happens unless ``global_step`` is the string "final" or a
            multiple of ``FLAGS.save_model_steps``, and this process is the
            first fleet worker. When both hold, the inference model and the
            persistables are saved under
            ``<FLAGS.train_dir>/checkpoint_<global_step>`` and the checkpoint
            is recorded via ``self.record_checkpoint``.
        """
        is_final = global_step == "final"
        if not is_final and global_step % FLAGS.save_model_steps != 0:
            return
        # Checked second so the worker query only runs for steps due a save.
        if not fleet.is_first_worker():
            return

        path = "%s/checkpoint_%s" % (FLAGS.train_dir, global_step)
        executor = self.paddle_env['exe']
        model_output = net_output['model_output']

        fleet.save_inference_model(executor, path,
                                   model_output['feeded_var_names'],
                                   model_output['fetch_targets'])
        # Persistables are written to the same directory as the inference model.
        fleet.save_persistables(self.paddle_env['exe'], path)
        self.record_checkpoint(FLAGS, global_step)
     # Train on `dataset` with the default main program; the fetch handler is
     # built from the AUC variable's name (meaning of 10 / True is defined by
     # FH, which is not visible here).
     exe.train_from_dataset(
         program=fluid.default_main_program(),
         dataset=dataset,
         fetch_handler=FH([auc_var.name], 10, True),
         # fetch_list=[auc_var],
         # fetch_info=["auc"],
         debug=False)
 # Output directory name is derived from the current date/hour.
 path = "./saved_models/" + current_date_hr.strftime(
     DATE_TIME_STRING_FORMAT) + "_model/"
 logger.info("save inference program: " + path)
 # NOTE(review): this condition looks inverted. With `<= 1` an empty y_auc
 # reaches y_auc[-1] (IndexError), while two or more entries log "too small".
 # Presumably `>= 1` was intended - confirm against the original source.
 if len(y_auc) <= 1:
     logger.info("Current AUC: " + str(y_auc[-1]))
 else:
     logger.info("Dataset is too small, cannot get AUC.")
 # Feed variables are the sparse inputs plus the label; the fetch target is
 # the AUC variable.
 fetch_list = fleet.save_inference_model(
     exe, path,
     [x.name for x in sparse_input_ids] + [label.name],
     [auc_var])
 # Upload the saved model directory to HDFS with `hadoop fs -put -f`; the
 # destination is dataset_prefix joined with the part of the formatted date
 # before the first "/". Command output is discarded and the exit status of
 # os.system is not checked.
 os.system("hadoop fs -D hadoop.job.ugi=" + hdfs_ugi +
           " -D fs.defaultFS=" + hdfs_address + " -put -f " +
           path + " " + os.path.join(
               dataset_prefix,
               current_date_hr.strftime(
                   DATE_TIME_STRING_FORMAT).split("/")[0]) +
           " >/dev/null 2>&1")
 # Create an empty local marker file named "donefile".
 os.system('touch donefile')
 os.system(
     "hadoop fs -D hadoop.job.ugi=" + hdfs_ugi +
     " -D fs.defaultFS=" + hdfs_address + " -put -f donefile" +
     " " + os.path.join(
         dataset_prefix,
         current_date_hr.strftime(DATE_TIME_STRING_FORMAT) +
Beispiel #5
0
def train(use_cuda, save_dirname, is_local, is_increment):
    """
    Train the model, either in a single process or as a fleet job.

    Args:
        use_cuda: run on ``fluid.CUDAPlace(0)`` when true, otherwise on CPU.
        save_dirname: directory that receives the periodic inference-model and
            persistable snapshots; ``None`` disables saving and the ``infer``
            evaluation that follows each snapshot.
        is_local: when true, train locally with a data-parallel compiled
            program; otherwise initialise fleet and act as server or worker.
        is_increment: local mode only - load parameters from ``old_model``
            before training and zero the AUC state variables.

    Every 10 batches the mean cost and AUC since the last report are printed
    together with the read time and train time of the latest batch. Every
    1000 batches (batch 0 included) the model is saved and evaluated.
    """
    import time

    # predict, avg_cost, feed_order, auc_var, auc_batch, auc_states = model()
    # NOTE(review): old_model is hard-coded to None, so the is_increment path
    # below has no directory to load from - confirm where it should come from.
    old_model = None
    model_args = model()
    predict = model_args['predict']
    avg_cost = model_args['avg_cost']
    feed_order = model_args['feed_order']
    loader = model_args['loader']
    auc_batch = model_args['auc'][1]

    # Base optimizer; in distributed mode it is wrapped by
    # fleet.distributed_optimizer, which adds the distributed strategy.
    sgd_optimizer = AdamOptimizer(learning_rate=2e-4)

    if is_local:
        sgd_optimizer.minimize(avg_cost)
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

        exe = Executor(place)
        # Sixteen reader instances over the training directory, merged by a
        # multiprocess reader that feeds the loader.
        readers = []
        for _ in range(16):
            readers.append(data_reader(cluster_train_dir))
        multi_readers = paddle.reader.multiprocess_reader(readers)
        loader.set_sample_generator(
            multi_readers, batch_size=BATCH_SIZE,
            places=fluid.cpu_places(CPU_NUM))

        start_program = fluid.default_startup_program()
        exe.run(start_program)
        main_prog = fluid.default_main_program()

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = CPU_NUM * 2
        build_strategy = fluid.BuildStrategy()
        # Reduce strategy is faster than all-reduce on CPU.
        build_strategy.reduce_strategy = \
            fluid.BuildStrategy.ReduceStrategy.Reduce
        build_strategy.fuse_broadcast_ops = True
        main_program = fluid.CompiledProgram(main_prog).with_data_parallel(
            loss_name=avg_cost.name,
            exec_strategy=exec_strategy,
            build_strategy=build_strategy)

        if is_increment:  # load model to fine-tune
            # load_params needs the plain Program, not the CompiledProgram
            # wrapper built above.
            fluid.io.load_params(exe, old_model, main_prog)
            for auc_state in model_args['auc'][2]:
                set_zero(place, fluid.global_scope(), auc_state.name)

        cost_list = []
        auc_list = []
        pass_s_time = time.time()
        for pass_id in range(PASS_NUM):
            s_time = time.time()
            for batch_id, data in enumerate(loader()):
                # Time spent waiting for this batch from the loader.
                r_time = time.time() - s_time
                st_time = time.time()
                cost_value, auc_value = exe.run(
                    program=main_program,
                    feed=data,
                    fetch_list=[avg_cost.name, auc_batch.name])
                t_time = time.time() - st_time
                cost_list.append(np.array(cost_value))
                auc_list.append(np.array(auc_value))

                if batch_id % 10 == 0 and batch_id != 0:
                    # Single parenthesised argument: valid on Python 2 and 3.
                    print("Pass %d, batch %d, cost %s auc %s readtime %f triantime %f" %
                          (pass_id, batch_id, np.array(cost_list).mean(),
                           np.array(auc_list).mean(), r_time, t_time))
                    cost_list = []
                    auc_list = []
                if batch_id % 1000 == 0:
                    if save_dirname is not None:
                        fluid.io.save_inference_model(
                            save_dirname,
                            feed_order,
                            [predict, avg_cost, auc_batch], exe
                        )
                        fluid.io.save_persistables(exe, save_dirname)
                        infer(cluster_test_dir, save_dirname, feed_order)
                # Restart the read timer after any save/eval work.
                s_time = time.time()
        pass_time = time.time() - pass_s_time
        print("Pass train time: %f" % pass_time)

    else:
        role = role_maker.PaddleCloudRoleMaker()
        # Fully asynchronous training.
        config = DistributeTranspilerConfig()
        config.sync_mode = False
        config.runtime_split_send_recv = True
        # Initialise the fleet environment.
        fleet.init(role)

        optimizer = fleet.distributed_optimizer(sgd_optimizer, config)
        optimizer.minimize(avg_cost)

        if fleet.is_server():
            fleet.init_server()
            fleet.run_server()
        # Start the worker.
        if fleet.is_worker():
            # Initialise the worker configuration.
            fleet.init_worker()

            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
            exe = Executor(place)

            feeder = fluid.DataFeeder(feed_order, place)
            train_reader = feeder.decorate_reader(
                paddle.batch(paddle.reader.shuffle(
                    data_reader(cluster_train_dir), buf_size=8192),
                    batch_size=BATCH_SIZE),
                multi_devices=False, drop_last=True)

            exe.run(fleet.startup_program)

            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.num_threads = CPU_NUM
            build_strategy = fluid.BuildStrategy()
            build_strategy.async_mode = True

            if CPU_NUM > 1:
                build_strategy.reduce_strategy = \
                    fluid.BuildStrategy.ReduceStrategy.Reduce

            compiled_prog = fluid.compiler.CompiledProgram(
                fleet.main_program).with_data_parallel(
                    loss_name=avg_cost.name,
                    build_strategy=build_strategy,
                    exec_strategy=exec_strategy)

            for pass_id in range(PASS_NUM):
                cost_list = []
                auc_list = []
                s_time = time.time()
                for batch_id, data in enumerate(train_reader()):
                    r_time = time.time() - s_time
                    # Time the run against its own start. Subtracting the
                    # read duration from the wall clock, as was done before,
                    # produced a meaningless value.
                    st_time = time.time()
                    cost_value, auc_value = exe.run(
                        program=compiled_prog, feed=data,
                        fetch_list=[avg_cost.name, auc_batch.name])
                    t_time = time.time() - st_time
                    cost_list.append(np.array(cost_value))
                    auc_list.append(np.array(auc_value))

                    if batch_id % 10 == 0 and batch_id != 0:
                        print("Pass %d, batch %d, cost %s auc %s readtime %f traintime %f" %
                              (pass_id, batch_id, np.array(cost_list).mean(),
                               np.array(auc_list).mean(), r_time, t_time))
                        cost_list = []
                        auc_list = []
                    # Only the first worker writes snapshots.
                    if batch_id % 1000 == 0 and fleet.is_first_worker():
                        if save_dirname is not None:
                            fleet.save_inference_model(
                                exe,
                                save_dirname,
                                feed_order,
                                [predict, avg_cost, auc_batch]
                            )
                            fleet.save_persistables(exe, save_dirname)
                            infer(cluster_test_dir, save_dirname, feed_order)
                    s_time = time.time()
        fleet.stop_worker()
def save(predict,savaPath,exe):
    """Save an inference model for ``predict`` into ``savaPath`` via fleet.

    The directory is created first when it does not exist yet. The model is
    exported with a single feed variable named 'images' and ``predict`` as
    the only target variable, using executor ``exe``.
    """
    directory_missing = not os.path.exists(savaPath)
    if directory_missing:
        os.makedirs(savaPath)
    print('save models to %s' % (savaPath))
    fleet.save_inference_model(
        dirname=savaPath,
        feeded_var_names=['images'],
        target_vars=[predict],
        executor=exe,
    )
def train(use_cuda, train_sample_dir, test_sample_dir, old_model, output_model,
          is_local, is_increment):
    """
    Train the three-output model, either locally or as a fleet job.

    Args:
        use_cuda: run on ``fluid.CUDAPlace(0)`` when true, otherwise on CPU.
        train_sample_dir: training data directory, read only in distributed
            mode (local mode reads from ``streaming_data_reader()``).
        test_sample_dir: data directory handed to ``infer`` after each save.
        old_model: directory of a previously saved model, used when
            ``is_increment`` is true.
        output_model: directory for the periodic inference-model and
            persistable snapshots; ``None`` disables saving and evaluation.
        is_local: when true, train in-process with a ``ParallelExecutor``;
            otherwise initialise fleet and act as server or worker.
        is_increment: load ``old_model`` before training (parameters in local
            mode, server initialisation in distributed mode).

    The mean cost since the last report is printed every 100 batches.
    Snapshots are taken every 2000 batches locally and every 1000 batches on
    the first worker in distributed mode (batch 0 included in both cases).
    """
    # predict, avg_cost, feed_order, auc_var, auc_batch, auc_states = model()
    model_args = model()
    navi_predict = model_args['predict'][0]
    voice_navi_predict = model_args['predict'][1]
    speed_navi_predict = model_args['predict'][2]
    avg_cost = model_args['avg_cost']
    feed_order = model_args['feed_order']

    role = role_maker.PaddleCloudRoleMaker()
    # Fully asynchronous training.
    config = DistributeTranspilerConfig()
    config.sync_mode = False
    config.runtime_split_send_recv = True

    sgd_optimizer = AdamOptimizer(learning_rate=2e-4)

    if is_local:
        sgd_optimizer.minimize(avg_cost)
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

        exe = Executor(place)

        feeder = fluid.DataFeeder(feed_order, place)
        train_reader = feeder.decorate_reader(
            paddle.batch(
                paddle.reader.shuffle(streaming_data_reader(), buf_size=8192),
                batch_size=BATCH_SIZE),
            multi_devices=False,
            drop_last=True)
        start_program = fluid.default_startup_program()
        exe.run(start_program)
        main_program = fluid.default_main_program()
        if is_increment:  # load model to fine-tune
            fluid.io.load_params(exe, old_model, main_program)

        # NOTE(review): exec_strategy and build_strategy are configured here
        # but never handed to the ParallelExecutor below - confirm whether
        # they were meant to be passed.
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = CPU_NUM
        main_program.num_threads = CPU_NUM
        build_strategy = fluid.BuildStrategy()
        build_strategy.async_mode = True

        # Parallel training for higher throughput.
        train_pe = fluid.ParallelExecutor(use_cuda=use_cuda,
                                          main_program=main_program,
                                          loss_name=avg_cost.name)

        cost_list = []
        for pass_id in range(PASS_NUM):
            for batch_id, data in enumerate(train_reader()):
                cost_value = train_pe.run(feed=data,
                                          fetch_list=[avg_cost.name])
                cost_list.append(np.array(cost_value))

                if batch_id % 100 == 0 and batch_id != 0:
                    # Single parenthesised argument: valid on Python 2 and 3.
                    print("Pass %d, batch %d, cost %s" %
                          (pass_id, batch_id, np.array(cost_list).mean()))
                    cost_list = []
                if batch_id % 2000 == 0:
                    if output_model is not None:
                        fluid.io.save_inference_model(
                            output_model, feed_order, [
                                navi_predict, voice_navi_predict,
                                speed_navi_predict, avg_cost
                            ], exe)
                        fluid.io.save_persistables(exe, output_model)
                        infer(test_sample_dir, output_model, feed_order)

    else:
        # Initialise the fleet environment.
        fleet.init(role)
        # Wrap the optimizer so it carries the distributed strategy config.
        optimizer = fleet.distributed_optimizer(sgd_optimizer, config)
        optimizer.minimize(avg_cost)

        if fleet.is_server():
            if is_increment:
                fleet.init_server(old_model)
            else:
                fleet.init_server()
            fleet.run_server()
        # Start the worker.
        if fleet.is_worker():
            # Initialise the worker configuration.
            fleet.init_worker()

            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

            exe = Executor(place)

            feeder = fluid.DataFeeder(feed_order, place)
            train_reader = feeder.decorate_reader(
                paddle.batch(
                    paddle.reader.shuffle(data_reader(train_sample_dir),
                                          buf_size=8192),
                    batch_size=BATCH_SIZE),
                multi_devices=False,
                drop_last=True)
            exe.run(fleet.startup_program)

            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.num_threads = CPU_NUM
            build_strategy = fluid.BuildStrategy()
            build_strategy.async_mode = True

            if CPU_NUM > 1:
                build_strategy.reduce_strategy = \
                    fluid.BuildStrategy.ReduceStrategy.Reduce

            compiled_prog = fluid.compiler.CompiledProgram(
                fleet.main_program).with_data_parallel(
                    loss_name=avg_cost.name,
                    build_strategy=build_strategy,
                    exec_strategy=exec_strategy)

            cost_list = []
            for pass_id in range(PASS_NUM):
                for batch_id, data in enumerate(train_reader()):
                    cost_value = exe.run(program=compiled_prog,
                                         feed=data,
                                         fetch_list=[avg_cost.name])
                    cost_list.append(np.array(cost_value))

                    if batch_id % 100 == 0 and batch_id != 0:
                        print("Pass %d, batch %d, cost %s" %
                              (pass_id, batch_id, np.array(cost_list).mean()))
                        cost_list = []
                    # Only the first worker writes snapshots.
                    if batch_id % 1000 == 0 and fleet.is_first_worker():
                        if output_model is not None:
                            fleet.save_inference_model(
                                exe, output_model, feed_order, [
                                    navi_predict, voice_navi_predict,
                                    speed_navi_predict, avg_cost
                                ])
                            fleet.save_persistables(exe, output_model)
                            infer(test_sample_dir, output_model, feed_order)
        fleet.stop_worker()