    def _get_dataset(self):
        namespace = "train.reader"

        inputs = self.model.get_inputs()
        threads = envs.get_global_env("train.threads", None)
        batch_size = envs.get_global_env("batch_size", None, namespace)
        reader_class = envs.get_global_env("class", None, namespace)
        abs_dir = os.path.dirname(os.path.abspath(__file__))
        reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py')
        pipe_cmd = "python {} {} {} {}".format(reader, reader_class, "TRAIN",
                                               self._config_yaml)
        train_data_path = envs.get_global_env("train_data_path", None,
                                              namespace)

        dataset = fluid.DatasetFactory().create_dataset()
        dataset.set_use_var(inputs)
        dataset.set_pipe_command(pipe_cmd)
        dataset.set_batch_size(batch_size)
        dataset.set_thread(threads)
        file_list = [
            os.path.join(train_data_path, x)
            for x in os.listdir(train_data_path)
        ]

        dataset.set_filelist(file_list)
        return dataset
    def wait_and_prepare_infer_dataset(self, day, pass_index):
        test_data_path = self.config.get("runner.infer_data_dir", [])
        dataset = fluid.DatasetFactory().create_dataset(self.reader_type)
        dataset.set_use_var(self.input_data)
        dataset.set_batch_size(self.config.get('runner.infer_batch_size', 1))
        dataset.set_thread(self.config.get('runner.infer_thread_num', 1))
        dataset.set_hdfs_config(self.hadoop_fs_name, self.hadoop_fs_ugi)
        dataset.set_parse_ins_id(self.config.get("runner.parse_ins_id", False))
        dataset.set_parse_content(
            self.config.get("runner.parse_content", False))

        cur_path = []
        for i in self.online_intervals[pass_index - 1]:
            p = os.path.join(test_data_path, day, str(i))
            if self.data_donefile:
                cur_donefile = os.path.join(p, self.data_donefile)
                data_ready(cur_donefile, self.data_sleep_second,
                           self.hadoop_client)
            cur_path.append(p)
        global_file_list = file_ls(cur_path, self.hadoop_client)
        my_file_list = fleet.util.get_file_shard(global_file_list)
        logger.info("my_file_list = {}".format(my_file_list))
        dataset.set_filelist(my_file_list)

        self.pipe_command = "{} {} {}".format(
            self.config.get("runner.pipe_command"),
            config.get("yaml_path"), get_utils_file_path())
        dataset.set_pipe_command(self.pipe_command)
        dataset.load_into_memory()
        return dataset
Example #3
    def do_training(self, fleet):
        dnn_input_dim, lr_input_dim, train_file_path = ctr_dataset_reader.prepare_data(
        )

        exe = fluid.Executor(fluid.CPUPlace())

        fleet.init_worker()
        exe.run(fleet.startup_program)

        thread_num = 2
        filelist = []
        for _ in range(thread_num):
            filelist.append(train_file_path)

        # config dataset
        dataset = fluid.DatasetFactory().create_dataset()
        dataset.set_batch_size(128)
        dataset.set_use_var(self.feeds)
        pipe_command = 'python ctr_dataset_reader.py'
        dataset.set_pipe_command(pipe_command)

        dataset.set_filelist(filelist)
        dataset.set_thread(thread_num)

        for epoch_id in range(2):
            pass_start = time.time()
            dataset.set_filelist(filelist)
            exe.train_from_dataset(
                program=fleet.main_program,
                dataset=dataset,
                fetch_list=[self.avg_cost],
                fetch_info=["cost"],
                print_period=2,
                debug=False)
            pass_time = time.time() - pass_start

        class FH(fluid.executor.FetchHandler):
            def handler(self, fetch_target_vars):
                for i in range(len(fetch_target_vars)):
                    print("{}: \n {}\n".format(self.fetch_target_names[i],
                                               fetch_target_vars[i]))

        for epoch_id in range(2):
            pass_start = time.time()
            dataset.set_filelist(filelist)
            exe.train_from_dataset(
                program=fleet.main_program,
                dataset=dataset,
                fetch_handler=FH([self.avg_cost.name],
                                 period_secs=2,
                                 return_np=True),
                debug=False)
            pass_time = time.time() - pass_start

        model_dir = tempfile.mkdtemp()
        fleet.save_inference_model(
            exe, model_dir, [feed.name for feed in self.feeds], self.avg_cost)
        self.check_model_right(model_dir)
        shutil.rmtree(model_dir)
        fleet.stop_worker()
Example #4
    def create_dataset(self, FLAGS, factory):
        """
        DatasetFactory is a factory which create dataset by its name, 
        We can create "QueueDataset" or "InMemoryDataset",  or 
        "FileInstantDataset" the default is "QueueDataset". 
        """
        if FLAGS.data_reader != "dataset":
            return None

        dataset = fluid.DatasetFactory().create_dataset(FLAGS.dataset_mode)
        dataset.set_batch_size(FLAGS.batch_size) 
        dataset.set_use_var(self.input_layers)

        dir_name = os.path.dirname(__file__)
        pipe_command = (FLAGS.fluid_bin + " " + dir_name + "/dataset_reader.py " + 
                        ObjectTransform.pickle_dumps_to_str(factory['dataset']) + " " + 
                        ObjectTransform.pickle_dumps_to_str(self.input_names))
   
        """
        Set pipe command of current dataset 
        A pipe command is a UNIX pipeline command
        """
        dataset.set_pipe_command(pipe_command)
        #TODO: shuffle
        #Set thread num, it is the num of readers.
        dataset.set_thread(self.get_thread_num(FLAGS)) 

        return dataset 
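Note: the set_pipe_command(...) calls throughout these examples point at reader scripts (dataset_reader.py, dataset_generator.py, criteo_reader.py, and so on) whose contents are not shown. A minimal sketch of such a script, assuming the legacy paddle.fluid.incubate.data_generator API and an invented "<label> <id> <id> ..." line format, might look like the following; the dataset pipes each input file through it, and the generator writes samples to stdout in the MultiSlot text format:

# reader_sketch.py -- illustrative only; the class name and line format are
# assumptions, not taken from any example on this page.
import paddle.fluid.incubate.data_generator as dg


class ExampleReader(dg.MultiSlotDataGenerator):
    def generate_sample(self, line):
        def reader():
            fields = line.rstrip('\n').split(' ')
            label = [int(fields[0])]
            ids = [int(x) for x in fields[1:]]
            # each (name, values) pair becomes one slot of the sample
            yield [("label", label), ("ids", ids)]

        return reader


if __name__ == "__main__":
    # invoked as a pipe command: raw lines arrive on stdin, serialized
    # samples are written to stdout
    ExampleReader().run_from_stdin()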
Example #5
def save_program():
    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_use_var(sparse_input_ids + [label])
    pipe_command = "python criteo_dataset.py {}".format(sys.argv[1])
    dataset.set_pipe_command(pipe_command)
    dataset.set_batch_size(32)
    dataset.set_thread(10)
    optimizer = fluid.optimizer.SGD(0.0001)
    optimizer.minimize(avg_cost)
    exe = fluid.Executor(fluid.CPUPlace())

    input_folder = "hdfs:"
    output = sp.check_output(
        "hadoop fs -D hadoop.job.ugi=" + hdfs_ugi +
        " -D fs.defaultFS=" + hdfs_address +
        " -ls " + os.path.join(dataset_prefix, current_date_hr) +
        "/ | awk '{if(NR>1) print $8}'",
        shell=True)
    train_filelist = [
        "{}{}".format(input_folder, f)
        for f in output.decode('ascii').strip().split('\n')
    ]
    train_filelist.remove('hdfs:' +
                          os.path.join(dataset_prefix, current_date_hr,
                                       'donefile'))
    train_filelist = [train_filelist[0]] 
    print(train_filelist)

    exe.run(fluid.default_startup_program())
    print("startup save program done.")
    dataset.set_filelist(train_filelist)
    exe.train_from_dataset(
        program=fluid.default_main_program(),
        dataset=dataset,
        fetch_list=[auc_var],
        fetch_info=["auc"],
        debug=False,)
        #print_period=10000)
    # save model here
    fetch_list = fluid.io.save_inference_model(inference_path, [x.name for x in inference_feed_vars], [predict], exe)
Example #6
    def create_dataset():
        dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
        dataset.set_use_var([x, y])
        dataset.set_batch_size(2)
        dataset.set_thread(1)
        dataset.set_filelist(filelist)
        return dataset
def get_dataset(inputs):
    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_use_var(inputs)
    dataset.set_batch_size(1)
    dataset.set_filelist([])
    dataset.set_thread(1)
    return dataset
Example #8
    def _get_dataset(self):
        namespace = "train.reader"

        inputs = self.model.get_inputs()
        threads = int(envs.get_runtime_environ("train.trainer.threads"))
        batch_size = envs.get_global_env("batch_size", None, namespace)
        reader_class = envs.get_global_env("class", None, namespace)
        abs_dir = os.path.dirname(os.path.abspath(__file__))
        reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py')
        pipe_cmd = "python {} {} {} {}".format(reader, reader_class, "TRAIN",
                                               self._config_yaml)

        train_data_path = envs.get_global_env("train_data_path", None,
                                              namespace)

        if train_data_path.startswith("fleetrec::"):
            package_base = envs.get_runtime_environ("PACKAGE_BASE")
            assert package_base is not None
            train_data_path = os.path.join(package_base,
                                           train_data_path.split("::")[1])

        dataset = fluid.DatasetFactory().create_dataset()
        dataset.set_use_var(inputs)
        dataset.set_pipe_command(pipe_cmd)
        dataset.set_batch_size(batch_size)
        dataset.set_thread(threads)
        file_list = [
            os.path.join(train_data_path, x)
            for x in os.listdir(train_data_path)
        ]

        dataset.set_filelist(file_list)
        return dataset
Example #9
    def test_dataset_create(self):
        """ Testcase for dataset create. """
        try:
            dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
        except:
            self.assertTrue(False)

        try:
            dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
        except:
            self.assertTrue(False)

        try:
            dataset = fluid.DatasetFactory().create_dataset("MyOwnDataset")
            self.assertTrue(False)
        except:
            self.assertTrue(True)
Example #10
    def get_dataset(self, inputs, files):
        dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
        dataset.set_batch_size(32)
        dataset.set_thread(3)
        dataset.set_filelist(files)
        dataset.set_pipe_command("cat")
        dataset.set_use_var(inputs)
        return dataset
Example #11
    def test_in_memory_dataset_run(self):
        """
        Testcase for InMemoryDataset from create to run.
        """
        with open("test_in_memory_dataset_run_a.txt", "w") as f:
            data = "1 1 2 3 3 4 5 5 5 5 1 1\n"
            data += "1 2 2 3 4 4 6 6 6 6 1 2\n"
            data += "1 3 2 3 5 4 7 7 7 7 1 3\n"
            f.write(data)
        with open("test_in_memory_dataset_run_b.txt", "w") as f:
            data = "1 4 2 3 3 4 5 5 5 5 1 4\n"
            data += "1 5 2 3 4 4 6 6 6 6 1 5\n"
            data += "1 6 2 3 5 4 7 7 7 7 1 6\n"
            data += "1 7 2 3 6 4 8 8 8 8 1 7\n"
            f.write(data)

        slots = ["slot1", "slot2", "slot3", "slot4"]
        slots_vars = []
        for slot in slots:
            var = fluid.layers.data(name=slot,
                                    shape=[1],
                                    dtype="int64",
                                    lod_level=1)
            slots_vars.append(var)

        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
        dataset.set_batch_size(32)
        dataset.set_thread(3)
        dataset.set_filelist([
            "test_in_memory_dataset_run_a.txt",
            "test_in_memory_dataset_run_b.txt"
        ])
        dataset.set_pipe_command("cat")
        dataset.set_use_var(slots_vars)
        dataset.load_into_memory()
        dataset.set_fea_eval(10000, True)
        dataset.slots_shuffle(["slot1"])
        dataset.local_shuffle()
        dataset.set_generate_unique_feasigns(True, 15)
        dataset.generate_local_tables_unlock(0, 11, 1, 25, 15)
        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(fluid.default_startup_program())
        if self.use_data_loader:
            data_loader = fluid.io.DataLoader.from_dataset(
                dataset, fluid.cpu_places(), self.drop_last)
            for i in range(self.epoch_num):
                for data in data_loader():
                    exe.run(fluid.default_main_program(), feed=data)
        else:
            for i in range(self.epoch_num):
                try:
                    exe.train_from_dataset(fluid.default_main_program(),
                                           dataset)
                except Exception as e:
                    self.assertTrue(False)

        os.remove("./test_in_memory_dataset_run_a.txt")
        os.remove("./test_in_memory_dataset_run_b.txt")
    def _get_dataset(self, state="TRAIN"):
        if state == "TRAIN":
            inputs = self.model.get_inputs()
            namespace = "train.reader"
            train_data_path = envs.get_global_env("train_data_path", None,
                                                  namespace)
        else:
            inputs = self.model.get_infer_inputs()
            namespace = "evaluate.reader"
            train_data_path = envs.get_global_env("test_data_path", None,
                                                  namespace)

        sparse_slots = envs.get_global_env("sparse_slots", None, namespace)
        dense_slots = envs.get_global_env("dense_slots", None, namespace)

        threads = int(envs.get_runtime_environ("train.trainer.threads"))
        batch_size = envs.get_global_env("batch_size", None, namespace)
        reader_class = envs.get_global_env("class", None, namespace)
        abs_dir = os.path.dirname(os.path.abspath(__file__))
        reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py')

        if sparse_slots is None and dense_slots is None:
            pipe_cmd = "python {} {} {} {}".format(reader, reader_class, state,
                                                   self._config_yaml)
        else:
            padding = envs.get_global_env("padding", 0, namespace)
            pipe_cmd = "python {} {} {} {} {} {} {} {}".format(
                reader, "slot", "slot", self._config_yaml, namespace, \
                sparse_slots.replace(" ", "#"), dense_slots.replace(" ", "#"), str(padding))

        if train_data_path.startswith("paddlerec::"):
            package_base = envs.get_runtime_environ("PACKAGE_BASE")
            assert package_base is not None
            train_data_path = os.path.join(package_base,
                                           train_data_path.split("::")[1])

        dataset = fluid.DatasetFactory().create_dataset()
        dataset.set_use_var(inputs)
        dataset.set_pipe_command(pipe_cmd)
        dataset.set_batch_size(batch_size)
        dataset.set_thread(threads)
        file_list = [
            os.path.join(train_data_path, x)
            for x in os.listdir(train_data_path)
        ]
        self.files = file_list
        dataset.set_filelist(self.files)

        debug_mode = envs.get_global_env("reader_debug_mode", False, namespace)
        if debug_mode:
            print("--- Dataset Debug Mode Begin , show pre 10 data of {}---".
                  format(file_list[0]))
            os.system("cat {} | {} | head -10".format(file_list[0], pipe_cmd))
            print("--- Dataset Debug Mode End , show pre 10 data of {}---".
                  format(file_list[0]))
            exit(0)

        return dataset
Example #13
    def test_queue_dataset_run_3(self):
        """
        Testcase for QueueDataset from create to run.
        Use CUDAPlace
        Use float type id
        """
        with open("test_queue_dataset_run_a.txt", "w") as f:
            data = "2 1 2 2 5 4 2 2 7 2 1 3\n"
            data += "2 6 2 2 1 4 2 2 4 2 2 3\n"
            data += "2 5 2 2 9 9 2 2 7 2 1 3\n"
            data += "2 7 2 2 1 9 2 3 7 2 5 3\n"
            f.write(data)
        with open("test_queue_dataset_run_b.txt", "w") as f:
            data = "2 1 2 2 5 4 2 2 7 2 1 3\n"
            data += "2 6 2 2 1 4 2 2 4 2 2 3\n"
            data += "2 5 2 2 9 9 2 2 7 2 1 3\n"
            data += "2 7 2 2 1 9 2 3 7 2 5 3\n"
            f.write(data)

        slots = ["slot1", "slot2", "slot3", "slot4"]
        slots_vars = []
        for slot in slots:
            var = fluid.data(
                name=slot, shape=[None, 1], dtype="int64", lod_level=1)
            slots_vars.append(var)

        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
        dataset.set_input_type(1)
        dataset.set_batch_size(1)
        dataset.set_thread(2)
        dataset.set_filelist(
            ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"])
        dataset.set_pipe_command("cat")
        dataset.set_use_var(slots_vars)
        dataset.load_into_memory()

        exe = fluid.Executor(fluid.CPUPlace() if not core.is_compiled_with_cuda(
        ) else fluid.CUDAPlace(0))
        exe.run(fluid.default_startup_program())
        if self.use_data_loader:
            data_loader = fluid.io.DataLoader.from_dataset(dataset,
                                                           fluid.cpu_places(),
                                                           self.drop_last)
            for i in range(self.epoch_num):
                for data in data_loader():
                    exe.run(fluid.default_main_program(), feed=data)
        else:
            for i in range(self.epoch_num):
                try:
                    exe.train_from_dataset(fluid.default_main_program(),
                                           dataset)
                except Exception as e:
                    self.assertTrue(False)

        os.remove("./test_queue_dataset_run_a.txt")
        os.remove("./test_queue_dataset_run_b.txt")
Example #14
    def dataset_reader(self):
        """dataset reader"""
        dataset = fluid.DatasetFactory().create_dataset()
        dataset.set_use_var(self.inputs)
        pipe_command = "python dataset_generator.py"
        dataset.set_pipe_command(pipe_command)
        dataset.set_batch_size(params.batch_size)
        thread_num = int(params.cpu_num)
        dataset.set_thread(thread_num)
        return dataset
Example #15
def train():
    args = parse_args()
    # add ce
    if args.enable_ce:
        SEED = 102
        fluid.default_main_program().random_seed = SEED
        fluid.default_startup_program().random_seed = SEED

    print('---------- Configuration Arguments ----------')
    for key, value in args.__dict__.items():
        print(key + ':' + str(value))

    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    loss, auc, data_list, auc_states = ctr_deepfm_model(
        args.embedding_size, args.num_field, args.num_feat, args.layer_sizes,
        args.act, args.reg)
    optimizer = fluid.optimizer.SGD(
        learning_rate=args.lr,
        regularization=fluid.regularizer.L2DecayRegularizer(args.reg))
    optimizer.minimize(loss)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())

    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_use_var(data_list)
    pipe_command = 'python criteo_reader.py {}'.format(args.feat_dict)
    dataset.set_pipe_command(pipe_command)
    dataset.set_batch_size(args.batch_size)
    dataset.set_thread(args.num_thread)
    train_filelist = [
        os.path.join(args.train_data_dir, x)
        for x in os.listdir(args.train_data_dir)
    ]

    print('---------------------------------------------')
    for epoch_id in range(args.num_epoch):
        start = time.time()
        dataset.set_filelist(train_filelist)
        exe.train_from_dataset(
            program=fluid.default_main_program(),
            dataset=dataset,
            fetch_list=[loss, auc],
            fetch_info=['epoch %d batch loss' % (epoch_id + 1), "auc"],
            print_period=1000,
            debug=False)
        model_dir = os.path.join(args.model_output_dir,
                                 'epoch_' + str(epoch_id + 1))
        sys.stderr.write('epoch%d is finished and takes %f s\n' %
                         ((epoch_id + 1), time.time() - start))
        fluid.io.save_persistables(executor=exe,
                                   dirname=model_dir,
                                   main_program=fluid.default_main_program())
Example #16
def create_dataset(use_var_list, my_filelist):
    dataset = fluid.DatasetFactory().create_dataset(config.dataset_type)
    dataset.set_batch_size(config.batch_size)
    dataset.set_thread(config.thread_num)
    dataset.set_hdfs_config(config.fs_name, config.fs_ugi)
    dataset.set_pipe_command(
        "./read_feasign | python ins_weight.py | awk -f format_newcate_hotnews.awk | ./parse_feasign all_slot.dict"
    )
    dataset.set_filelist(["part-00000_1"])
    dataset.set_use_var(use_var_list)
    return dataset
Example #17
    def set_data_config(self):
        self.dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
        self.dataset.set_feed_type("PaddleBoxDataFeed")
        self.dataset.set_parse_logkey(True)
        self.dataset.set_thread(1)
        self.dataset.set_enable_pv_merge(self.enable_pv_merge)
        self.dataset.set_batch_size(self.batch_size)
        if self.enable_pv_merge:
            self.dataset.set_merge_by_sid(self.merge_by_sid)
            self.dataset.set_rank_offset("rank_offset")
            self.dataset.set_pv_batch_size(self.pv_batch_size)
Example #18
    def test_queue_dataset_run_2(self):
        """
        Testcase for QueueDataset from create to run.
        Use CUDAPlace
        Use float type id
        """
        with open("test_queue_dataset_run_a.txt", "w") as f:
            data = "1 1 2 3 3 4 5 5 5 5 1 1\n"
            data += "1 2 2 3 4 4 6 6 6 6 1 2\n"
            data += "1 3 2 3 5 4 7 7 7 7 1 3\n"
            f.write(data)
        with open("test_queue_dataset_run_b.txt", "w") as f:
            data = "1 4 2 3 3 4 5 5 5 5 1 4\n"
            data += "1 5 2 3 4 4 6 6 6 6 1 5\n"
            data += "1 6 2 3 5 4 7 7 7 7 1 6\n"
            data += "1 7 2 3 6 4 8 8 8 8 1 7\n"
            f.write(data)

        slots = ["slot1_f", "slot2_f", "slot3_f", "slot4_f"]
        slots_vars = []
        for slot in slots:
            var = fluid.layers.data(name=slot,
                                    shape=[1],
                                    dtype="float32",
                                    lod_level=1)
            slots_vars.append(var)

        dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
        dataset.set_batch_size(32)
        dataset.set_thread(3)
        dataset.set_filelist(
            ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"])
        dataset.set_pipe_command("cat")
        dataset.set_use_var(slots_vars)

        exe = fluid.Executor(fluid.CPUPlace(
        ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
        exe.run(fluid.default_startup_program())
        if self.use_data_loader:
            data_loader = fluid.io.DataLoader.from_dataset(
                dataset, fluid.cpu_places(), self.drop_last)
            for i in range(self.epoch_num):
                for data in data_loader():
                    exe.run(fluid.default_main_program(), feed=data)
        else:
            for i in range(self.epoch_num):
                try:
                    exe.train_from_dataset(fluid.default_main_program(),
                                           dataset)
                except Exception as e:
                    self.assertTrue(False)

        os.remove("./test_queue_dataset_run_a.txt")
        os.remove("./test_queue_dataset_run_b.txt")
    def dataset_reader(self):
        """get dataset_reader."""
        dataset = fluid.DatasetFactory().create_dataset()
        dataset.set_use_var([self.dense_input] + self.sparse_input_ids +
                            [self.label])
        pipe_command = "python ./thirdparty/ctr/dataset_generator.py"
        dataset.set_pipe_command(pipe_command)
        dataset.set_batch_size(4)
        thread_num = int(2)
        dataset.set_thread(thread_num)
        return dataset
Example #20
    def do_dataset_training(self, fleet):
        train_file_list = ctr_dataset_reader.prepare_fake_data()

        exe = self.get_executor()
        exe.run(fluid.default_startup_program())
        fleet.init_worker()

        thread_num = 2
        batch_size = 128
        filelist = train_file_list

        # config dataset
        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
        dataset.set_use_var(self.feeds)
        dataset.set_batch_size(128)
        dataset.set_thread(2)
        dataset.set_filelist(filelist)
        dataset.set_pipe_command('python ctr_dataset_reader.py')
        dataset.load_into_memory()

        dataset.global_shuffle(fleet, 12)  # TODO: make the shuffle thread num configurable
        shuffle_data_size = dataset.get_shuffle_data_size(fleet)
        local_data_size = dataset.get_shuffle_data_size()
        data_size_list = fleet.util.all_gather(local_data_size)
        print('after global_shuffle data_size_list: ', data_size_list)
        print('after global_shuffle data_size: ', shuffle_data_size)

        for epoch_id in range(1):
            pass_start = time.time()
            exe.train_from_dataset(program=fluid.default_main_program(),
                                   dataset=dataset,
                                   fetch_list=[self.avg_cost],
                                   fetch_info=["cost"],
                                   print_period=2,
                                   debug=int(os.getenv("Debug", "0")))
            pass_time = time.time() - pass_start
        dataset.release_memory()

        if os.getenv("SAVE_MODEL") == "1":
            model_dir = tempfile.mkdtemp()
            fleet.save_inference_model(exe, model_dir,
                                       [feed.name for feed in self.feeds],
                                       self.avg_cost)
            self.check_model_right(model_dir)
            shutil.rmtree(model_dir)

        dirname = os.getenv("SAVE_DIRNAME", None)
        if dirname:
            fleet.save_persistables(exe, dirname=dirname)

        cache_dirname = os.getenv("SAVE_CACHE_DIRNAME", None)
        if cache_dirname:
            fleet.save_cache_model(cache_dirname)
Example #21
def get_dataset(inputs):
    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_use_var(inputs)
    dataset.set_pipe_command("python ../datasets/criteo_dataset_generator.py")
    dataset.set_batch_size(cfg.batch_size)
    thread_num = int(cfg.cpu_num)
    dataset.set_thread(thread_num)
    file_list = [
        os.path.join(cfg.train_files_path, x)
        for x in os.listdir(cfg.train_files_path)
    ]
    #logger.info("file list: {}".format(file_list))
    return dataset, file_list
Example #22
def train(args):
    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    input_word = fluid.layers.data(name="context_id",
                                   shape=[1],
                                   dtype="int64",
                                   lod_level=0)
    true_word = fluid.layers.data(name="target",
                                  shape=[1],
                                  dtype="int64",
                                  lod_level=0)
    neg_num = 5
    neg_word = fluid.layers.data(name="neg_label",
                                 shape=[neg_num],
                                 dtype='int64',
                                 lod_level=0)

    loss = skip_gram_word2vec_dataset(input_word,
                                      true_word,
                                      neg_word,
                                      354052,
                                      None,
                                      args.embedding_size,
                                      is_sparse=args.is_sparse)
    optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.exponential_decay(
            learning_rate=args.base_lr,
            decay_steps=100000,
            decay_rate=0.999,
            staircase=True))

    optimizer.minimize(loss)

    main_program = fluid.default_main_program()

    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_use_var([input_word, true_word, neg_word])
    dataset.set_pipe_command(
        "sudo /home/users/dongdaxiang/paddle_whls/pipe_reader/paddle_release_home/python/bin/python reader.py"
    )
    dataset.set_batch_size(args.batch_size)
    filelist = GetFileList(args.train_data_dir)
    dataset.set_filelist(filelist)
    dataset.set_thread(args.thread_num)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())
    for i in range(args.epochs):
        logger.info("Going to train epoch {}".format(i))
        exe.train_from_dataset(program=fluid.default_main_program(),
                               dataset=dataset)
Example #23
def train():
    args = parse_args()
    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    dense_input = fluid.layers.data(name="dense_input",
                                    shape=[dense_feature_dim],
                                    dtype='float32')
    sparse_input_ids = [
        fluid.layers.data(name="C" + str(i),
                          shape=[1],
                          lod_level=1,
                          dtype="int64") for i in range(1, 27)
    ]
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    loss, auc_var, batch_auc_var = ctr_dnn_model_dataset(
        dense_input, sparse_input_ids, label, args.embedding_size,
        args.sparse_feature_dim)

    optimizer = fluid.optimizer.SGD(learning_rate=1e-4)
    optimizer.minimize(loss)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())
    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_use_var([dense_input] + sparse_input_ids + [label])
    pipe_command = "python criteo_reader.py %d" % args.sparse_feature_dim
    dataset.set_pipe_command(pipe_command)
    dataset.set_batch_size(100)
    thread_num = 10
    dataset.set_thread(thread_num)
    whole_filelist = [
        "raw_data/part-%d" % x for x in range(len(os.listdir("raw_data")))
    ]

    epochs = 20
    for i in range(epochs):
        dataset.set_filelist(whole_filelist[:int(0.8 * len(whole_filelist))])
        exe.train_from_dataset(program=fluid.default_main_program(),
                               dataset=dataset,
                               fetch_list=[auc_var],
                               fetch_info=["auc"],
                               debug=False)
        model_dir = args.model_output_dir + '/epoch' + str(i + 1) + ".model"
        sys.stderr.write("epoch%d finished" % (i + 1))
        fluid.io.save_inference_model(
            model_dir,
            [dense_input.name] + [x.name
                                  for x in sparse_input_ids] + [label.name],
            [loss, auc_var], exe)
def train():
    args = parse_args()
    # add ce
    if args.enable_ce:
        SEED = 102
        fluid.default_main_program().random_seed = SEED
        fluid.default_startup_program().random_seed = SEED

    print(args)
    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    loss, auc, data_list, auc_states = eval('network_conf.' + args.model_name)(
        args.embedding_size, args.num_field, args.num_feat,
        args.layer_sizes_dnn, args.act, args.reg, args.layer_sizes_cin)
    optimizer = fluid.optimizer.SGD(
        learning_rate=args.lr,
        regularization=fluid.regularizer.L2DecayRegularizer(args.reg))
    optimizer.minimize(loss)

    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_use_var(data_list)
    dataset.set_pipe_command('python criteo_reader.py')
    dataset.set_batch_size(args.batch_size)
    dataset.set_filelist([
        os.path.join(args.train_data_dir, x)
        for x in os.listdir(args.train_data_dir)
    ])

    if args.use_gpu == 1:
        exe = fluid.Executor(fluid.CUDAPlace(0))
        dataset.set_thread(1)
    else:
        exe = fluid.Executor(fluid.CPUPlace())
        dataset.set_thread(args.num_thread)
    exe.run(fluid.default_startup_program())

    for epoch_id in range(args.num_epoch):
        start = time.time()
        sys.stderr.write('\nepoch%d start ...\n' % (epoch_id + 1))
        exe.train_from_dataset(program=fluid.default_main_program(),
                               dataset=dataset,
                               fetch_list=[loss, auc],
                               fetch_info=['loss', 'auc'],
                               debug=False,
                               print_period=args.print_steps)
        model_dir = os.path.join(args.model_output_dir,
                                 'epoch_' + str(epoch_id + 1), "checkpoint")
        sys.stderr.write('epoch%d is finished and takes %f s\n' %
                         ((epoch_id + 1), time.time() - start))
        fluid.io.save_persistables(executor=exe,
                                   dirname=model_dir,
                                   main_program=fluid.default_main_program())
Example #25
    def _get_dataset(self, dataset_name, context):
        name = "dataset." + dataset_name + "."
        reader_class = envs.get_global_env(name + "data_converter")
        reader_class_name = envs.get_global_env(name + "reader_class_name",
                                                "Reader")
        abs_dir = os.path.dirname(os.path.abspath(__file__))
        reader = os.path.join(abs_dir, '../../utils', 'dataset_instance.py')
        sparse_slots = envs.get_global_env(name + "sparse_slots", "").strip()
        dense_slots = envs.get_global_env(name + "dense_slots", "").strip()
        if sparse_slots == "" and dense_slots == "":
            pipe_cmd = "python {} {} {} {}".format(reader, reader_class,
                                                   reader_class_name,
                                                   context["config_yaml"])
        else:
            if sparse_slots == "":
                sparse_slots = "?"
            if dense_slots == "":
                dense_slots = "?"
            padding = envs.get_global_env(name + "padding", 0)
            pipe_cmd = "python {} {} {} {} {} {} {} {}".format(
                reader, "slot", "slot", context["config_yaml"], "fake",
                sparse_slots.replace(" ", "?"), dense_slots.replace(" ", "?"),
                str(padding))

        batch_size = envs.get_global_env(name + "batch_size")
        dataset = fluid.DatasetFactory().create_dataset()
        dataset.set_batch_size(batch_size)
        dataset.set_pipe_command(pipe_cmd)
        train_data_path = envs.get_global_env(name + "data_path")

        file_list = [
            os.path.join(train_data_path, x)
            for x in os.listdir(train_data_path)
        ]
        if context["engine"] == EngineMode.LOCAL_CLUSTER:
            file_list = split_files(file_list, context["fleet"].worker_index(),
                                    context["fleet"].worker_num())
        print("File_list: {}".format(file_list))
        dataset.set_filelist(file_list)
        for model_dict in context["phases"]:
            if model_dict["dataset_name"] == dataset_name:
                model = context["model"][model_dict["name"]]["model"]
                thread_num = int(model_dict["thread_num"])
                dataset.set_thread(thread_num)
                if context["is_infer"]:
                    inputs = model._infer_data_var
                else:
                    inputs = model._data_var
                dataset.set_use_var(inputs)
                break
        return dataset
Example #26
    def _alloc_dataset(self, file_list):
        """Create and configure a dataset for the given file list."""
        dataset = fluid.DatasetFactory().create_dataset(
            self._config['dataset_type'])
        dataset.set_batch_size(self._config['batch_size'])
        dataset.set_thread(self._config['load_thread'])
        dataset.set_hdfs_config(self._config['fs_name'],
                                self._config['fs_ugi'])
        dataset.set_pipe_command(self._config['data_converter'])
        dataset.set_filelist(file_list)
        dataset.set_use_var(self._config['data_vars'])
        # dataset.set_fleet_send_sleep_seconds(2)
        # dataset.set_fleet_send_batch_size(80000)
        return dataset
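For orientation, a hypothetical self._config dict that would satisfy _alloc_dataset above might look like this; the keys are inferred from the method body and every value is a placeholder rather than a real deployment setting:

example_config = {
    "dataset_type": "InMemoryDataset",        # or "QueueDataset"
    "batch_size": 32,
    "load_thread": 10,
    "fs_name": "hdfs://example-nameservice",  # placeholder fs.defaultFS
    "fs_ugi": "user,passwd",                  # placeholder HDFS ugi
    "data_converter": "python dataset_generator.py",
    "data_vars": [],                          # the model's feed variables
}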
Example #27
    def test_run_with_dump(self):
        """
        Testcase for InMemoryDataset from create to run.
        """
        with open("test_run_with_dump_a.txt", "w") as f:
            data = "1 a 1 a 1 1 2 3 3 4 5 5 5 5 1 1\n"
            data += "1 b 1 b 1 2 2 3 4 4 6 6 6 6 1 2\n"
            data += "1 c 1 c 1 3 2 3 5 4 7 7 7 7 1 3\n"
            f.write(data)
        with open("test_run_with_dump_b.txt", "w") as f:
            data = "1 d 1 d 1 4 2 3 3 4 5 5 5 5 1 4\n"
            data += "1 e 1 e 1 5 2 3 4 4 6 6 6 6 1 5\n"
            data += "1 f 1 f 1 6 2 3 5 4 7 7 7 7 1 6\n"
            data += "1 g 1 g 1 7 2 3 6 4 8 8 8 8 1 7\n"
            f.write(data)

        slots = ["slot1", "slot2", "slot3", "slot4"]
        slots_vars = []
        for slot in slots:
            var = fluid.layers.data(name=slot,
                                    shape=[1],
                                    dtype="int64",
                                    lod_level=1)
            slots_vars.append(var)

        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
        dataset.set_batch_size(32)
        dataset.set_thread(3)
        dataset.set_filelist(
            ["test_run_with_dump_a.txt", "test_run_with_dump_b.txt"])
        dataset.set_parse_ins_id(True)
        dataset.set_parse_content(True)
        dataset.set_pipe_command("cat")
        dataset.set_use_var(slots_vars)
        dataset.load_into_memory()
        dataset.set_fea_eval(10000, True)
        dataset.local_shuffle()

        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(fluid.default_startup_program())
        for i in range(2):
            try:
                exe.train_from_dataset(fluid.default_main_program(), dataset)
            except ImportError as e:
                pass
            except Exception as e:
                self.assertTrue(False)

        os.remove("./test_run_with_dump_a.txt")
        os.remove("./test_run_with_dump_b.txt")
Example #28
    def get_dataset(self, inputs, files):
        """
        Test Dataset With Fetch Handler. TestCases.

        Args:
            inputs(list): inputs of get_dataset
            files(list): files of  get_dataset
        """
        dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
        dataset.set_batch_size(32)
        dataset.set_thread(3)
        dataset.set_filelist(files)
        dataset.set_pipe_command("cat")
        dataset.set_use_var(inputs)
        return dataset
Example #29
def get_dataset(inputs, args):
    """
    get dataset
    """
    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_use_var(inputs)
    dataset.set_pipe_command("python ./dataset_generator.py")
    dataset.set_batch_size(args.batch_size)
    dataset.set_thread(int(args.cpu_num))
    file_list = [
        str(args.train_files_path) + "/%s" % x
        for x in os.listdir(args.train_files_path)
    ]
    dataset.set_filelist(file_list)
    logger.info("file list: {}".format(file_list))
    return dataset
Example #30
    def train_loop(main_program):
        """ train network """
        start_time = time.time()
        dataset = fluid.DatasetFactory().create_dataset()
        dataset.set_use_var(dcn_model.data_list)
        pipe_command = 'python reader.py {}'.format(args.vocab_dir)
        dataset.set_pipe_command(pipe_command)
        dataset.set_batch_size(args.batch_size)
        dataset.set_thread(args.num_thread)
        train_filelist = [
            os.path.join(args.train_data_dir, fname)
            for fname in next(os.walk(args.train_data_dir))[2]
        ]
        dataset.set_filelist(train_filelist)

        if args.use_gpu == 1:
            exe = fluid.Executor(fluid.CUDAPlace(0))
            dataset.set_thread(1)
        else:
            exe = fluid.Executor(fluid.CPUPlace())
            dataset.set_thread(args.num_thread)
        exe.run(fluid.default_startup_program())

        for epoch_id in range(args.num_epoch):
            start = time.time()
            sys.stderr.write('\nepoch%d start ...\n' % (epoch_id + 1))
            exe.train_from_dataset(
                program=main_program,
                dataset=dataset,
                fetch_list=[
                    dcn_model.loss, dcn_model.avg_logloss, dcn_model.auc_var
                ],
                fetch_info=['total_loss', 'avg_logloss', 'auc'],
                debug=False,
                print_period=args.print_steps)
            model_dir = os.path.join(args.model_output_dir,
                                     'epoch_' + str(epoch_id + 1),
                                     "checkpoint")
            sys.stderr.write('epoch%d is finished and takes %f s\n' %
                             ((epoch_id + 1), time.time() - start))
            if args.trainer_id == 0:  # only trainer 0 save model
                print("save model in {}".format(model_dir))
                fluid.save(main_program, model_dir)

        print("train time cost {:.4f}".format(time.time() - start_time))
        print("finish training")