Example #1
0
def slotdataloader(readerclass, train, yaml_file, context):
    if train == "TRAIN":
        reader_name = "SlotReader"
        namespace = "train.reader"
        data_path = get_global_env("train_data_path", None, namespace)
    else:
        reader_name = "SlotReader"
        namespace = "evaluate.reader"
        data_path = get_global_env("test_data_path", None, namespace)

    if data_path.startswith("paddlerec::"):
        package_base = get_runtime_environ("PACKAGE_BASE")
        assert package_base is not None
        data_path = os.path.join(package_base, data_path.split("::")[1])

    files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)]
    if context["engine"] == EngineMode.LOCAL_CLUSTER:
        files = split_files(files, context["fleet"].worker_index(),
                            context["fleet"].worker_num())
        print("file_list: {}".format(files))

    sparse = get_global_env("sparse_slots", "#", namespace)
    if sparse == "":
        sparse = "#"
    dense = get_global_env("dense_slots", "#", namespace)
    if dense == "":
        dense = "#"
    padding = get_global_env("padding", 0, namespace)
    reader = SlotReader(yaml_file)
    reader.init(sparse, dense, int(padding))

    def gen_reader():
        for file in files:
            with open(file, 'r') as f:
                for line in f:
                    line = line.rstrip('\n')
                    iter = reader.generate_sample(line)
                    for parsed_line in iter():
                        if parsed_line is None:
                            continue
                        else:
                            values = []
                            for pased in parsed_line:
                                values.append(pased[1])
                            yield values

    def gen_batch_reader():
        return reader.generate_batch_from_trainfiles(files)

    if hasattr(reader, 'generate_batch_from_trainfiles'):
        return gen_batch_reader()
    return gen_reader
Example #2
0
    def get_dataloader(self, context, dataset_name, dataloader):
        name = "dataset." + dataset_name + "."
        sparse_slots = envs.get_global_env(name + "sparse_slots", "").strip()
        dense_slots = envs.get_global_env(name + "dense_slots", "").strip()
        batch_size = envs.get_global_env(name + "batch_size")

        reader_class = envs.get_global_env(name + "data_converter")
        reader_class_name = envs.get_global_env(name + "reader_class_name",
                                                "Reader")

        if sparse_slots == "" and dense_slots == "":
            reader = dataloader_instance.dataloader_by_name(
                reader_class,
                dataset_name,
                context["config_yaml"],
                context,
                reader_class_name=reader_class_name)

            reader_class = envs.lazy_instance_by_fliename(
                reader_class, reader_class_name)
            reader_ins = reader_class(context["config_yaml"])
        else:
            reader = dataloader_instance.slotdataloader_by_name(
                "", dataset_name, context["config_yaml"], context)
            reader_ins = SlotReader(context["config_yaml"])
        if hasattr(reader_ins, 'generate_batch_from_trainfiles'):
            dataloader.set_sample_list_generator(reader)
        elif hasattr(reader_ins, 'batch_tensor_creator'):
            dataloader.set_batch_generator(reader)
        else:
            dataloader.set_sample_generator(reader, batch_size)
        return dataloader
Example #3
0
def slotdataloader_by_name(readerclass, dataset_name, yaml_file):
    name = "dataset." + dataset_name + "."
    reader_name = "SlotReader"
    data_path = get_global_env(name + "data_path")

    if data_path.startswith("paddlerec::"):
        package_base = get_runtime_environ("PACKAGE_BASE")
        assert package_base is not None
        data_path = os.path.join(package_base, data_path.split("::")[1])

    files = [str(data_path) + "/%s" % x for x in os.listdir(data_path)]
    sparse = get_global_env(name + "sparse_slots", "#")
    if sparse == "":
        sparse = "#"
    dense = get_global_env(name + "dense_slots", "#")
    if dense == "":
        dense = "#"
    padding = get_global_env(name + "padding", 0)
    reader = SlotReader(yaml_file)
    reader.init(sparse, dense, int(padding))

    def gen_reader():
        for file in files:
            with open(file, 'r') as f:
                for line in f:
                    line = line.rstrip('\n')
                    iter = reader.generate_sample(line)
                    for parsed_line in iter():
                        if parsed_line is None:
                            continue
                        else:
                            values = []
                            for pased in parsed_line:
                                values.append(pased[1])
                            yield values

    def gen_batch_reader():
        return reader.generate_batch_from_trainfiles(files)

    if hasattr(reader, 'generate_batch_from_trainfiles'):
        return gen_batch_reader()
    return gen_reader
Example #4
0
    def _get_dataloader(self, state="TRAIN"):

        if state == "TRAIN":
            dataloader = self.model._data_loader
            namespace = "train.reader"
            class_name = "TrainReader"
        else:
            dataloader = self.model._infer_data_loader
            namespace = "evaluate.reader"
            class_name = "EvaluateReader"

        sparse_slots = envs.get_global_env("sparse_slots", None, namespace)
        dense_slots = envs.get_global_env("dense_slots", None, namespace)

        batch_size = envs.get_global_env("batch_size", None, namespace)
        print("batch_size: {}".format(batch_size))

        if sparse_slots is None and dense_slots is None:
            reader_class = envs.get_global_env("class", None, namespace)
            reader = dataloader_instance.dataloader(reader_class, state,
                                                    self._config_yaml)
            reader_class = envs.lazy_instance_by_fliename(
                reader_class, class_name)
            reader_ins = reader_class(self._config_yaml)
        else:
            reader = dataloader_instance.slotdataloader(
                "", state, self._config_yaml)
            reader_ins = SlotReader(self._config_yaml)

        if hasattr(reader_ins, 'generate_batch_from_trainfiles'):
            dataloader.set_sample_list_generator(reader)
        else:
            dataloader.set_sample_generator(reader, batch_size)

        debug_mode = envs.get_global_env("reader_debug_mode", False, namespace)
        if debug_mode:
            print("--- DataLoader Debug Mode Begin , show pre 10 data ---")
            for idx, line in enumerate(reader()):
                print(line)
                if idx >= 9:
                    break
            print("--- DataLoader Debug Mode End , show pre 10 data ---")
            exit(0)
        return dataloader
Example #5
0
 def _get_dataloader(self, dataset_name, dataloader):
     name = "dataset." + dataset_name + "."
     thread_num = envs.get_global_env(name + "thread_num")
     batch_size = envs.get_global_env(name + "batch_size")
     reader_class = envs.get_global_env(name + "data_converter")
     abs_dir = os.path.dirname(os.path.abspath(__file__))
     sparse_slots = envs.get_global_env(name + "sparse_slots", "").strip()
     dense_slots = envs.get_global_env(name + "dense_slots", "").strip()
     if sparse_slots == "" and dense_slots == "":
         reader = dataloader_instance.dataloader_by_name(
             reader_class, dataset_name, self._config_yaml)
         reader_class = envs.lazy_instance_by_fliename(
             reader_class, "TrainReader")
         reader_ins = reader_class(self._config_yaml)
     else:
         reader = dataloader_instance.slotdataloader_by_name(
             "", dataset_name, self._config_yaml)
         reader_ins = SlotReader(self._config_yaml)
     if hasattr(reader_ins, 'generate_batch_from_trainfiles'):
         dataloader.set_sample_list_generator(reader)
     else:
         dataloader.set_sample_generator(reader, batch_size)
     return dataloader
from paddlerec.core.reader import SlotReader
from paddlerec.core.utils import envs

if len(sys.argv) < 4:
    raise ValueError("reader only accept 3 argument: 1. reader_class 2.train/evaluate/slotreader 3.yaml_abs_path")

reader_package = sys.argv[1]

if sys.argv[2].upper() == "TRAIN":
    reader_name = "TrainReader"
elif sys.argv[2].upper() == "EVALUATE":
    reader_name = "EvaluateReader"
else:
    reader_name = "SlotReader"
    namespace = sys.argv[4]
    sparse_slots = sys.argv[5].replace("#", " ")
    dense_slots = sys.argv[6].replace("#", " ")
    padding = int(sys.argv[7])

yaml_abs_path = sys.argv[3]

if reader_name != "SlotReader":
    reader_class = lazy_instance_by_fliename(reader_package, reader_name)
    reader = reader_class(yaml_abs_path)
    reader.init()
    reader.run_from_stdin()
else:
    reader = SlotReader(yaml_abs_path)
    reader.init(sparse_slots, dense_slots, padding)
    reader.run_from_stdin()
Example #7
0
def slotdataloader_by_name(readerclass, dataset_name, yaml_file, context):
    name = "dataset." + dataset_name + "."
    reader_name = "SlotReader"
    data_path = get_global_env(name + "data_path")

    if data_path.startswith("paddlerec::"):
        package_base = get_runtime_environ("PACKAGE_BASE")
        assert package_base is not None
        data_path = os.path.join(package_base, data_path.split("::")[1])

    hidden_file_list, files = check_filelist(hidden_file_list=[],
                                             data_file_list=[],
                                             train_data_path=data_path)
    if (hidden_file_list is not None):
        print(
            "Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}"
            .format(hidden_file_list))

    files.sort()

    # for local cluster: discard some files if files cannot be divided equally between GPUs
    if (context["device"] == "GPU") and "PADDLEREC_GPU_NUMS" in os.environ:
        selected_gpu_nums = int(os.getenv("PADDLEREC_GPU_NUMS"))
        discard_file_nums = len(files) % selected_gpu_nums
        if (discard_file_nums != 0):
            warnings.warn(
                "Because files cannot be divided equally between GPUs,discard these files:{}"
                .format(files[-discard_file_nums:]))
            files = files[:len(files) - discard_file_nums]

    need_split_files = False
    if context["engine"] == EngineMode.LOCAL_CLUSTER:
        # for local cluster: split files for multi process
        need_split_files = True
    elif context["engine"] == EngineMode.CLUSTER and context[
            "cluster_type"] == "K8S":
        # for k8s mount mode, split files for every node
        need_split_files = True

    if need_split_files:
        files = split_files(files, context["fleet"].worker_index(),
                            context["fleet"].worker_num())
    context["file_list"] = files
    sparse = get_global_env(name + "sparse_slots", "#")
    if sparse == "":
        sparse = "#"
    dense = get_global_env(name + "dense_slots", "#")
    if dense == "":
        dense = "#"
    padding = get_global_env(name + "padding", 0)
    reader = SlotReader(yaml_file)
    reader.init(sparse, dense, int(padding))

    def gen_reader():
        for file in files:
            with open(file, 'r') as f:
                for line in f:
                    line = line.rstrip('\n')
                    iter = reader.generate_sample(line)
                    for parsed_line in iter():
                        if parsed_line is None:
                            continue
                        else:
                            values = []
                            for pased in parsed_line:
                                values.append(pased[1])
                            yield values

    def gen_batch_reader():
        return reader.generate_batch_from_trainfiles(files)

    if hasattr(reader, 'generate_batch_from_trainfiles'):
        return gen_batch_reader()
    return gen_reader
Example #8
0
def slotdataloader(readerclass, train, yaml_file, context):
    if train == "TRAIN":
        reader_name = "SlotReader"
        namespace = "train.reader"
        data_path = get_global_env("train_data_path", None, namespace)
    else:
        reader_name = "SlotReader"
        namespace = "evaluate.reader"
        data_path = get_global_env("test_data_path", None, namespace)

    if data_path.startswith("paddlerec::"):
        package_base = get_runtime_environ("PACKAGE_BASE")
        assert package_base is not None
        data_path = os.path.join(package_base, data_path.split("::")[1])

    hidden_file_list, files = check_filelist(hidden_file_list=[],
                                             data_file_list=[],
                                             train_data_path=data_path)
    if (hidden_file_list is not None):
        print(
            "Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}"
            .format(hidden_file_list))

    files.sort()

    need_split_files = False
    if context["engine"] == EngineMode.LOCAL_CLUSTER:
        # for local cluster: split files for multi process
        need_split_files = True
    elif context["engine"] == EngineMode.CLUSTER and context[
            "cluster_type"] == "K8S":
        # for k8s mount mode, split files for every node
        need_split_files = True

    if need_split_files:
        files = split_files(files, context["fleet"].worker_index(),
                            context["fleet"].worker_num())

    sparse = get_global_env("sparse_slots", "#", namespace)
    if sparse == "":
        sparse = "#"
    dense = get_global_env("dense_slots", "#", namespace)
    if dense == "":
        dense = "#"
    padding = get_global_env("padding", 0, namespace)
    reader = SlotReader(yaml_file)
    reader.init(sparse, dense, int(padding))

    def gen_reader():
        for file in files:
            with open(file, 'r') as f:
                for line in f:
                    line = line.rstrip('\n')
                    iter = reader.generate_sample(line)
                    for parsed_line in iter():
                        if parsed_line is None:
                            continue
                        else:
                            values = []
                            for pased in parsed_line:
                                values.append(pased[1])
                            yield values

    def gen_batch_reader():
        return reader.generate_batch_from_trainfiles(files)

    if hasattr(reader, 'generate_batch_from_trainfiles'):
        return gen_batch_reader()
    return gen_reader