Example #1
def _register_arrow_json_readoptions_serializer():
    import os

    import ray

    if (
        os.environ.get(
            "RAY_DISABLE_CUSTOM_ARROW_JSON_OPTIONS_SERIALIZATION",
            "0",
        )
        == "1"
    ):
        import logging

        logger = logging.getLogger(__name__)
        logger.info("Disabling custom Arrow JSON ReadOptions serialization.")
        return

    try:
        import pyarrow.json as pajson
    except ModuleNotFoundError:
        return

    ray.util.register_serializer(
        pajson.ReadOptions,
        serializer=lambda opts: (opts.use_threads, opts.block_size),
        deserializer=lambda args: pajson.ReadOptions(*args),
    )
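A minimal usage sketch (not part of the example above) of how the registered serializer could be exercised; the option values and the ray.init() call are illustrative assumptions:

import ray
import pyarrow.json as pajson

ray.init()
_register_arrow_json_readoptions_serializer()

# With the custom serializer registered, ReadOptions can round-trip
# through the Ray object store.
opts = pajson.ReadOptions(use_threads=False, block_size=1 << 20)
restored = ray.get(ray.put(opts))
assert restored.block_size == opts.block_size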
Example #2
    def _read_file(self, f: "pyarrow.NativeFile", **arrow_reader_args):
        from pyarrow import json

        read_options = arrow_reader_args.pop(
            "read_options", json.ReadOptions(use_threads=False))
        return json.read_json(f,
                              read_options=read_options,
                              **arrow_reader_args)
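A self-contained sketch of the same call pattern using only pyarrow; the file name is hypothetical, and the caller overrides read_options while any other keyword arguments would be forwarded to json.read_json:

import pyarrow as pa
from pyarrow import json

# "records.jsonl" is an assumed newline-delimited JSON file.
with pa.OSFile("records.jsonl", "rb") as f:
    table = json.read_json(
        f,
        read_options=json.ReadOptions(use_threads=False, block_size=4 << 20))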
Example #3
 def json_read(read_paths: List[str]):
     logger.debug(f"Reading {len(read_paths)} files.")
     tables = []
     for read_path in read_paths:
         with filesystem.open_input_file(read_path) as f:
             tables.append(
                 json.read_json(
                     f,
                     read_options=json.ReadOptions(use_threads=False),
                     **arrow_json_args))
     block = ArrowBlock(pa.concat_tables(tables))
     return block, block.get_metadata(input_files=read_paths)
Example #4
File: json.py Project: xwild/nlp
class JsonConfig(nlp.BuilderConfig):
    """BuilderConfig for JSON."""
    read_options: paj.ReadOptions = paj.ReadOptions()
    parse_options: paj.ParseOptions = paj.ParseOptions()

    @property
    def pa_read_options(self):
        return self.read_options

    @property
    def pa_parse_options(self):
        return self.parse_options
Example #5
 def _init_table_from_path(self):
     if '.jsonl' in self.path.suffixes:
         # Can read ".jsonl" or ".jsonl.gz"
         import pyarrow.json as paj
         self.table = paj.read_json(
             str(self.path),
             read_options=paj.ReadOptions(
                 # magic constants:
                 # 894 - estimated average number of bytes per JSON item manifest
                 # 10000 - how many items we want to have in a chunk (Arrow's "batch")
                 block_size=894 * 10000))
     elif '.arrow' == self.path.suffixes[-1]:
         # Can read ".arrow"
         import pyarrow as pa
         mmap = pa.memory_map(str(self.path))
         stream = pa.ipc.open_file(mmap)
         self.table = stream.read_all()
     else:
         raise ValueError(f"Unknown LazyDict file format : '{self.path}'")
Example #6
 def pa_read_options(self):
     return paj.ReadOptions(use_threads=self.use_threads,
                            block_size=self.block_size)
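For context, a minimal self-contained version of the snippet above, assuming the property lives on a config object with use_threads and block_size fields (the container class is hypothetical; its field names are inferred from the property body):

import dataclasses

import pyarrow.json as paj


@dataclasses.dataclass
class JsonReadConfig:  # hypothetical container, not from the original project
    use_threads: bool = True
    block_size: int = 10 << 20  # 10 MiB per Arrow read block

    @property
    def pa_read_options(self):
        return paj.ReadOptions(use_threads=self.use_threads,
                               block_size=self.block_size)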
Example #7
    def _generate_tables(self, files):
        for file_idx, file in enumerate(files):

            # If the file is one json object and if we need to look at the list of items in one specific field
            if self.config.field is not None:
                with open(file, encoding="utf-8") as f:
                    dataset = json.load(f)

                # We keep only the field we are interested in
                dataset = dataset[self.config.field]

                # We accept two formats: a list of dicts or a dict of lists
                if isinstance(dataset, (list, tuple)):
                    mapping = {
                        col: [dataset[i][col] for i in range(len(dataset))]
                        for col in dataset[0].keys()
                    }
                else:
                    mapping = dataset
                pa_table = pa.Table.from_pydict(mapping=mapping)
                yield file_idx, self._cast_classlabels(pa_table)

            # If the file has one json object per line
            else:
                with open(file, "rb") as f:
                    batch_idx = 0
                    # Use block_size equal to the chunk size divided by 32 to leverage multithreading
                    # Set a default minimum value of 16kB if the chunk size is really small
                    block_size = max(self.config.chunksize // 32, 16 << 10)
                    while True:
                        batch = f.read(self.config.chunksize)
                        if not batch:
                            break
                        # Finish current line
                        try:
                            batch += f.readline()
                        except (AttributeError, io.UnsupportedOperation):
                            batch += readline(f)
                        try:
                            while True:
                                try:
                                    pa_table = paj.read_json(
                                        io.BytesIO(batch),
                                        read_options=paj.ReadOptions(
                                            block_size=block_size))
                                    break
                                except (pa.ArrowInvalid,
                                        pa.ArrowNotImplementedError) as e:
                                    if (isinstance(e, pa.ArrowInvalid)
                                            and "straddling" not in str(e)
                                            or block_size > len(batch)):
                                        raise
                                    else:
                                        # Increase the block size in case it was too small.
                                        # The block size will be reset for the next file.
                                        logger.debug(
                                            f"Batch of {len(batch)} bytes couldn't be parsed with block_size={block_size}. Retrying with block_size={block_size * 2}."
                                        )
                                        block_size *= 2
                        except pa.ArrowInvalid as e:
                            logger.error(
                                f"Failed to read file '{file}' with error {type(e)}: {e}"
                            )
                            try:
                                with open(file, encoding="utf-8") as f:
                                    dataset = json.load(f)
                            except json.JSONDecodeError:
                                raise e
                            raise ValueError(
                                f"Not able to read records in the JSON file at {file}. "
                                f"You should probably indicate the field of the JSON file containing your records. "
                                f"This JSON file contain the following fields: {str(list(dataset.keys()))}. "
                                f"Select the correct one and provide it as `field='XXX'` to the dataset loading method. "
                            ) from None
                        # Uncomment for debugging (will print the Arrow table size and elements)
                        # logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}")
                        # logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))
                        yield (file_idx,
                               batch_idx), self._cast_classlabels(pa_table)
                        batch_idx += 1
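A self-contained sketch of the retry pattern used above: parse a chunk of JSON Lines bytes and double Arrow's block_size whenever a record straddles a block boundary. The helper name and the 16 kB starting value are illustrative:

import io

import pyarrow as pa
import pyarrow.json as paj


def read_json_lines_chunk(batch: bytes, block_size: int = 16 << 10) -> pa.Table:
    while True:
        try:
            return paj.read_json(
                io.BytesIO(batch),
                read_options=paj.ReadOptions(block_size=block_size))
        except (pa.ArrowInvalid, pa.ArrowNotImplementedError) as e:
            if (isinstance(e, pa.ArrowInvalid)
                    and "straddling" not in str(e)
                    or block_size > len(batch)):
                raise
            # A record straddled the block boundary; double the block size and retry.
            block_size *= 2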