Example #1
0
    def _generate_tables(self, files):
        for i, file in enumerate(files):
            if self.config.field is not None:
                with open(file, encoding="utf-8") as f:
                    dataset = json.load(f)

                # We keep only the field we are interested in
                dataset = dataset[self.config.field]

                # We accept two formats: a list of dicts or a dict of lists
                if isinstance(dataset, (list, tuple)):
                    pa_table = paj.read_json(
                        BytesIO("\n".join(json.dumps(row) for row in dataset).encode("utf-8")),
                        read_options=self.config.pa_read_options,
                        parse_options=self.config.pa_parse_options,
                    )
                else:
                    pa_table = pa.Table.from_pydict(mapping=dataset, schema=self.config.schema)
            else:
                try:
                    pa_table = paj.read_json(
                        file,
                        read_options=self.config.pa_read_options,
                        parse_options=self.config.pa_parse_options,
                    )
                except pa.ArrowInvalid:
                    with open(file, encoding="utf-8") as f:
                        dataset = json.load(f)
                    raise ValueError(
                        f"Not able to read records in the JSON file at {file}. "
                        f"You should probably indicate the field of the JSON file containing your records. "
                        f"This JSON file contain the following fields: {str(list(dataset.keys()))}. "
                        f"Select the correct one and provide it as `field='XXX'` to the `load_dataset` method. "
                    )
            yield i, pa_table
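The comment above mentions two accepted layouts, a list of dicts and a dict of lists. As a minimal self-contained sketch (toy data, not part of the original loader), this is how each layout ends up as an Arrow table:

import json
from io import BytesIO

import pyarrow as pa
import pyarrow.json as paj

# Layout 1: a list of dicts -> serialize to JSON Lines and let pyarrow parse it
rows = [{"text": "hello", "label": 0}, {"text": "world", "label": 1}]
table_from_rows = paj.read_json(
    BytesIO("\n".join(json.dumps(row) for row in rows).encode("utf-8")))

# Layout 2: a dict of lists -> hand the columns to Arrow directly
columns = {"text": ["hello", "world"], "label": [0, 1]}
table_from_columns = pa.Table.from_pydict(columns)

assert table_from_rows.column_names == table_from_columns.column_names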
Example #2
0
import gzip
import io

import pyarrow.parquet as pq
from pyarrow.json import read_json


def stream_json(fn, parquet_fn, schema=None, chunk_size=10000000):
    if isinstance(fn, str):
        fn = [fn]

    if schema is None:
        schema = read_json(fn[0]).schema

    writer = pq.ParquetWriter(parquet_fn, schema)

    for _f in fn:
        check_gz = _f.endswith('.gz')

        if check_gz:
            f = gzip.open(_f, 'rt')  # text mode: gzip.open defaults to binary, which would break ''.join below
        else:
            f = open(_f, 'r')

        while True:
            chunk = f.readlines(chunk_size)
            if not chunk:
                break

            tbl = read_json(io.BytesIO(''.join(chunk).encode()))
            assert tbl.schema == schema  # make sure the read table schema is the same as the parsed schema
            writer.write_table(tbl)

        f.close()

    writer.close()
Example #3
0
def convert_to_arrow(file_paths,
                     save_path,
                     cache_path_prefix="./data_chunk",
                     no_combine=False):
    converted_tables = []

    if len(file_paths) == 1:
        mmap = pa.memory_map(file_paths[0])
        json_input = json.read_json(mmap)
        writer = nlp.arrow_writer.ArrowWriter(path=save_path)
        writer.write_table(json_input)
    else:

        for idx, file in enumerate(file_paths):
            cache_path = cache_path_prefix + "." + str(idx)
            mmap = pa.memory_map(file)
            json_input = json.read_json(mmap)
            writer = nlp.arrow_writer.ArrowWriter(path=cache_path)
            writer.write_table(json_input)

            mmap = pa.memory_map(cache_path)
            f = pa.ipc.open_stream(mmap)
            pa_table = f.read_all()

            converted_tables.append(pa_table)

        if not no_combine:
            pa_table = pa.concat_tables(converted_tables, promote=False)

            writer = nlp.arrow_writer.ArrowWriter(path=save_path)
            writer.write_table(pa_table)
Example #4
0
import logging
from os import remove
from pathlib import Path
from typing import List, Union

import pyarrow as pa
import pyarrow.json as pa_json
import pyarrow.parquet as pq

logger = logging.getLogger(__name__)


def convert_ndjsons_to_parquet(files: List[Path], file_name: str,
                               out_dir: Union[Path, str],
                               schema: pa.Schema) -> Path:
    pq_file = Path(f"{out_dir}/{file_name}.parquet")
    if not schema:
        schema = pa_json.read_json(files[0]).schema
    with pq.ParquetWriter(pq_file, schema) as writer:
        parse_options = pa_json.ParseOptions(explicit_schema=schema)
        for f in files:
            logger.debug(f"Processing {f}")
            table = pa_json.read_json(f, parse_options=parse_options)
            writer.write_table(table)
            remove(f)
    return pq_file
Example #5
0
    def _generate_tables(self, files):
        for i, file in enumerate(files):
            if self.config.field is not None:
                with open(file, encoding="utf-8") as f:
                    dataset = json.load(f)

                # We keep only the field we are interested in
                dataset = dataset[self.config.field]

                # We accept two formats: a list of dicts or a dict of lists
                if isinstance(dataset, (list, tuple)):
                    pa_table = paj.read_json(
                        BytesIO("\n".join(json.dumps(row)
                                          for row in dataset).encode("utf-8")),
                        read_options=self.config.pa_read_options,
                        parse_options=self.config.pa_parse_options,
                    )
                else:
                    pa_table = pa.Table.from_pydict(mapping=dataset)
            else:
                try:
                    pa_table = paj.read_json(
                        file,
                        read_options=self.config.pa_read_options,
                        parse_options=self.config.pa_parse_options,
                    )
                except pa.ArrowInvalid:
                    with open(file, encoding="utf-8") as f:
                        dataset = json.load(f)
                    raise ValueError(
                        f"Not able to read records in the JSON file at {file}. "
                        f"You should probably indicate the field of the JSON file containing your records. "
                        f"This JSON file contain the following fields: {str(list(dataset.keys()))}. "
                        f"Select the correct one and provide it as `field='XXX'` to the dataset loading method. "
                    )
            if self.config.features:
                # Encode column if ClassLabel
                # (use a separate index name so we don't shadow the file index `i` yielded below)
                for col_idx, col in enumerate(self.config.features.keys()):
                    if isinstance(self.config.features[col],
                                  datasets.ClassLabel):
                        pa_table = pa_table.set_column(
                            col_idx, self.config.schema.field(col),
                            [self.config.features[col].str2int(pa_table[col])])
                # Cast allows str <-> int/float, while parse_option explicit_schema does NOT
                # Before casting, rearrange JSON field names to match passed features schema field names order
                pa_table = pa.Table.from_arrays(
                    [pa_table[name] for name in self.config.features],
                    schema=self.config.schema)
            yield i, pa_table
Example #6
0
def arrow_from_json(r, *args, **kwargs):
    """ Read the stream from s3 and turn JSON into an Arrow Table. """
    table = json.read_json(r)
    print("Created Dataframe with dimensions: (nrow, ncol) = %s" %
          str(table.shape),
          file=sys.stderr)
    return table
Example #7
0
    def from_jsonl(
        cls,
        json_path: str,
        identifier: Identifier = None,
        dataset_fmt: str = "in_memory",
    ) -> Dataset:
        """Load a dataset from a .jsonl file on disk, where each line of the
        json file consists of a single example."""

        if dataset_fmt == "in_memory":
            # Load the .jsonl file
            with open(json_path) as f:
                data = [json.loads(line) for line in f]

            return cls(
                data,
                identifier=identifier,
                dataset_fmt=dataset_fmt,
            )

        elif dataset_fmt == "datasets":
            # Use jsonarrow to directly load the json
            return cls(
                jsonarrow.read_json(json_path),
                identifier=identifier,
                dataset_fmt=dataset_fmt,
            )
        else:
            raise NotImplementedError
Example #8
0
def pa_read_json(
    input_file: Union[IO, str],
    schema: Union[pa.Schema, Metadata, dict] = None,
    expect_full_schema: bool = True,
    **kwargs,
):
    """Read a jsonlines file into an Arrow table.
    Args:
        input_file (Union[IO, str]): the JSONL you want to read. string, path or
            file-like object.
        schema (pyarrow.Schema): pyarrow Schema with the expected columns wanted.
            If unset pyarrow will infer datatypes.
        expect_full_schema (bool, optional): if True, pyarrow reader will
            expect the input schema to have fields for every col in the
            input file. If False, then will only cast columns that
            are listed in the schema, leaving all other columns to their
            default type on read.
        **kwargs (optional): Additional kwargs are passed to pyarrow.json.read_json
    Returns:
        pyarrow.Table: the jsonl file in pyarrow format casted to the specified schema
    """

    if schema:
        schema = _get_arrow_schema(schema)

    pa_json_table = json.read_json(input_file, **kwargs)

    if schema:
        pa_json_table = cast_arrow_table_to_schema(
            pa_json_table,
            schema=schema,
            expect_full_schema=expect_full_schema)

    return pa_json_table
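The helpers _get_arrow_schema and cast_arrow_table_to_schema are not shown above. As a rough sketch of the expect_full_schema=True behaviour using only built-in pyarrow (Table.cast stands in for the casting helper; the data is made up):

import io

import pyarrow as pa
import pyarrow.json as paj

data = b'{"id": "1", "score": "3.5"}\n{"id": "2", "score": "4.0"}\n'
target = pa.schema([("id", pa.int64()), ("score", pa.float64())])

inferred = paj.read_json(io.BytesIO(data))  # both columns are inferred as string
casted = inferred.cast(target)              # str -> int/float, which explicit_schema alone would not do
print(casted.schema)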
Example #9
0
 def from_json(cls, json_path: str, identifier: Identifier) -> Dataset:
     """Load a dataset from a JSON file on disk, where each line of the json
     file consists of a single example."""
     return cls(
         jsonarrow.read_json(json_path),
         identifier=identifier,
     )
Example #10
0
 def _generate_tables(self, files):
     for i, file in enumerate(files):
         pa_table = paj.read_json(
             file,
             read_options=self.config.pa_read_options,
             parse_options=self.config.pa_parse_options,
         )
         yield i, pa_table
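The loader above relies on pre-built pa_read_options and pa_parse_options objects coming from its config. A hedged sketch of what such options might contain (the schema, column names and "data.jsonl" path are placeholders; ReadOptions/ParseOptions and their arguments are real pyarrow API):

import pyarrow as pa
import pyarrow.json as paj

# How the reader slices the input into blocks and whether it parses them in parallel
read_options = paj.ReadOptions(use_threads=True, block_size=1 << 20)

# Optional explicit schema; unexpected_field_behavior is one of "infer", "error", "ignore"
parse_options = paj.ParseOptions(
    explicit_schema=pa.schema([("text", pa.string()), ("label", pa.int64())]),
    unexpected_field_behavior="ignore",
)

table = paj.read_json("data.jsonl",
                      read_options=read_options,
                      parse_options=parse_options)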
Example #11
0
    def _read_file(self, f: "pyarrow.NativeFile", **arrow_reader_args):
        from pyarrow import json

        read_options = arrow_reader_args.pop(
            "read_options", json.ReadOptions(use_threads=False))
        return json.read_json(f,
                              read_options=read_options,
                              **arrow_reader_args)
Example #12
0
    def _generate_tables(self, files):
        for file_idx, file in enumerate(files):

            # If the file is one json object and if we need to look at the list of items in one specific field
            if self.config.field is not None:
                with open(file, encoding="utf-8") as f:
                    dataset = json.load(f)

                # We keep only the field we are interested in
                dataset = dataset[self.config.field]

                # We accept two formats: a list of dicts or a dict of lists
                if isinstance(dataset, (list, tuple)):
                    mapping = {
                        col: [dataset[i][col] for i in range(len(dataset))]
                        for col in dataset[0].keys()
                    }
                else:
                    mapping = dataset
                pa_table = pa.Table.from_pydict(mapping=mapping)
                yield file_idx, self._cast_classlabels(pa_table)

            # If the file has one json object per line
            else:
                with open(file, "rb") as f:
                    batch_idx = 0
                    while True:
                        batch = f.read(self.config.chunksize)
                        if not batch:
                            break
                        batch += f.readline()  # finish current line
                        try:
                            pa_table = paj.read_json(BytesIO(batch))
                        except pa.ArrowInvalid as e:
                            logger.error(
                                f"Failed to read file '{file}' with error {type(e)}: {e}"
                            )
                            try:
                                with open(file, encoding="utf-8") as f:
                                    dataset = json.load(f)
                            except json.JSONDecodeError:
                                raise e
                            raise ValueError(
                                f"Not able to read records in the JSON file at {file}. "
                                f"You should probably indicate the field of the JSON file containing your records. "
                                f"This JSON file contain the following fields: {str(list(dataset.keys()))}. "
                                f"Select the correct one and provide it as `field='XXX'` to the dataset loading method. "
                            )
                        # Uncomment for debugging (will print the Arrow table size and elements)
                        # logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}")
                        # logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))
                        yield (file_idx,
                               batch_idx), self._cast_classlabels(pa_table)
                        batch_idx += 1
Example #13
0
 def json_read(read_paths: List[str]):
     logger.debug(f"Reading {len(read_paths)} files.")
     tables = []
     for read_path in read_paths:
         with filesystem.open_input_file(read_path) as f:
             tables.append(
                 json.read_json(
                     f,
                     read_options=json.ReadOptions(use_threads=False),
                     **arrow_json_args))
     block = ArrowBlock(pa.concat_tables(tables))
     return block, block.get_metadata(input_files=read_paths)
Example #14
0
def import_table(source: str):
    if not source:
        return sample_table()
    if source.endswith(".csv"):
        from pyarrow import csv
        return csv.read_csv(source)
    if source.endswith(".json"):
        from pyarrow import json
        return json.read_json(source)
    if source.endswith(".parquet"):
        return pq.read_table(source)
    raise ValueError("source must be csv, json or parquet")
Example #15
0
def readZippedFile(file: str, verbose: bool = False) -> pd.DataFrame:
    """Read a zipped file.
    Reads a dataset export file as exported and downloaded from Pega. The export
    file is formatted as a zipped multi-line JSON file or CSV file
    and the data is read into a pandas dataframe.

    Parameters
    ----------
    file : str
        The full path to the file
    verbose : bool, default=False
        Whether to print the names of the files within the unzipped file for debugging purposes

    Returns
    -------
    pd.DataFrame
        A pandas dataframe with the contents.
    """

    with zipfile.ZipFile(file, mode="r") as z:
        files = z.namelist()
        if verbose:
            print(files)  # pragma: no cover
        if "data.json" in files:
            with z.open("data.json") as zippedfile:
                try:
                    from pyarrow import json

                    return json.read_json(
                        zippedfile).to_pandas()  # pragma: no cover
                except ImportError:  # pragma: no cover
                    try:
                        dataset = pd.read_json(zippedfile, lines=True)
                        return dataset
                    except ValueError:
                        dataset = pd.read_json(zippedfile)
                        return dataset
        if "csv.json" in files:  # pragma: no cover
            with z.open("data.csv") as zippedfile:
                try:
                    from pyarrow import csv

                    return csv.read_json(zippedfile).to_pandas()
                except ImportError:
                    return pd.read_csv(zippedfile)
        else:  # pragma: no cover
            raise FileNotFoundError(
                "Cannot find a 'data' file in the zip folder.")
Example #16
0
def _read_table_from_source(
        source: Union[pd.DataFrame, str]) -> Tuple[pa.Table, List[str]]:
    """
    Infers a data source type (path or Pandas DataFrame) and reads it in as
    a PyArrow Table.

    Args:
        source (Union[pd.DataFrame, str]):
            Either a string path or Pandas DataFrame.

    Returns:
        Tuple[pa.Table, List[str]]:
            Tuple containing PyArrow table of dataset, and column names of PyArrow table.
    """

    # Pandas DataFrame detected
    if isinstance(source, pd.DataFrame):
        table = pa.Table.from_pandas(df=source)

    # Inferring a string path
    elif isinstance(source, str):
        file_path = source
        filename, file_ext = os.path.splitext(file_path)

        if ".csv" in file_ext:
            from pyarrow import csv

            table = csv.read_csv(file_path)
        elif ".json" in file_ext:
            from pyarrow import json

            table = json.read_json(file_path)
        else:
            table = pq.read_table(file_path)
    else:
        raise ValueError(
            f"Unknown data source provided for ingestion: {source}")

    # Ensure that PyArrow table is initialised
    assert isinstance(table, pa.lib.Table)

    column_names = table.column_names

    return table, column_names
Example #17
0
 def _init_table_from_path(self):
     if '.jsonl' in self.path.suffixes:
         # Can read ".jsonl" or ".jsonl.gz"
         import pyarrow.json as paj
         self.table = paj.read_json(
             str(self.path),
             read_options=paj.ReadOptions(
                 # magic constants:
                 # 894 - estimated average number of bytes per JSON item manifest
                 # 10000 - how many items we want to have in a chunk (Arrow's "batch")
                 block_size=894 * 10000))
     elif '.arrow' == self.path.suffixes[-1]:
         # Can read ".arrow"
         import pyarrow as pa
         mmap = pa.memory_map(str(self.path))
         stream = pa.ipc.open_file(mmap)
         self.table = stream.read_all()
     else:
         raise ValueError(f"Unknown LazyDict file format : '{self.path}'")
Example #18
0
    def _generate_tables(self, files):
        for file_idx, file in enumerate(files):

            # If the file is one json object and if we need to look at the list of items in one specific field
            if self.config.field is not None:
                with open(file, encoding="utf-8") as f:
                    dataset = json.load(f)

                # We keep only the field we are interested in
                dataset = dataset[self.config.field]

                # We accept two formats: a list of dicts or a dict of lists
                if isinstance(dataset, (list, tuple)):
                    mapping = {
                        col: [dataset[i][col] for i in range(len(dataset))]
                        for col in dataset[0].keys()
                    }
                else:
                    mapping = dataset
                pa_table = pa.Table.from_pydict(mapping=mapping)
                yield file_idx, self._cast_classlabels(pa_table)

            # If the file has one json object per line
            else:
                with open(file, "rb") as f:
                    batch_idx = 0
                    # Use block_size equal to the chunk size divided by 32 to leverage multithreading
                    # Set a default minimum value of 16kB if the chunk size is really small
                    block_size = max(self.config.chunksize // 32, 16 << 10)
                    while True:
                        batch = f.read(self.config.chunksize)
                        if not batch:
                            break
                        # Finish current line
                        try:
                            batch += f.readline()
                        except (AttributeError, io.UnsupportedOperation):
                            batch += readline(f)
                        try:
                            while True:
                                try:
                                    pa_table = paj.read_json(
                                        io.BytesIO(batch),
                                        read_options=paj.ReadOptions(
                                            block_size=block_size))
                                    break
                                except (pa.ArrowInvalid,
                                        pa.ArrowNotImplementedError) as e:
                                    if (isinstance(e, pa.ArrowInvalid)
                                            and "straddling" not in str(e)
                                            or block_size > len(batch)):
                                        raise
                                    else:
                                        # Increase the block size in case it was too small.
                                        # The block size will be reset for the next file.
                                        logger.debug(
                                            f"Batch of {len(batch)} bytes couldn't be parsed with block_size={block_size}. Retrying with block_size={block_size * 2}."
                                        )
                                        block_size *= 2
                        except pa.ArrowInvalid as e:
                            logger.error(
                                f"Failed to read file '{file}' with error {type(e)}: {e}"
                            )
                            try:
                                with open(file, encoding="utf-8") as f:
                                    dataset = json.load(f)
                            except json.JSONDecodeError:
                                raise e
                            raise ValueError(
                                f"Not able to read records in the JSON file at {file}. "
                                f"You should probably indicate the field of the JSON file containing your records. "
                                f"This JSON file contain the following fields: {str(list(dataset.keys()))}. "
                                f"Select the correct one and provide it as `field='XXX'` to the dataset loading method. "
                            ) from None
                        # Uncomment for debugging (will print the Arrow table size and elements)
                        # logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}")
                        # logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))
                        yield (file_idx,
                               batch_idx), self._cast_classlabels(pa_table)
                        batch_idx += 1
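The retry-and-double loop above handles rows that straddle Arrow's parse blocks. A self-contained toy of the same idea (the oversized row is synthetic; the loop mirrors the error handling above rather than matching an exact error message, since the ArrowInvalid text can vary between pyarrow versions):

import io
import json

import pyarrow as pa
import pyarrow.json as paj

batch = json.dumps({"text": "x" * 100_000}).encode("utf-8") + b"\n"
block_size = 16 << 10  # deliberately smaller than the single row above

while True:
    try:
        pa_table = paj.read_json(io.BytesIO(batch),
                                 read_options=paj.ReadOptions(block_size=block_size))
        break
    except (pa.ArrowInvalid, pa.ArrowNotImplementedError):
        if block_size > len(batch):
            raise  # the block already covers the whole batch, so it is not a block-size problem
        block_size *= 2  # the row straddled a block boundary; retry with a bigger block

print(pa_table.num_rows, "row parsed with block_size =", block_size)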
Example #19
0
def query_json_file(f, column, val):
    table = pj.read_json(f)
    return [i for i in table.column(column) if i == val]
Example #20
0
def readDSExport(
    filename: Union[pd.DataFrame, str],
    path: str = ".",
    verbose: bool = True,
    force_pandas: bool = False,
    **kwargs,
) -> pd.DataFrame:
    """Read a Pega dataset export file.
    Can accept either a Pandas DataFrame or one of the following formats:
    - .csv
    - .json
    - .zip (zipped json or CSV)

    It automatically infers the default file names for both model data as well as predictor data.
    If you supply either 'modelData' or 'predictorData' as the 'filename' argument, it will search for them.
    If you supply the full name of the file in the 'path' directory, it will import that instead.

    Parameters
    ----------
    filename : [pd.DataFrame, str]
        Either a Pandas DataFrame with the source data (for compatibility),
        or a string, in which case it can either be:
        - The name of the file (if a custom name) or
        - Whether we want to look for 'modelData' or 'predictorData' in the path folder.
    path : str, default = '.'
        The location of the file
    verbose : bool, default = True
        Whether to print out which file will be imported

    Keyword arguments:
        Any arguments to plug into the read csv or json function, from either PyArrow or Pandas.

    Returns
    -------
    pd.DataFrame
        The read data from the given file

    Examples:
        >>> df = readDSExport(filename = 'modelData', path = './datamart')
        >>> df = readDSExport(filename = 'ModelSnapshot.json', path = 'data/ADMData')

        >>> df = pd.read_csv('file.csv')
        >>> df = readDSExport(filename = df)

    """
    if isinstance(filename, pd.DataFrame):
        return filename

    is_url = False

    if os.path.isfile(os.path.join(path, filename)):
        file = os.path.join(path, filename)
    else:
        file = get_latest_file(path, filename)
        if file == "Target not found":
            import requests

            try:
                response = requests.get(f"{path}/{filename}")
                is_url = True if response.status_code == 200 else False
            except Exception:
                is_url = False
            if is_url:
                file = f"{path}/{filename}"
                if file.split(".")[-1] == "zip":
                    file = urllib.request.urlopen(f"{path}/{filename}")
                if verbose:
                    print("File found through URL")
    if file in [None, "Target not found"]:
        if verbose:
            print(f"File {filename} not found in dir {path}")
        return None

    if isinstance(file, str):
        extension = file.split(".")[-1]
    elif isinstance(file, http.client.HTTPResponse):
        extension = "zipped"

    if verbose:
        if is_url:
            print(f"Importing: {os.path.join(path, filename)}")
        else:
            print(f"Importing: {file}")

    if extension == "parquet":  # pragma: no cover
        try:
            import pyarrow.parquet as pq

            return pq.read_table(file).to_pandas()
        except ImportError:
            print("You need to import pyarrow to read parquet files.")
    if extension == "csv":
        try:
            if force_pandas or is_url:
                raise ImportError("Forcing pandas.")
            from pyarrow import csv, ArrowInvalid

            try:  # pragma: no cover
                return csv.read_csv(
                    file,
                    parse_options=csv.ParseOptions(
                        delimiter=kwargs.get("sep", ",")),
                ).to_pandas()
            except ArrowInvalid:  # pragma: no cover
                raise ImportError()
        except ImportError:
            if not is_url:
                if verbose:
                    print(
                        "Can't import pyarrow, so defaulting to pandas. For faster imports, please install pyarrow."
                    )
            return pd.read_csv(file, **kwargs)
        except OSError:  # pragma: no cover
            raise FileNotFoundError(f"File {file} is not found.")
    elif extension == "json":
        try:  # pragma: no cover
            if force_pandas:
                raise ImportError("Forcing pandas.")
            from pyarrow import json, ArrowInvalid

            try:
                return json.read_json(file, **kwargs).to_pandas()
            except ArrowInvalid:
                raise ImportError()
        except ImportError:  # pragma: no cover
            if verbose:
                print(
                    "Can't import pyarrow, so defaulting to pandas. For faster imports, please install pyarrow."
                )

            try:
                return pd.read_json(file, lines=True, **kwargs)
            except ValueError:
                return pd.read_json(file, **kwargs)
        except OSError:  # pragma: no cover
            raise FileNotFoundError(f"File {file} is not found.")
    else:
        try:
            if is_url and extension == "zipped":
                return readZippedFile(file=BytesIO(file.read()))
            elif extension == "zip":
                return readZippedFile(file=file)
            else:
                raise FileNotFoundError(
                    f"File {file} is not found.")  # pragma: no cover
        except OSError:  # pragma: no cover
            raise FileNotFoundError(f"File {file} is not found.")
Example #21
0
 def read_json(self, *args, **kwargs):
     read_options = kwargs.setdefault('read_options', ReadOptions())
     read_options.use_threads = True
     table = read_json(*args, **kwargs)
     table.validate()
     return table
Example #22
0
 def read_json(self, *args, **kwargs):
     read_options = kwargs.setdefault('read_options', ReadOptions())
     read_options.use_threads = True
     table = read_json(*args, **kwargs)
     table._validate()
     return table
Example #23
0
def _read_table_from_source(source: Union[pd.DataFrame, str], chunk_size: int,
                            max_workers: int) -> Tuple[str, str]:
    """
    Infers a data source type (path or Pandas DataFrame) and reads it in as
    a PyArrow Table.

    The PyArrow Table that is read will be written to a parquet file with row
    group size determined by the minimum of:
        * (table.num_rows / max_workers)
        * chunk_size

    The parquet file that is created will be passed as file path to the
    multiprocessing pool workers.

    Args:
        source (Union[pd.DataFrame, str]):
            Either a string path or Pandas DataFrame.

        chunk_size (int):
            Amount of rows to load and ingest at a time.

        max_workers (int):
            Number of worker processes to use to encode values.

    Returns:
        Tuple[str, str]:
            Tuple containing parent directory path and destination path to
            parquet file.
    """

    # Pandas DataFrame detected
    if isinstance(source, pd.DataFrame):
        table = pa.Table.from_pandas(df=source)

    # Inferring a string path
    elif isinstance(source, str):
        file_path = source
        filename, file_ext = os.path.splitext(file_path)

        if ".csv" in file_ext:
            from pyarrow import csv

            table = csv.read_csv(file_path)
        elif ".json" in file_ext:
            from pyarrow import json

            table = json.read_json(file_path)
        else:
            table = pq.read_table(file_path)
    else:
        raise ValueError(
            f"Unknown data source provided for ingestion: {source}")

    # Ensure that PyArrow table is initialised
    assert isinstance(table, pa.lib.Table)

    # Write table as parquet file with a specified row_group_size
    dir_path = tempfile.mkdtemp()
    tmp_table_name = f"{int(time.time())}.parquet"
    dest_path = f"{dir_path}/{tmp_table_name}"
    row_group_size = min(ceil(table.num_rows / max_workers), chunk_size)
    pq.write_table(table=table, where=dest_path, row_group_size=row_group_size)

    # Remove table from memory
    del table

    return dir_path, dest_path
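The row-group sizing above is plain arithmetic. A self-contained toy with made-up numbers showing the same min(ceil(rows / max_workers), chunk_size) rule fed into pq.write_table:

import tempfile
from math import ceil

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"id": list(range(10_000))})
max_workers, chunk_size = 8, 2_000

# 10_000 rows / 8 workers = 1_250 rows per group, which is below the 2_000 cap
row_group_size = min(ceil(table.num_rows / max_workers), chunk_size)

dest_path = f"{tempfile.mkdtemp()}/toy.parquet"
pq.write_table(table=table, where=dest_path, row_group_size=row_group_size)
print(pq.ParquetFile(dest_path).num_row_groups)  # 10_000 / 1_250 = 8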
Example #24
0
    def _split_generators(self, dl_manager):
        if not self.config.data_files:
            raise ValueError(
                f"At least one data file must be specified, but got data_files={self.config.data_files}"
            )

        # Do an early pass if:
        # * `features` are not specified, to infer the class labels
        # * `drop_metadata` is False, to find the metadata files
        do_analyze = (
            self.config.features is None
            and not self.config.drop_labels) or not self.config.drop_metadata
        if do_analyze:
            labels = set()
            metadata_files = collections.defaultdict(list)

            def analyze(files_or_archives, downloaded_files_or_dirs, split):
                if len(downloaded_files_or_dirs) == 0:
                    return
                # The files are separated from the archives at this point, so check the first sample
                # to see if it's a file or a directory and iterate accordingly
                if os.path.isfile(downloaded_files_or_dirs[0]):
                    original_files, downloaded_files = files_or_archives, downloaded_files_or_dirs
                    for original_file, downloaded_file in zip(
                            original_files, downloaded_files):
                        original_file, downloaded_file = str(
                            original_file), str(downloaded_file)
                        _, original_file_ext = os.path.splitext(original_file)
                        if original_file_ext.lower() in self.IMAGE_EXTENSIONS:
                            labels.add(
                                os.path.basename(
                                    os.path.dirname(original_file)))
                        elif os.path.basename(
                                original_file) == self.METADATA_FILENAME:
                            metadata_files[split].append(
                                (original_file, downloaded_file))
                        else:
                            original_file_name = os.path.basename(
                                original_file)
                            logger.debug(
                                f"The file '{original_file_name}' was ignored: it is not an image, and is not {self.METADATA_FILENAME} either."
                            )
                else:
                    archives, downloaded_dirs = files_or_archives, downloaded_files_or_dirs
                    for archive, downloaded_dir in zip(archives,
                                                       downloaded_dirs):
                        archive, downloaded_dir = str(archive), str(
                            downloaded_dir)
                        for downloaded_dir_file in dl_manager.iter_files(
                                downloaded_dir):
                            _, downloaded_dir_file_ext = os.path.splitext(
                                downloaded_dir_file)
                            if downloaded_dir_file_ext in self.IMAGE_EXTENSIONS:
                                labels.add(
                                    os.path.basename(
                                        os.path.dirname(downloaded_dir_file)))
                            elif os.path.basename(downloaded_dir_file
                                                  ) == self.METADATA_FILENAME:
                                metadata_files[split].append(
                                    (None, downloaded_dir_file))
                            else:
                                archive_file_name = os.path.basename(archive)
                                original_file_name = os.path.basename(
                                    downloaded_dir_file)
                                logger.debug(
                                    f"The file '{original_file_name}' from the archive '{archive_file_name}' was ignored: it is not an image, and is not {self.METADATA_FILENAME} either."
                                )

            if not self.config.drop_labels:
                logger.info("Inferring labels from data files...")
            if not self.config.drop_metadata:
                logger.info("Analyzing metadata files...")

        data_files = self.config.data_files
        splits = []
        for split_name, files in data_files.items():
            if isinstance(files, str):
                files = [files]
            files, archives = self._split_files_and_archives(files)
            downloaded_files = dl_manager.download(files)
            downloaded_dirs = dl_manager.download_and_extract(archives)
            if do_analyze:
                analyze(files, downloaded_files, split_name)
                analyze(archives, downloaded_dirs, split_name)
            splits.append(
                datasets.SplitGenerator(
                    name=split_name,
                    gen_kwargs={
                        "files": [(file, downloaded_file)
                                  for file, downloaded_file in zip(
                                      files, downloaded_files)] +
                        [(None, dl_manager.iter_files(downloaded_dir))
                         for downloaded_dir in downloaded_dirs],
                        "metadata_files":
                        metadata_files
                        if not self.config.drop_metadata else None,
                        "split_name":
                        split_name,
                    },
                ))

        if not self.config.drop_metadata and metadata_files:
            # Verify that:
            # * all metadata files have the same set of features
            # * the `file_name` key is one of the metadata keys and is of type string
            features_per_metadata_file: List[Tuple[str,
                                                   datasets.Features]] = []
            for _, downloaded_metadata_file in itertools.chain.from_iterable(
                    metadata_files.values()):
                with open(downloaded_metadata_file, "rb") as f:
                    pa_metadata_table = paj.read_json(f)
                features_per_metadata_file.append(
                    (downloaded_metadata_file,
                     datasets.Features.from_arrow_schema(
                         pa_metadata_table.schema)))
            for downloaded_metadata_file, metadata_features in features_per_metadata_file:
                if metadata_features != features_per_metadata_file[0][1]:
                    raise ValueError(
                        f"Metadata files {downloaded_metadata_file} and {features_per_metadata_file[0][0]} have different features: {features_per_metadata_file[0]} != {metadata_features}"
                    )
            metadata_features = features_per_metadata_file[0][1]
            if "file_name" not in metadata_features:
                raise ValueError(
                    "`file_name` must be present as dictionary key in metadata files"
                )
            if metadata_features["file_name"] != datasets.Value("string"):
                raise ValueError("`file_name` key must be a string")
            del metadata_features["file_name"]
        else:
            metadata_features = None

        # Normally, we would do this in _info, but we need to know the labels and/or metadata
        # before building the features
        if self.config.features is None:
            if not self.config.drop_labels and not metadata_files:
                self.info.features = datasets.Features({
                    "image":
                    datasets.Image(),
                    "label":
                    datasets.ClassLabel(names=sorted(labels))
                })
                task_template = ImageClassification(image_column="image",
                                                    label_column="label")
                task_template = task_template.align_with_features(
                    self.info.features)
                self.info.task_templates = [task_template]
            else:
                self.info.features = datasets.Features(
                    {"image": datasets.Image()})

            if not self.config.drop_metadata and metadata_files:
                # Verify that there are no duplicated keys when compared to the existing features ("image", optionally "label")
                duplicated_keys = set(
                    self.info.features) & set(metadata_features)
                if duplicated_keys:
                    raise ValueError(
                        f"Metadata feature keys {list(duplicated_keys)} are already present as the image features"
                    )
                self.info.features.update(metadata_features)

        return splits
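A minimal sketch of the metadata.jsonl check performed above, using only pyarrow (the file content is made up):

import io

import pyarrow.json as paj

metadata_jsonl = (b'{"file_name": "cat/1.jpg", "caption": "a cat"}\n'
                  b'{"file_name": "dog/2.jpg", "caption": "a dog"}\n')

pa_metadata_table = paj.read_json(io.BytesIO(metadata_jsonl))

# metadata files must expose a string `file_name` column; every other column becomes an extra feature
assert "file_name" in pa_metadata_table.column_names
print(pa_metadata_table.drop(["file_name"]).column_names)  # ['caption']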
Example #25
0
    def _generate_examples(self, files, metadata_files, split_name):
        if not self.config.drop_metadata and metadata_files:
            split_metadata_files = metadata_files.get(split_name, [])
            image_empty_metadata = {
                k: None
                for k in self.info.features if k != "image"
            }

            last_checked_dir = None
            metadata_dir = None
            metadata_dict = None
            downloaded_metadata_file = None

            file_idx = 0
            for original_file, downloaded_file_or_dir in files:
                if original_file is not None:
                    _, original_file_ext = os.path.splitext(original_file)
                    if original_file_ext.lower() in self.IMAGE_EXTENSIONS:
                        # If the file is an image, and we've just entered a new directory,
                        # find the nearest metadata file (by counting path segments) for the directory
                        current_dir = os.path.dirname(original_file)
                        if last_checked_dir is None or last_checked_dir != current_dir:
                            last_checked_dir = current_dir
                            metadata_file_candidates = [
                                (
                                    os.path.relpath(
                                        original_file,
                                        os.path.dirname(
                                            metadata_file_candidate)),
                                    metadata_file_candidate,
                                    downloaded_metadata_file,
                                ) for metadata_file_candidate,
                                downloaded_metadata_file in
                                split_metadata_files
                                if metadata_file_candidate is
                                not None  # ignore metadata_files that are inside archives
                                and not os.path.relpath(
                                    original_file,
                                    os.path.dirname(metadata_file_candidate)
                                ).startswith("..")
                            ]
                            if metadata_file_candidates:
                                _, metadata_file, downloaded_metadata_file = min(
                                    metadata_file_candidates,
                                    key=lambda x: count_path_segments(x[0]))
                                with open(downloaded_metadata_file, "rb") as f:
                                    pa_metadata_table = paj.read_json(f)
                                pa_file_name_array = pa_metadata_table[
                                    "file_name"]
                                pa_file_name_array = pc.replace_substring(
                                    pa_file_name_array,
                                    pattern="\\",
                                    replacement="/")
                                pa_metadata_table = pa_metadata_table.drop(
                                    ["file_name"])
                                metadata_dir = os.path.dirname(metadata_file)
                                metadata_dict = {
                                    file_name: image_metadata
                                    for file_name, image_metadata in zip(
                                        pa_file_name_array.to_pylist(),
                                        pa_table_to_pylist(pa_metadata_table))
                                }
                            else:
                                raise ValueError(
                                    f"One or several metadata.jsonl were found, but not in the same directory or in a parent directory of {downloaded_file_or_dir}."
                                )
                        if metadata_dir is not None and downloaded_metadata_file is not None:
                            file_relpath = os.path.relpath(
                                original_file, metadata_dir)
                            file_relpath = file_relpath.replace("\\", "/")
                            if file_relpath not in metadata_dict:
                                raise ValueError(
                                    f"Image at {file_relpath} doesn't have metadata in {downloaded_metadata_file}."
                                )
                            image_metadata = metadata_dict[file_relpath]
                        else:
                            raise ValueError(
                                f"One or several metadata.jsonl were found, but not in the same directory or in a parent directory of {downloaded_file_or_dir}."
                            )
                        yield file_idx, {
                            **image_empty_metadata,
                            "image": downloaded_file_or_dir,
                            **image_metadata,
                        }
                        file_idx += 1
                else:
                    for downloaded_dir_file in downloaded_file_or_dir:
                        _, downloaded_dir_file_ext = os.path.splitext(
                            downloaded_dir_file)
                        if downloaded_dir_file_ext.lower(
                        ) in self.IMAGE_EXTENSIONS:
                            current_dir = os.path.dirname(downloaded_dir_file)
                            if last_checked_dir is None or last_checked_dir != current_dir:
                                last_checked_dir = current_dir
                                metadata_file_candidates = [
                                    (
                                        os.path.relpath(
                                            downloaded_dir_file,
                                            os.path.dirname(
                                                downloaded_metadata_file)),
                                        metadata_file_candidate,
                                        downloaded_metadata_file,
                                    ) for metadata_file_candidate,
                                    downloaded_metadata_file in
                                    split_metadata_files
                                    if metadata_file_candidate is
                                    None  # ignore metadata_files that are not inside archives
                                    and not os.path.relpath(
                                        downloaded_dir_file,
                                        os.path.dirname(
                                            downloaded_metadata_file)
                                    ).startswith("..")
                                ]
                                if metadata_file_candidates:
                                    _, metadata_file, downloaded_metadata_file = min(
                                        metadata_file_candidates,
                                        key=lambda x: count_path_segments(x[0]
                                                                          ))
                                    with open(downloaded_metadata_file,
                                              "rb") as f:
                                        pa_metadata_table = paj.read_json(f)
                                    pa_file_name_array = pa_metadata_table[
                                        "file_name"]
                                    pa_file_name_array = pc.replace_substring(
                                        pa_file_name_array,
                                        pattern="\\",
                                        replacement="/")
                                    pa_metadata_table = pa_metadata_table.drop(
                                        ["file_name"])
                                    metadata_dir = os.path.dirname(
                                        downloaded_metadata_file)
                                    metadata_dict = {
                                        file_name: image_metadata
                                        for file_name, image_metadata in zip(
                                            pa_file_name_array.to_pylist(),
                                            pa_table_to_pylist(
                                                pa_metadata_table))
                                    }
                                else:
                                    raise ValueError(
                                        f"One or several metadata.jsonl were found, but not in the same directory or in a parent directory of {downloaded_dir_file}."
                                    )
                            if metadata_dir is not None and downloaded_metadata_file is not None:
                                downloaded_dir_file_relpath = os.path.relpath(
                                    downloaded_dir_file, metadata_dir)
                                downloaded_dir_file_relpath = downloaded_dir_file_relpath.replace(
                                    "\\", "/")
                                if downloaded_dir_file_relpath not in metadata_dict:
                                    raise ValueError(
                                        f"Image at {downloaded_dir_file_relpath} doesn't have metadata in {downloaded_metadata_file}."
                                    )
                                image_metadata = metadata_dict[
                                    downloaded_dir_file_relpath]
                            else:
                                raise ValueError(
                                    f"One or several metadata.jsonl were found, but not in the same directory or in a parent directory of {downloaded_dir_file}."
                                )
                            yield file_idx, {
                                **image_empty_metadata,
                                "image": downloaded_dir_file,
                                **image_metadata,
                            }
                            file_idx += 1
        else:
            file_idx = 0
            for original_file, downloaded_file_or_dir in files:
                if original_file is not None:
                    _, original_file_ext = os.path.splitext(original_file)
                    if original_file_ext.lower() in self.IMAGE_EXTENSIONS:
                        if self.config.drop_labels or metadata_files:
                            yield file_idx, {
                                "image": downloaded_file_or_dir,
                            }
                        else:
                            yield file_idx, {
                                "image":
                                downloaded_file_or_dir,
                                "label":
                                os.path.basename(
                                    os.path.dirname(original_file)),
                            }
                        file_idx += 1
                else:
                    for downloaded_dir_file in downloaded_file_or_dir:
                        _, downloaded_dir_file_ext = os.path.splitext(
                            downloaded_dir_file)
                        if downloaded_dir_file_ext.lower(
                        ) in self.IMAGE_EXTENSIONS:
                            if self.config.drop_labels or metadata_files:
                                yield file_idx, {
                                    "image": downloaded_dir_file,
                                }
                            else:
                                yield file_idx, {
                                    "image":
                                    downloaded_dir_file,
                                    "label":
                                    os.path.basename(
                                        os.path.dirname(downloaded_dir_file)),
                                }
                            file_idx += 1
Example #26
0
from pyarrow import json
import pyarrow.parquet as pq
import pyarrow as pa
from glob import glob

import time

t = time.time_ns()

fn = "tests/data/formats/tweets"
table = json.read_json(fn + ".jsonl")
pq.write_table(table, "parc.parquet", compression="ZSTD")
# table = pq.read_table(fn + '.parquet')

# print(table.schema)
# print(table.column_names)
# print(table.select(['username']))
# print(table.take([0]).to_pydict())
print(table.take([0, 500, 9000])["username"])
# print(table.filter())


def iter_arrow(tbl):
    for batch in tbl.to_batches():
        dict_batch = batch.to_pydict()
        for index in range(len(batch)):
            yield {k: v[index] for k, v in dict_batch.items()}


# , filter=([('user_verified', '=', True)]))
# table = pq.read_table(fn + '.parquet')
Example #27
0
 def read_to_object(self, f: IOBase):
     at = pa_json.read_json(f.name)
     return at
Example #28
0
#! /usr/bin/env nix-shell
#! nix-shell -i python3 -p "python3.withPackages(ps: [ ps.pandas ps.numpy ps.pyarrow ])"

# nix-shell -p "python3.withPackages(ps: [ ps.pandas ps.numpy ps.pyarrow ps.yapf])"

from pyarrow import json as paj
import json as js
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

print("Start ....")

x = np.random.randn(10000)

df = pd.DataFrame({'one': x, 'two': x + 5, 'three': x > 0.1}, )

table = pa.Table.from_pandas(df)
pq.write_table(table, 'example.parquet')

with open("exemplary.json", "w") as f:
    js.dump(dict(x=list(x), y=[dict(a=1, b=5, c=False) for i in list(x)]), f)

table = paj.read_json(f"exemplary.json")
pq.write_table(table, 'example_nested.parquet')
print("Stop ....")
Example #29
0
 def from_jsonl(cls, path: Pathlike) -> 'LazyDict':
     _check_arrow()
     import pyarrow.json as paj
     table = paj.read_json(str(path))
     return cls(table)
Example #30
0
def test_json_file_to_arrow(f):
    return pa_json.read_json(f)