Python ParseOptions Examples

Programming Language: Python

Namespace/Package Name: pyarrow.json

Method/Function: ParseOptions

Examples at hotexamples.com: 5

Python ParseOptions - 5 examples found. These are the top-rated real-world Python examples of pyarrow.json.ParseOptions extracted from open-source projects. You can rate examples to help us improve their quality.

Example #1

Show file

File: json.py Project: xwild/nlp

class JsonConfig(nlp.BuilderConfig):
    """BuilderConfig for JSON.

    Holds the reader and parser option objects used for JSON input and
    exposes each of them through a getter-only ``pa_*`` property.
    (``paj`` is presumably ``pyarrow.json`` -- confirm against the imports.)
    """
    # NOTE(review): both defaults below are instantiated once, when the class
    # body is executed, so every config that does not override them appears
    # to share the same option objects -- confirm this is intended.
    read_options: paj.ReadOptions = paj.ReadOptions()
    parse_options: paj.ParseOptions = paj.ParseOptions()

    @property
    def pa_read_options(self):
        """Return the stored ``read_options`` object unchanged."""
        return self.read_options

    @property
    def pa_parse_options(self):
        """Return the stored ``parse_options`` object unchanged."""
        return self.parse_options

Example #2

Show file

def convert_ndjsons_to_parquet(files: List[Path], file_name: str,
                               out_dir: Union[Path,
                                              str], schema: pa.Schema) -> Path:
    """Write the given JSON files into one parquet file and return its path.

    Args:
        files: Input files, read one at a time with ``pa_json.read_json``.
            Each file is removed right after its table has been written.
        file_name: Name of the output file, without the ``.parquet`` suffix.
        out_dir: Directory the parquet file is written to.
        schema: Schema used for both parsing and writing. When falsy, the
            schema inferred from the first input file is used instead.

    Returns:
        Path of the parquet file that was written.
    """
    target = Path(f"{out_dir}/{file_name}.parquet")

    # Fall back to the schema of the first file when none was supplied.
    effective_schema = (
        schema if schema else pa_json.read_json(files[0]).schema
    )

    with pq.ParquetWriter(target, effective_schema) as parquet_writer:
        # One options object is shared by every read so that all files are
        # parsed against the same explicit schema.
        options = pa_json.ParseOptions(explicit_schema=effective_schema)
        for ndjson_path in files:
            logger.debug(f"Processing {ndjson_path}")
            parquet_writer.write_table(
                pa_json.read_json(ndjson_path, parse_options=options))
            # The source file is deleted once its rows have been written.
            remove(ndjson_path)

    return target

Example #3

Show file

File: json.py Project: yngtodd/datasets

 def pa_parse_options(self):
     """Build parse options from this object's schema and newline setting.

     Passes ``self.schema`` as ``explicit_schema`` and
     ``self.newlines_in_values`` as ``newlines_in_values``.
     """
     option_kwargs = {
         "explicit_schema": self.schema,
         "newlines_in_values": self.newlines_in_values,
     }
     return paj.ParseOptions(**option_kwargs)

Example #4

Show file

 def pa_parse_options(self):
     """Build parse options carrying this object's ``newlines_in_values``."""
     allow_newlines = self.newlines_in_values
     return paj.ParseOptions(newlines_in_values=allow_newlines)

Example #5

Show file

def _parse_data_to_pandas(filepath: str, table_params: dict, metadata: dict):
    """
    Read the data at the given filepath and return it as a dataframe.

    Args:
        filepath: Location of the file. Paths starting with ``s3://`` are
            read through an S3 filesystem; anything else is read from the
            local filesystem.
        table_params: Per-table settings. The keys read here are
            ``expect-header`` (defaults to True), ``row-limit``,
            ``headers-ignore-case`` and ``only-test-cols-in-metadata``.
        metadata: Table metadata. ``file_format`` and ``columns`` are
            required; ``partitions`` is optional and lists column names that
            are excluded from the expected file columns.

    Returns:
        The loaded dataframe, after the optional row sampling, header
        lower-casing and column filtering requested in ``table_params``.

    Raises:
        ValueError: If ``metadata["file_format"]`` does not contain ``csv``,
            ``json`` or ``parquet``.
    """

    file_format = metadata["file_format"]
    partitions = metadata.get("partitions", [])
    meta_col_names = [
        c["name"] for c in metadata["columns"] if c["name"] not in partitions
    ]

    # For string-based file types, make the arrow readers read date/time
    # columns in as strings. Validators will still treat these as dates but
    # will run validation against strings, expecting the values to match a
    # timestamp format.
    if "json" in file_format or "csv" in file_format:
        md_obj = Metadata.from_dict(metadata)
        cols = md_obj.columns

        cols_to_force_str_read_in = []
        for c in cols:
            if c["type"].startswith(("time", "date")):
                c["type"] = "string"
                c["type_category"] = "string"
                cols_to_force_str_read_in.append(c["name"])

        md_obj.columns = cols
        ac = ArrowConverter()
        arrow_schema = ac.generate_from_meta(md_obj)

        # Sub-schema holding only the columns that are forced to string.
        # It stays empty (and falsy) when there are no such columns.
        ts_as_str_schema = pa.schema([])
        for cname in cols_to_force_str_read_in:
            ts_as_str_schema = ts_as_str_schema.append(
                arrow_schema.field(cname))

    # Set the reader type
    if filepath.startswith("s3://"):
        reader_fs = fs.S3FileSystem(region="eu-west-1")
        # The S3 filesystem is given the path without the scheme prefix.
        fp_for_file_reader = filepath.replace("s3://", "", 1)
    else:
        reader_fs = fs.LocalFileSystem()
        fp_for_file_reader = filepath

    with reader_fs.open_input_stream(fp_for_file_reader) as f:
        if "csv" in file_format:

            # Safer CSV load for newlines_in_values set to True
            if table_params.get("expect-header", True):
                po = csv.ParseOptions(newlines_in_values=True)
            else:
                # No header row in the file: supply the names from metadata.
                po = csv.ParseOptions(newlines_in_values=True,
                                      column_names=meta_col_names)

            if ts_as_str_schema:
                co = csv.ConvertOptions(column_types=ts_as_str_schema)
            else:
                co = None

            df = pa_read_csv_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
                convert_options=co,
            )
            # dates/datetimes == string

        elif "json" in file_format:

            po = json.ParseOptions(
                newlines_in_values=True,
                explicit_schema=ts_as_str_schema if ts_as_str_schema else None,
            )

            df = pa_read_json_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
            )
            # dates/datetimes == string

        elif "parquet" in file_format:
            df = arrow_to_pandas(pq.read_table(f))
            # dates/datetimes == datetime / date

        else:
            raise ValueError(
                f"Unknown file_format in metadata: {metadata['file_format']}.")

    row_limit = table_params.get("row-limit")
    if row_limit:
        # DataFrame.sample draws without replacement and raises ValueError
        # when asked for more rows than exist, so cap the sample size at the
        # number of rows actually loaded.
        df = df.sample(min(row_limit, len(df)))

    if table_params.get("headers-ignore-case"):
        df.columns = [c.lower() for c in df.columns]

    if table_params.get("only-test-cols-in-metadata", False):
        keep_cols = [c for c in df.columns if c in meta_col_names]
        df = df[keep_cols]

    return df