Example #1
0
File: json.py Project: xwild/nlp
class JsonConfig(nlp.BuilderConfig):
    """BuilderConfig for JSON.

    Carries the reader and parser option objects used when loading JSON and
    exposes them through the ``pa_read_options`` / ``pa_parse_options``
    properties.
    """
    # Reader options. The default instance is built once, when the class body
    # is evaluated. NOTE(review): `paj` is presumably `pyarrow.json` -- confirm.
    read_options: paj.ReadOptions = paj.ReadOptions()
    # Parser options; the default is likewise built once at class definition.
    parse_options: paj.ParseOptions = paj.ParseOptions()

    @property
    def pa_read_options(self):
        """Return the configured ``read_options`` unchanged."""
        return self.read_options

    @property
    def pa_parse_options(self):
        """Return the configured ``parse_options`` unchanged."""
        return self.parse_options
Example #2
0
def convert_ndjsons_to_parquet(files: List[Path], file_name: str,
                               out_dir: Union[Path, str],
                               schema: Union[pa.Schema, None]) -> Path:
    """Merge newline-delimited JSON files into one Parquet file, then delete the inputs.

    Args:
        files: Input JSON files; they are written to the output in this order.
        file_name: Output file name without the ``.parquet`` suffix.
        out_dir: Directory the Parquet file is written into.
        schema: Schema used both to parse every input and to create the
            output file. When falsy (e.g. ``None``) it is taken from the
            result of reading ``files[0]``.

    Returns:
        Path of the written Parquet file.

    Raises:
        IndexError: If ``files`` is empty and no schema was supplied.
    """
    pq_file = Path(f"{out_dir}/{file_name}.parquet")
    if not schema:
        schema = pa_json.read_json(files[0]).schema
    parse_options = pa_json.ParseOptions(explicit_schema=schema)

    with pq.ParquetWriter(pq_file, schema) as writer:
        for f in files:
            logger.debug(f"Processing {f}")
            table = pa_json.read_json(f, parse_options=parse_options)
            writer.write_table(table)

    # Delete the inputs only after the writer has closed successfully. If any
    # file fails to read or write, the exception propagates before this point
    # and every source file is still on disk, so the conversion can be rerun
    # without data loss.
    for f in files:
        remove(f)
    return pq_file
Example #3
0
 def pa_parse_options(self):
     """Build JSON parse options from this object's ``schema`` and ``newlines_in_values``."""
     parse_kwargs = {
         "explicit_schema": self.schema,
         "newlines_in_values": self.newlines_in_values,
     }
     return paj.ParseOptions(**parse_kwargs)
Example #4
0
 def pa_parse_options(self):
     """Build JSON parse options carrying this object's ``newlines_in_values`` setting."""
     allow_newlines = self.newlines_in_values
     return paj.ParseOptions(newlines_in_values=allow_newlines)
Example #5
0
def _parse_data_to_pandas(filepath: str, table_params: dict, metadata: dict):
    """
    Read the data at ``filepath`` and return it as a dataframe.

    Args:
        filepath: Location of the file. Paths starting with ``s3://`` are
            opened through an S3 filesystem, anything else through the local
            filesystem.
        table_params: Options for the table. Keys read here:
            ``expect-header`` (default True), ``row-limit``,
            ``headers-ignore-case`` and ``only-test-cols-in-metadata``.
        metadata: Table metadata. Must contain ``file_format`` and
            ``columns`` (each with a ``name``); ``partitions`` is optional.

    Returns:
        The dataframe read from the file, after the optional row sampling,
        header lower-casing and column filtering have been applied.

    Raises:
        ValueError: If ``file_format`` contains none of ``csv``, ``json`` or
            ``parquet``.
    """
    file_format = metadata["file_format"]
    partitions = metadata.get("partitions", [])
    meta_col_names = [
        c["name"] for c in metadata["columns"] if c["name"] not in partitions
    ]

    # For text-based formats (csv / json), make the arrow readers load
    # date/time columns as plain strings. Validators still treat those columns
    # as dates, but validate the string values against the expected
    # timestamp format.
    if "json" in file_format or "csv" in file_format:
        md_obj = Metadata.from_dict(metadata)
        cols = md_obj.columns

        cols_to_force_str_read_in = []
        for c in cols:
            if c["type"].startswith(("time", "date")):
                c["type"] = "string"
                c["type_category"] = "string"
                cols_to_force_str_read_in.append(c["name"])

        md_obj.columns = cols
        arrow_schema = ArrowConverter().generate_from_meta(md_obj)

        # Sub-schema holding only the columns that were forced to string.
        ts_as_str_schema = pa.schema(
            [arrow_schema.field(cname) for cname in cols_to_force_str_read_in]
        )

    # Pick the filesystem used to open the file.
    if filepath.startswith("s3://"):
        reader_fs = fs.S3FileSystem(region="eu-west-1")
        # The S3 filesystem is given the path without the scheme prefix.
        fp_for_file_reader = filepath.replace("s3://", "", 1)
    else:
        reader_fs = fs.LocalFileSystem()
        fp_for_file_reader = filepath

    with reader_fs.open_input_stream(fp_for_file_reader) as f:
        if "csv" in file_format:
            # newlines_in_values=True is the safer way to load CSVs whose
            # values may contain line breaks.
            if table_params.get("expect-header", True):
                po = csv.ParseOptions(newlines_in_values=True)
            else:
                # No header row in the file: take column names from metadata.
                po = csv.ParseOptions(newlines_in_values=True,
                                      column_names=meta_col_names)

            # An empty schema is falsy, so conversion options are only built
            # when at least one column was forced to string.
            if ts_as_str_schema:
                co = csv.ConvertOptions(column_types=ts_as_str_schema)
            else:
                co = None

            # Dates/datetimes come back as strings.
            df = pa_read_csv_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
                convert_options=co,
            )

        elif "json" in file_format:
            po = json.ParseOptions(
                newlines_in_values=True,
                explicit_schema=ts_as_str_schema if ts_as_str_schema else None,
            )

            # Dates/datetimes come back as strings.
            df = pa_read_json_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
            )

        elif "parquet" in file_format:
            # Dates/datetimes keep their datetime / date types.
            df = arrow_to_pandas(pq.read_table(f))

        else:
            raise ValueError(
                f"Unknown file_format in metadata: {file_format}.")

    row_limit = table_params.get("row-limit")
    if row_limit:
        # DataFrame.sample raises when asked for more rows than exist
        # (sampling without replacement), so cap the request at the number of
        # rows actually read.
        df = df.sample(min(row_limit, len(df)))

    if table_params.get("headers-ignore-case"):
        df.columns = [c.lower() for c in df.columns]

    if table_params.get("only-test-cols-in-metadata", False):
        # NOTE(review): meta_col_names is not lower-cased here, so combining
        # this with headers-ignore-case assumes lower-case metadata names --
        # confirm.
        keep_cols = [c for c in df.columns if c in meta_col_names]
        df = df[keep_cols]

    return df