def read(
        self, input_path: str, metadata: Metadata = None, **kwargs
    ) -> pd.DataFrame:
        """
        Reads a Parquet file and returns a Pandas DataFrame
        input_path: File to read either local or S3.
        metadata: A metadata object or dict
        **kwargs (optional): Additional kwargs are passed to the arrow reader
            arrow.parquet.read_table
        """

        arrow_tab = pq.read_table(input_path, **kwargs)

        if metadata:
            meta = validate_and_enrich_metadata(metadata)
            schema = ArrowConverter().generate_from_meta(meta)
            arrow_tab = cast_arrow_table_to_schema(
                arrow_tab,
                schema=schema,
                expect_full_schema=self.expect_full_schema,
            )

        df = arrow_to_pandas(
            arrow_tab,
            pd_boolean=self.pd_boolean,
            pd_integer=self.pd_integer,
            pd_string=self.pd_string,
            pd_date_type=self.pd_date_type,
            pd_timestamp_type=self.pd_timestamp_type,
        )

        return df
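
A minimal usage sketch for the method above. The enclosing reader class is not
shown here, so the ParquetReader name, its attribute defaults, and the paths
below are assumptions:

reader = ParquetReader()  # hypothetical name for the enclosing class
reader.expect_full_schema = False  # cast only columns present in the metadata
df = reader.read(
    "s3://my-bucket/data/table.parquet",  # hypothetical path; local paths also work
    metadata={
        "name": "table",  # minimal metadata dict; the real spec may need more keys
        "columns": [{"name": "id", "type": "int64"}],
    },
)
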
def pa_read_json_to_pandas(
    input_file: Union[IO, str],
    schema: Union[pa.Schema, Metadata, dict] = None,
    expect_full_schema: bool = True,
    pd_boolean: bool = True,
    pd_integer: bool = True,
    pd_string: bool = True,
    pd_date_type: str = "datetime_object",
    pd_timestamp_type: str = "datetime_object",
    **kwargs,
):
    """Read a jsonlines file into an Arrow table and convert it to a Pandas DataFrame.
    Args:
        input_file (Union[IO, str]): the JSONL file you want to read; a string
            path or file-like object.
        schema (Union[pa.Schema, Metadata, dict], optional): the schema to cast
            the data to. If unset, pyarrow will infer the datatypes.
        expect_full_schema (bool, optional): if True, the pyarrow reader will
            expect the input schema to have fields for every column in the
            input file. If False, only the columns listed in the schema are
            cast, leaving all other columns at their default type on read.
        pd_boolean (bool, optional): if True, converts booleans to the Pandas
            BooleanDtype. If False, leaves them in the Pandas default bool
            format.
        pd_integer (bool, optional): if True, converts integers to the Pandas
            Int64Dtype. If False, uses float64.
        pd_string (bool, optional): if True, converts strings to the Pandas
            StringDtype. If False, leaves them in the Pandas default object
            format.
        pd_date_type (str, optional): specifies the date type. Can be one of:
            "datetime_object", "pd_timestamp" or "pd_period".
        pd_timestamp_type (str, optional): specifies the datetime type. Can be one of:
            "datetime_object", "pd_timestamp" or "pd_period".
        **kwargs (optional): Additional kwargs are passed to pyarrow.json.read_json
    Returns:
        Pandas DataFrame: the jsonl data as a dataframe, with the specified data types
    """
    arrow_table = pa_read_json(input_file, schema, expect_full_schema,
                               **kwargs)

    df = arrow_to_pandas(
        arrow_table,
        pd_boolean=pd_boolean,
        pd_integer=pd_integer,
        pd_string=pd_string,
        pd_date_type=pd_date_type,
        pd_timestamp_type=pd_timestamp_type,
    )

    return df
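
A hedged usage sketch for pa_read_json_to_pandas. The file path and the
two-column schema are hypothetical, for illustration only:

import pyarrow as pa

# Cast only the two columns below; let pyarrow handle any others on read.
jsonl_schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
df = pa_read_json_to_pandas(
    "data.jsonl",              # hypothetical local path
    schema=jsonl_schema,
    expect_full_schema=False,
    pd_string=True,            # strings come back as Pandas StringDtype
)
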
def pa_read_csv_to_pandas(
    input_file: Union[IO, str],
    schema: Union[pa.Schema, Metadata, dict] = None,
    expect_full_schema: bool = True,
    pd_boolean: bool = True,
    pd_integer: bool = True,
    pd_string: bool = True,
    pd_date_type: str = "datetime_object",
    pd_timestamp_type: str = "datetime_object",
    **kwargs,
):
    """Read a csv file into an Arrow table and convert it to a Pandas DataFrame.
    Args:
        input_file (Union[IO, str]): the CSV file you want to read; a string
            path or file-like object.
        schema (Union[pa.Schema, Metadata, dict], optional): the schema to cast
            the data to. If unset, pyarrow will infer the datatypes.
        expect_full_schema (bool, optional): if True, the pyarrow reader will
            expect the input schema to have fields for every column in the
            input file. If False, only the columns listed in the schema are
            cast, leaving all other columns at their default type on read.
        pd_boolean (bool, optional): if True, converts booleans to the Pandas
            BooleanDtype. If False, uses a custom boolean format to coerce the
            object type. Defaults to True.
        pd_integer (bool, optional): if True, converts integers to the Pandas
            Int64Dtype. If False, uses float64. Defaults to True.
        pd_string (bool, optional): if True, converts strings to the Pandas
            StringDtype. If False, leaves them in the Pandas default object
            format. Defaults to True.
        pd_date_type (str, optional): specifies the date type. Can be one of:
            "datetime_object", "pd_timestamp" or "pd_period".
        pd_timestamp_type (str, optional): specifies the datetime type. Can be one of:
            "datetime_object", "pd_timestamp" or "pd_period".
        **kwargs (optional): Additional kwargs are passed to pyarrow.csv.read_csv
    Returns:
        Pandas DataFrame: the csv data as a dataframe, with the specified data types
    """
    arrow_table = pa_read_csv(input_file, schema, expect_full_schema, **kwargs)

    df = arrow_to_pandas(
        arrow_table,
        pd_boolean=pd_boolean,
        pd_integer=pd_integer,
        pd_string=pd_string,
        pd_date_type=pd_date_type,
        pd_timestamp_type=pd_timestamp_type,
    )

    return df
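
A hedged usage sketch for pa_read_csv_to_pandas. The parse_options kwarg is
forwarded to pyarrow.csv.read_csv, as in the validator example further down;
the file path is hypothetical:

from pyarrow import csv

# Allow embedded newlines in quoted values, as the validator example does.
po = csv.ParseOptions(newlines_in_values=True)
df = pa_read_csv_to_pandas(
    "data.csv",               # hypothetical local path
    expect_full_schema=False,
    parse_options=po,
)
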
def pa_read_parquet_to_pandas(
    input_file: str,
    schema: Union[pa.Schema, Metadata, dict] = None,
    expect_full_schema: bool = True,
    pd_boolean: bool = True,
    pd_integer: bool = True,
    pd_string: bool = True,
    pd_date_type: str = "datetime_object",
    pd_timestamp_type: str = "datetime_object",
    **kwargs,
):
    """
    reads a parquet file to pandas dataframe with various type casting options
    Args:
        input_file (str): path (s3 or local) to the parquet file to read in
        schema (pa.Schema, optional): schema to cast the data to. Defaults to None.
        expect_full_schema (bool, optional): expect full schema. Defaults to True.
        pd_boolean (bool, optional): [description]. Defaults to True.
        pd_integer (bool, optional): [description]. Defaults to True.
        pd_string (bool, optional): [description]. Defaults to True.
        pd_date_type (str, optional): [description]. Defaults to "datetime_object".
        pd_timestamp_type (str, optional): [description]. Defaults to "datetime_object".
        kwargs (optional) : kwargs to pass to pyarrow.parquet.read_table
    Returns:
        pandas dataframe: pandas dataframe of the given input data
    """

    if not isinstance(input_file, str):
        raise TypeError("currently only supports string paths for input")

    arrow_table = pa_read_parquet(input_file, schema, expect_full_schema,
                                  **kwargs)

    df = arrow_to_pandas(
        arrow_table,
        pd_boolean=pd_boolean,
        pd_integer=pd_integer,
        pd_string=pd_string,
        pd_date_type=pd_date_type,
        pd_timestamp_type=pd_timestamp_type,
    )

    return df
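
A hedged usage sketch for pa_read_parquet_to_pandas. Only string paths are
accepted (see the TypeError above); the S3 path is hypothetical:

df = pa_read_parquet_to_pandas(
    "s3://my-bucket/data/table.parquet",  # hypothetical path
    pd_date_type="pd_timestamp",  # dates as pandas Timestamps, not datetime objects
)
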
Example #5
def _parse_data_to_pandas(filepath: str, table_params: dict, metadata: dict):
    """
    Reads in the data from the given filepath and returns
    a dataframe
    """

    meta_col_names = [
        c["name"] for c in metadata["columns"]
        if c["name"] not in metadata.get("partitions", [])
    ]

    # For string-based file types, make the arrow readers read date/timestamp
    # columns in as strings. Validators will still treat these columns as
    # dates, but will run the validation against strings, expecting the
    # values to match a timestamp format.
    if "json" in metadata["file_format"] or "csv" in metadata["file_format"]:
        md_obj = Metadata.from_dict(metadata)
        cols = md_obj.columns

        cols_to_force_str_read_in = []
        for c in cols:
            if c["type"].startswith("time") or c["type"].startswith("date"):
                c["type"] = "string"
                c["type_category"] = "string"
                cols_to_force_str_read_in.append(c["name"])

        md_obj.columns = cols
        ac = ArrowConverter()
        arrow_schema = ac.generate_from_meta(md_obj)

        ts_as_str_schema = pa.schema([])
        for cname in cols_to_force_str_read_in:
            ts_as_str_schema = ts_as_str_schema.append(
                arrow_schema.field(cname))

    # Set the reader type
    if filepath.startswith("s3://"):
        reader_fs = fs.S3FileSystem(region="eu-west-1")
        fp_for_file_reader = filepath.replace("s3://", "", 1)

    else:
        reader_fs = fs.LocalFileSystem()
        fp_for_file_reader = filepath

    with reader_fs.open_input_stream(fp_for_file_reader) as f:
        if "csv" in metadata["file_format"]:

            # Safer CSV load: allow newlines in quoted values
            if table_params.get("expect-header", True):
                po = csv.ParseOptions(newlines_in_values=True)
            else:
                po = csv.ParseOptions(newlines_in_values=True,
                                      column_names=meta_col_names)

            if ts_as_str_schema:
                co = csv.ConvertOptions(column_types=ts_as_str_schema)
            else:
                co = None

            df = pa_read_csv_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
                convert_options=co,
            )
            # dates/datetimes == string

        elif "json" in metadata["file_format"]:

            po = json.ParseOptions(
                newlines_in_values=True,
                explicit_schema=ts_as_str_schema if ts_as_str_schema else None,
            )

            df = pa_read_json_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
            )
            # dates/datetimes == string

        elif "parquet" in metadata["file_format"]:
            df = arrow_to_pandas(pq.read_table(f))
            # dates/datetimes == datetime / date

        else:
            raise ValueError(
                f"Unknown file_format in metadata: {metadata['file_format']}.")

    if table_params.get("row-limit"):
        df = df.sample(table_params.get("row-limit"))

    if table_params.get("headers-ignore-case"):
        df_cols = [c.lower() for c in df.columns]
        df.columns = df_cols

    if table_params.get("only-test-cols-in-metadata", False):
        keep_cols = [c for c in df.columns if c in meta_col_names]
        df = df[keep_cols]

    return df
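
A hedged usage sketch for _parse_data_to_pandas, with minimal hypothetical
metadata and table_params dicts shaped like the keys this function reads;
the real Metadata spec may require further fields:

metadata = {
    "name": "table",  # hypothetical; included in case Metadata.from_dict needs it
    "file_format": "csv",
    "columns": [
        {"name": "id", "type": "int64", "type_category": "integer"},
        {"name": "created", "type": "timestamp(ms)", "type_category": "timestamp"},
    ],
}
table_params = {"expect-header": True, "headers-ignore-case": True}
df = _parse_data_to_pandas("s3://my-bucket/raw/table.csv", table_params, metadata)
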