def read(
    self, input_path: str, metadata: Metadata = None, **kwargs
) -> pd.DataFrame:
    """Reads a Parquet file and returns a Pandas DataFrame.

    Args:
        input_path (str): path of the file to read, either local or S3.
        metadata (Metadata, optional): a Metadata object or dict. If given,
            the Arrow table is cast to the schema it describes.
        **kwargs (optional): additional kwargs are passed to the Arrow reader
            pyarrow.parquet.read_table.
    """
    arrow_tab = pq.read_table(input_path, **kwargs)
    if metadata:
        meta = validate_and_enrich_metadata(metadata)
        schema = ArrowConverter().generate_from_meta(meta)
        arrow_tab = cast_arrow_table_to_schema(
            arrow_tab,
            schema=schema,
            expect_full_schema=self.expect_full_schema,
        )
    df = arrow_to_pandas(
        arrow_tab,
        pd_boolean=self.pd_boolean,
        pd_integer=self.pd_integer,
        pd_string=self.pd_string,
        pd_date_type=self.pd_date_type,
        pd_timestamp_type=self.pd_timestamp_type,
    )
    return df
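
# Usage sketch (illustrative, not from the source): `read` is an instance
# method, so the reader class it belongs to must be instantiated first.
# `ParquetReader` is an assumed name for that class, and the S3 path is
# hypothetical; the pd_* conversion settings come from the instance itself.
def _example_reader_read():
    reader = ParquetReader()  # hypothetical: substitute the actual reader class
    df = reader.read("s3://my-bucket/tables/example.parquet")
    return df
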
def pa_read_json_to_pandas(
    input_file: Union[IO, str],
    schema: Union[pa.Schema, Metadata, dict] = None,
    expect_full_schema: bool = True,
    pd_boolean: bool = True,
    pd_integer: bool = True,
    pd_string: bool = True,
    pd_date_type: str = "datetime_object",
    pd_timestamp_type: str = "datetime_object",
    **kwargs,
):
    """Read a jsonlines file into an Arrow table and convert it to a Pandas DataFrame.

    Args:
        input_file (Union[IO, str]): the JSONL file you want to read, as a
            string path or file-like object.
        schema (Union[pa.Schema, Metadata, dict], optional): schema with the
            expected columns. If unset, pyarrow will infer datatypes.
        expect_full_schema (bool, optional): if True, the pyarrow reader will
            expect the input schema to have fields for every column in the
            input file. If False, only the columns listed in the schema are
            cast, leaving all other columns at their default type on read.
        pd_boolean (bool, optional): if True, converts booleans to the Pandas
            BooleanDtype. If False, leaves them in the Pandas default bool
            format.
        pd_integer (bool, optional): if True, converts integers to the Pandas
            Int64Dtype. If False, uses float64.
        pd_string (bool, optional): if True, converts strings to the Pandas
            StringDtype. If False, leaves them in the Pandas default object
            format.
        pd_date_type (str, optional): specifies the date type. Can be one of:
            "datetime_object", "pd_timestamp" or "pd_period".
        pd_timestamp_type (str, optional): specifies the datetime type. Can be
            one of: "datetime_object", "pd_timestamp" or "pd_period".
        **kwargs (optional): additional kwargs are passed to
            pyarrow.json.read_json.

    Returns:
        Pandas DataFrame: the jsonl data as a dataframe, with the specified
        data types.
    """
    arrow_table = pa_read_json(input_file, schema, expect_full_schema, **kwargs)
    df = arrow_to_pandas(
        arrow_table,
        pd_boolean=pd_boolean,
        pd_integer=pd_integer,
        pd_string=pd_string,
        pd_date_type=pd_date_type,
        pd_timestamp_type=pd_timestamp_type,
    )
    return df
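
# A minimal usage sketch (the "events.jsonl" path and the column names are
# illustrative only; `pa` is pyarrow, imported at module level). It shows a
# partial schema combined with expect_full_schema=False, so only the two
# named columns are cast and any others keep their inferred types.
def _example_pa_read_json_to_pandas():
    schema = pa.schema(
        [
            ("user_id", pa.int64()),
            ("created_at", pa.timestamp("ms")),
        ]
    )
    df = pa_read_json_to_pandas(
        "events.jsonl",
        schema=schema,
        expect_full_schema=False,
        pd_timestamp_type="pd_timestamp",  # timestamps as pd.Timestamp
    )
    return df
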
def pa_read_csv_to_pandas(
    input_file: Union[IO, str],
    schema: Union[pa.Schema, Metadata, dict] = None,
    expect_full_schema: bool = True,
    pd_boolean: bool = True,
    pd_integer: bool = True,
    pd_string: bool = True,
    pd_date_type: str = "datetime_object",
    pd_timestamp_type: str = "datetime_object",
    **kwargs,
):
    """Read a csv file into an Arrow table and convert it to a Pandas DataFrame.

    Args:
        input_file (Union[IO, str]): the CSV file you want to read, as a
            string path or file-like object.
        schema (Union[pa.Schema, Metadata, dict], optional): schema with the
            expected columns. If unset, pyarrow will infer datatypes.
        expect_full_schema (bool, optional): if True, the pyarrow reader will
            expect the input schema to have fields for every column in the
            input file. If False, only the columns listed in the schema are
            cast, leaving all other columns at their default type on read.
        pd_boolean (bool, optional): if True, converts booleans to the Pandas
            BooleanDtype. If False, leaves them in the Pandas default bool
            format. Defaults to True.
        pd_integer (bool, optional): if True, converts integers to the Pandas
            Int64Dtype. If False, uses float64. Defaults to True.
        pd_string (bool, optional): if True, converts strings to the Pandas
            StringDtype. If False, leaves them in the Pandas default object
            format. Defaults to True.
        pd_date_type (str, optional): specifies the date type. Can be one of:
            "datetime_object", "pd_timestamp" or "pd_period".
        pd_timestamp_type (str, optional): specifies the datetime type. Can be
            one of: "datetime_object", "pd_timestamp" or "pd_period".
        **kwargs (optional): additional kwargs are passed to
            pyarrow.csv.read_csv.

    Returns:
        Pandas DataFrame: the csv data as a dataframe, with the specified
        data types.
    """
    arrow_table = pa_read_csv(input_file, schema, expect_full_schema, **kwargs)
    df = arrow_to_pandas(
        arrow_table,
        pd_boolean=pd_boolean,
        pd_integer=pd_integer,
        pd_string=pd_string,
        pd_date_type=pd_date_type,
        pd_timestamp_type=pd_timestamp_type,
    )
    return df
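
# A minimal usage sketch (the "prices.csv" path is illustrative). The
# parse_options kwarg demonstrates extra kwargs being forwarded to
# pyarrow.csv.read_csv, which is also how _parse_data_to_pandas below
# calls this function.
def _example_pa_read_csv_to_pandas():
    from pyarrow import csv as pa_csv

    df = pa_read_csv_to_pandas(
        "prices.csv",
        pd_integer=True,  # ints -> Int64Dtype rather than float64
        parse_options=pa_csv.ParseOptions(delimiter=";"),
    )
    return df
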
def pa_read_parquet_to_pandas(
    input_file: str,
    schema: Union[pa.Schema, Metadata, dict] = None,
    expect_full_schema: bool = True,
    pd_boolean: bool = True,
    pd_integer: bool = True,
    pd_string: bool = True,
    pd_date_type: str = "datetime_object",
    pd_timestamp_type: str = "datetime_object",
    **kwargs,
):
    """Read a parquet file into a Pandas DataFrame with various type casting options.

    Args:
        input_file (str): path (S3 or local) to the parquet file to read in.
        schema (Union[pa.Schema, Metadata, dict], optional): schema to cast
            the data to. Defaults to None, in which case pyarrow will infer
            datatypes.
        expect_full_schema (bool, optional): if True, the pyarrow reader will
            expect the input schema to have fields for every column in the
            input file. If False, only the columns listed in the schema are
            cast. Defaults to True.
        pd_boolean (bool, optional): if True, converts booleans to the Pandas
            BooleanDtype. Defaults to True.
        pd_integer (bool, optional): if True, converts integers to the Pandas
            Int64Dtype. If False, uses float64. Defaults to True.
        pd_string (bool, optional): if True, converts strings to the Pandas
            StringDtype. Defaults to True.
        pd_date_type (str, optional): specifies the date type. Can be one of:
            "datetime_object", "pd_timestamp" or "pd_period". Defaults to
            "datetime_object".
        pd_timestamp_type (str, optional): specifies the datetime type. Can be
            one of: "datetime_object", "pd_timestamp" or "pd_period".
            Defaults to "datetime_object".
        **kwargs (optional): additional kwargs are passed to
            pyarrow.parquet.read_table.

    Returns:
        Pandas DataFrame: the parquet data as a dataframe, with the specified
        data types.
    """
    if not isinstance(input_file, str):
        raise TypeError("currently only supports string paths for input")
    arrow_table = pa_read_parquet(input_file, schema, expect_full_schema, **kwargs)
    df = arrow_to_pandas(
        arrow_table,
        pd_boolean=pd_boolean,
        pd_integer=pd_integer,
        pd_string=pd_string,
        pd_date_type=pd_date_type,
        pd_timestamp_type=pd_timestamp_type,
    )
    return df
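
# A minimal usage sketch (the S3 path and column names are illustrative).
# The `columns` kwarg demonstrates a kwarg forwarded to
# pyarrow.parquet.read_table to read only a subset of columns.
def _example_pa_read_parquet_to_pandas():
    df = pa_read_parquet_to_pandas(
        "s3://my-bucket/tables/example.parquet",
        pd_date_type="pd_period",  # dates as pd.Period
        columns=["id", "event_date"],
    )
    return df
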
def _parse_data_to_pandas(filepath: str, table_params: dict, metadata: dict):
    """Reads in the data from the given filepath and returns a dataframe."""

    meta_col_names = [
        c["name"]
        for c in metadata["columns"]
        if c["name"] not in metadata.get("partitions", [])
    ]

    # For string-based file types, make the Arrow readers read date/timestamp
    # columns in as strings. Validators will still treat these as dates but
    # will run validation against strings, expecting the values to match a
    # timestamp format.
    if "json" in metadata["file_format"] or "csv" in metadata["file_format"]:
        md_obj = Metadata.from_dict(metadata)
        cols = md_obj.columns

        cols_to_force_str_read_in = []
        for c in cols:
            if c["type"].startswith("time") or c["type"].startswith("date"):
                c["type"] = "string"
                c["type_category"] = "string"
                cols_to_force_str_read_in.append(c["name"])

        md_obj.columns = cols
        ac = ArrowConverter()
        arrow_schema = ac.generate_from_meta(md_obj)

        ts_as_str_schema = pa.schema([])
        for cname in cols_to_force_str_read_in:
            ts_as_str_schema = ts_as_str_schema.append(arrow_schema.field(cname))

    # Set the reader filesystem
    if filepath.startswith("s3://"):
        reader_fs = fs.S3FileSystem(region="eu-west-1")
        fp_for_file_reader = filepath.replace("s3://", "", 1)
    else:
        reader_fs = fs.LocalFileSystem()
        fp_for_file_reader = filepath

    with reader_fs.open_input_stream(fp_for_file_reader) as f:
        if "csv" in metadata["file_format"]:
            # Safer CSV load: newlines_in_values set to True
            po = csv.ParseOptions(newlines_in_values=True)
            if table_params.get("expect-header", True):
                ro = None
            else:
                # column_names is a pyarrow.csv.ReadOptions field
                # (ParseOptions has no such option)
                ro = csv.ReadOptions(column_names=meta_col_names)

            if ts_as_str_schema:
                co = csv.ConvertOptions(column_types=ts_as_str_schema)
            else:
                co = None

            df = pa_read_csv_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                read_options=ro,
                parse_options=po,
                convert_options=co,
            )
            # dates/datetimes == string

        elif "json" in metadata["file_format"]:
            po = json.ParseOptions(
                newlines_in_values=True,
                explicit_schema=ts_as_str_schema if ts_as_str_schema else None,
            )
            df = pa_read_json_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
            )
            # dates/datetimes == string

        elif "parquet" in metadata["file_format"]:
            df = arrow_to_pandas(pq.read_table(f))
            # dates/datetimes == datetime / date

        else:
            raise ValueError(
                f"Unknown file_format in metadata: {metadata['file_format']}."
            )

    if table_params.get("row-limit"):
        df = df.sample(table_params.get("row-limit"))

    if table_params.get("headers-ignore-case"):
        df_cols = [c.lower() for c in df.columns]
        df.columns = df_cols

    if table_params.get("only-test-cols-in-metadata", False):
        keep_cols = [c for c in df.columns if c in meta_col_names]
        df = df[keep_cols]

    return df
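
# Illustrative call (a sketch only): the metadata dict below is a minimal
# example of the shape this helper reads from ("file_format", "columns",
# optional "partitions"); a real dict may need further keys required by
# Metadata.from_dict (the "name" key is an assumption). Paths, table_params
# keys, and column names are hypothetical.
def _example_parse_data_to_pandas():
    metadata = {
        "name": "example_table",  # assumed key; depends on the Metadata spec
        "file_format": "csv",
        "columns": [
            {"name": "id", "type": "int64", "type_category": "integer"},
            {"name": "event_date", "type": "date64", "type_category": "timestamp"},
        ],
    }
    # event_date starts with "date", so it is read in as a string for the
    # validators to check against a timestamp format
    table_params = {"expect-header": True, "headers-ignore-case": True}
    return _parse_data_to_pandas("data/example.csv", table_params, metadata)
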