def parse_green_taxi_csv(fobj):
    """
    Parse a binary file object of cleaned "green taxi" CSV data, as returned
    by the "read_green_taxi_csv" function, and return a PyArrow table.
    """
    convert_options = ConvertOptions(
        column_types=SCHEMA,
        false_values=['N'],
        null_values=[''],
        timestamp_parsers=['%Y-%m-%d %H:%M:%S'],
        true_values=['Y'],
    )
    parse_options = ParseOptions(quote_char=False)
    read_options = ReadOptions(
        column_names=SCHEMA.names,
        encoding=ENCODING,
    )
    return read_csv(
        fobj,
        convert_options=convert_options,
        parse_options=parse_options,
        read_options=read_options,
    )
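# Usage sketch for parse_green_taxi_csv above. SCHEMA and ENCODING are assumed
# to be module-level constants; the field names, types, and sample row below
# are illustrative only, not the real green-taxi schema.
import io

import pyarrow as pa
from pyarrow.csv import ConvertOptions, ParseOptions, ReadOptions, read_csv

ENCODING = 'utf-8'
SCHEMA = pa.schema([
    ('lpep_pickup_datetime', pa.timestamp('s')),
    ('store_and_fwd_flag', pa.bool_()),
    ('fare_amount', pa.float64()),
])

# read_csv accepts any binary file-like object, so an in-memory buffer can
# stand in for the object returned by read_green_taxi_csv.
sample = io.BytesIO(b'2021-01-01 00:15:30,N,12.5\n')
table = parse_green_taxi_csv(sample)
assert table.num_rows == 1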
def test_options_delimiter(self):
    rows = b"a;b,c\nde,fg;eh\n"
    table = self.read_bytes(rows)
    assert table.to_pydict() == {
        'a;b': [u'de'],
        'c': [u'fg;eh'],
    }
    opts = ParseOptions(delimiter=';')
    table = self.read_bytes(rows, parse_options=opts)
    assert table.to_pydict() == {
        'a': [u'de,fg'],
        'b,c': [u'eh'],
    }
def test_options_delimiter(self):
    rows = b"a;b,c\nde,fg;eh\n"
    reader = self.open_bytes(rows)
    expected_schema = pa.schema([('a;b', pa.string()),
                                 ('c', pa.string())])
    self.check_reader(reader, expected_schema,
                      [{'a;b': ['de'],
                        'c': ['fg;eh']}])

    opts = ParseOptions(delimiter=';')
    reader = self.open_bytes(rows, parse_options=opts)
    expected_schema = pa.schema([('a', pa.string()),
                                 ('b,c', pa.string())])
    self.check_reader(reader, expected_schema,
                      [{'a': ['de,fg'],
                        'b,c': ['eh']}])
def test_empty_lines(self):
    rows = b"a,b\n\r1,2\r\n\r\n3,4\r\n"
    table = self.read_bytes(rows)
    assert table.to_pydict() == {
        'a': [1, 3],
        'b': [2, 4],
    }
    parse_options = ParseOptions(ignore_empty_lines=False)
    table = self.read_bytes(rows, parse_options=parse_options)
    assert table.to_pydict() == {
        'a': [None, 1, None, 3],
        'b': [None, 2, None, 4],
    }
    read_options = ReadOptions(skip_rows=2)
    table = self.read_bytes(rows, parse_options=parse_options,
                            read_options=read_options)
    assert table.to_pydict() == {
        '1': [None, 3],
        '2': [None, 4],
    }
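# Stand-alone sketch of the skip_rows/ignore_empty_lines combination used in
# test_empty_lines above, calling pyarrow.csv.read_csv directly (assumption:
# self.read_bytes is a thin wrapper around such a call).
import io

from pyarrow.csv import ParseOptions, ReadOptions, read_csv

rows = b"a,b\n\r1,2\r\n\r\n3,4\r\n"
table = read_csv(
    io.BytesIO(rows),
    parse_options=ParseOptions(ignore_empty_lines=False),
    read_options=ReadOptions(skip_rows=2),
)
# The first two rows (the header and an empty line) are skipped, "1,2" then
# serves as the header, and the remaining empty line yields a row of nulls.
assert table.to_pydict() == {'1': [None, 3], '2': [None, 4]}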
def read(self, env: CylonEnv, table, relevant_cols=None, **kwargs) -> DataFrame:
    filepath = self.table_path_mapping[table].replace('$TABLE', table)
    names, _ = get_schema(table)

    # csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    #                                    .with_delimiter('|')
    read_opts = ReadOptions(column_names=names, block_size=(1 << 30))
    parse_opts = ParseOptions(delimiter='|')
    convert_opts = ConvertOptions(include_columns=relevant_cols)

    # if the table is in the REFRESH_TABLES list, read its refresh data too and concat
    # NOTE: refresh tables have the same parallelism as their data tables
    if table in REFRESH_TABLES:
        data_table = pa_read_csv(filepath, read_options=read_opts,
                                 parse_options=parse_opts,
                                 convert_options=convert_opts)
        refresh_path = filepath.replace('/data/', '/data_refresh/')
        refresh_table = pa_read_csv(refresh_path, read_options=read_opts,
                                    parse_options=parse_opts,
                                    convert_options=convert_opts)
        pa_table = pa_concat_tables([data_table, refresh_table])
    else:
        pa_table = pa_read_csv(filepath, read_options=read_opts,
                               parse_options=parse_opts,
                               convert_options=convert_opts)

    return DataFrame(Table.from_arrow(env.context, pa_table))
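# The concat step above in isolation: pa_concat_tables is assumed to alias
# pyarrow.concat_tables, which requires the tables to share a schema (they do
# here, since both reads use the same ReadOptions/ConvertOptions).
import pyarrow as pa

data_table = pa.table({'id': [1, 2], 'qty': [10, 20]})
refresh_table = pa.table({'id': [3], 'qty': [30]})
combined = pa.concat_tables([data_table, refresh_table])
assert combined.num_rows == 3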
def read_csv(
    cls,
    filepath_or_buffer,
    sep=",",
    delimiter=None,
    header="infer",
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    prefix=None,
    mangle_dupe_cols=True,
    dtype=None,
    engine=None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace=False,
    skiprows=None,
    nrows=None,
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    verbose=False,
    skip_blank_lines=True,
    parse_dates=False,
    infer_datetime_format=False,
    keep_date_col=False,
    date_parser=None,
    dayfirst=False,
    cache_dates=True,
    iterator=False,
    chunksize=None,
    compression="infer",
    thousands=None,
    decimal=b".",
    lineterminator=None,
    quotechar='"',
    quoting=0,
    escapechar=None,
    comment=None,
    encoding=None,
    dialect=None,
    error_bad_lines=True,
    warn_bad_lines=True,
    skipfooter=0,
    doublequote=True,
    delim_whitespace=False,
    low_memory=True,
    memory_map=False,
    float_precision=None,
    storage_options=None,
):
    items = locals().copy()
    mykwargs = {k: items[k] for k in items if k in cls.arg_keys}
    eng = str(engine).lower().strip()
    try:
        if eng in ["pandas", "c"]:
            return cls._read(**mykwargs)

        if isinstance(dtype, dict):
            column_types = {c: cls._dtype_to_arrow(t) for c, t in dtype.items()}
        else:
            column_types = cls._dtype_to_arrow(dtype)

        if (type(parse_dates) is list) and type(column_types) is dict:
            for c in parse_dates:
                column_types[c] = pa.timestamp("s")

        if names:
            if header == 0:
                skiprows = skiprows + 1 if skiprows is not None else 1
            elif header is None or header == "infer":
                pass
            else:
                raise NotImplementedError(
                    "read_csv with 'arrow' engine and provided 'names' parameter"
                    " supports only 0, None and 'infer' header values"
                )
        else:
            if header == 0 or header == "infer":
                pass
            else:
                raise NotImplementedError(
                    "read_csv with 'arrow' engine without 'names' parameter"
                    " provided supports only 0 and 'infer' header values"
                )

        if delimiter is None:
            delimiter = sep

        if delim_whitespace and delimiter != ",":
            raise ValueError(
                "Specified a delimiter and delim_whitespace=True; "
                "you can only specify one."
            )

        usecols_md = cls._prepare_pyarrow_usecols(mykwargs)

        po = ParseOptions(
            delimiter="\\s+" if delim_whitespace else delimiter,
            quote_char=quotechar,
            double_quote=doublequote,
            escape_char=escapechar,
            newlines_in_values=False,
            ignore_empty_lines=skip_blank_lines,
        )
        co = ConvertOptions(
            check_utf8=None,
            column_types=column_types,
            null_values=None,
            true_values=None,
            false_values=None,
            # timestamp fields should be handled as strings if parse_dates
            # wasn't passed explicitly as an array or a dict
            timestamp_parsers=[""] if isinstance(parse_dates, bool) else None,
            strings_can_be_null=None,
            include_columns=usecols_md,
            include_missing_columns=None,
            auto_dict_encode=None,
            auto_dict_max_cardinality=None,
        )
        ro = ReadOptions(
            use_threads=True,
            block_size=None,
            skip_rows=skiprows,
            column_names=names,
            autogenerate_column_names=None,
        )

        at = read_csv(
            filepath_or_buffer,
            read_options=ro,
            parse_options=po,
            convert_options=co,
        )

        return cls.from_arrow(at)
    except (pa.ArrowNotImplementedError, NotImplementedError):
        if eng in ["arrow"]:
            raise

        ErrorMessage.default_to_pandas("`read_csv`")
        return cls._read(**mykwargs)
def read_csv(
    cls,
    filepath_or_buffer,
    sep=",",
    delimiter=None,
    header="infer",
    names=lib.no_default,
    index_col=None,
    usecols=None,
    squeeze=False,
    prefix=lib.no_default,
    mangle_dupe_cols=True,
    dtype=None,
    engine=None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace=False,
    skiprows=None,
    nrows=None,
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    verbose=False,
    skip_blank_lines=True,
    parse_dates=False,
    infer_datetime_format=False,
    keep_date_col=False,
    date_parser=None,
    dayfirst=False,
    cache_dates=True,
    iterator=False,
    chunksize=None,
    compression="infer",
    thousands=None,
    decimal=".",
    lineterminator=None,
    quotechar='"',
    quoting=0,
    escapechar=None,
    comment=None,
    encoding=None,
    encoding_errors="strict",
    dialect=None,
    error_bad_lines=None,
    warn_bad_lines=None,
    on_bad_lines=None,
    skipfooter=0,
    doublequote=True,
    delim_whitespace=False,
    low_memory=True,
    memory_map=False,
    float_precision=None,
    storage_options=None,
):  # noqa: PR01
    """
    Read data from `filepath_or_buffer` according to the passed `kwargs` parameters.

    For a description of the parameters, please refer to the pandas API.

    Returns
    -------
    BaseQueryCompiler
        Query compiler with imported data for further processing.

    Notes
    -----
    Reading is performed using the `pyarrow.csv.read_csv` function.
    """
    items = locals().copy()
    mykwargs = {k: items[k] for k in items if k in cls.arg_keys}
    eng = str(engine).lower().strip()
    try:
        if eng in ["pandas", "c"]:
            return cls._read(**mykwargs)

        cls._validate_read_csv_kwargs(mykwargs)
        use_modin_impl, error_message = cls._read_csv_check_support(
            mykwargs,
        )
        if not use_modin_impl:
            raise ArrowEngineException(error_message)

        if isinstance(dtype, dict):
            column_types = {c: cls._dtype_to_arrow(t) for c, t in dtype.items()}
        else:
            column_types = cls._dtype_to_arrow(dtype)

        if (type(parse_dates) is list) and type(column_types) is dict:
            for c in parse_dates:
                column_types[c] = pa.timestamp("s")

        if names not in [lib.no_default, None] and header == 0:
            skiprows = skiprows + 1 if skiprows is not None else 1

        if delimiter is None and sep is not lib.no_default:
            delimiter = sep

        usecols_md = cls._prepare_pyarrow_usecols(mykwargs)

        po = ParseOptions(
            delimiter="\\s+" if delim_whitespace else delimiter,
            quote_char=quotechar,
            double_quote=doublequote,
            escape_char=escapechar,
            newlines_in_values=False,
            ignore_empty_lines=skip_blank_lines,
        )
        co = ConvertOptions(
            check_utf8=None,
            column_types=column_types,
            null_values=None,
            true_values=None,
            false_values=None,
            # timestamp fields should be handled as strings if parse_dates
            # wasn't passed explicitly as an array or a dict
            timestamp_parsers=[""] if isinstance(parse_dates, bool) else None,
            strings_can_be_null=None,
            include_columns=usecols_md,
            include_missing_columns=None,
            auto_dict_encode=None,
            auto_dict_max_cardinality=None,
        )
        ro = ReadOptions(
            use_threads=True,
            block_size=None,
            skip_rows=skiprows,
            column_names=names if names is not lib.no_default else None,
            autogenerate_column_names=None,
        )

        at = read_csv(
            filepath_or_buffer,
            read_options=ro,
            parse_options=po,
            convert_options=co,
        )

        return cls.from_arrow(at)
    except (
        pa.ArrowNotImplementedError,
        pa.ArrowInvalid,
        NotImplementedError,
        ArrowEngineException,
    ):
        if eng in ["arrow"]:
            raise

        ErrorMessage.default_to_pandas("`read_csv`")
        return cls._read(**mykwargs)
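# Sketch of the parse_dates handling above: listing a column in parse_dates
# forces it to pa.timestamp("s") via ConvertOptions.column_types. Column names
# and data here are made up for illustration.
import io

import pyarrow as pa
from pyarrow.csv import ConvertOptions, read_csv

data = io.BytesIO(b"id,when\n1,2023-05-01 10:00:00\n")
table = read_csv(
    data,
    convert_options=ConvertOptions(
        column_types={"when": pa.timestamp("s")},
        timestamp_parsers=["%Y-%m-%d %H:%M:%S"],
    ),
)
assert table.column("when").type == pa.timestamp("s")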