def test_type_from_numpy_dtype_timestamps():
    cases = [
        (np.dtype('datetime64[s]'), pa.timestamp('s')),
        (np.dtype('datetime64[ms]'), pa.timestamp('ms')),
        (np.dtype('datetime64[us]'), pa.timestamp('us')),
        (np.dtype('datetime64[ns]'), pa.timestamp('ns'))
    ]

    for dt, pt in cases:
        result = pa.from_numpy_dtype(dt)
        assert result == pt
def get_datetimetz_type(values, dtype, type_):
    if values.dtype.type != np.datetime64:
        return values, type_

    if _pandas_api.is_datetimetz(dtype) and type_ is None:
        # If no user type passed, construct a tz-aware timestamp type
        tz = dtype.tz
        unit = dtype.unit
        type_ = pa.timestamp(unit, tz)
    elif type_ is None:
        # Trust the NumPy dtype
        type_ = pa.from_numpy_dtype(values.dtype)

    return values, type_
def infer_pd_series_spark_type(s: pd.Series) -> types.DataType:
    """Infer Spark DataType from pandas Series dtype.

    :param s: :class:`pandas.Series` to be inferred
    :return: the inferred Spark data type
    """
    dt = s.dtype
    if dt == np.dtype('object'):
        if len(s) == 0 or s.isnull().all():
            raise ValueError("can not infer schema from empty or null dataset")
        return types.from_arrow_type(pa.Array.from_pandas(s).type)
    elif is_datetime64_dtype(dt) or is_datetime64tz_dtype(dt):
        return types.TimestampType()
    else:
        return types.from_arrow_type(pa.from_numpy_dtype(dt))
def get_datetimetz_type(values, dtype, type_):
    from pyarrow.compat import DatetimeTZDtype

    if values.dtype.type != np.datetime64:
        return values, type_

    if isinstance(dtype, DatetimeTZDtype):
        tz = dtype.tz
        unit = dtype.unit
        type_ = pa.timestamp(unit, tz)
    elif type_ is None:
        # Trust the NumPy dtype
        type_ = pa.from_numpy_dtype(values.dtype)

    return values, type_
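# Illustrative sketch (not part of the snippets above): how a tz-aware pandas
# dtype carries the unit/tz that get_datetimetz_type forwards to pa.timestamp,
# while a naive datetime64 dtype goes through pa.from_numpy_dtype instead.
# The series values here are made up for demonstration.
import numpy as np
import pandas as pd
import pyarrow as pa

s = pd.Series(pd.date_range("2021-01-01", periods=3, tz="UTC"))
tz_dtype = s.dtype                               # datetime64[ns, UTC]
print(pa.timestamp(tz_dtype.unit, tz_dtype.tz))  # timestamp[ns, tz=UTC]
print(pa.from_numpy_dtype(np.dtype("datetime64[ms]")))  # timestamp[ms]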
def test_from_numpy_dtype():
    cases = [
        (np.dtype('bool'), pa.bool_()),
        (np.dtype('int8'), pa.int8()),
        (np.dtype('int16'), pa.int16()),
        (np.dtype('int32'), pa.int32()),
        (np.dtype('int64'), pa.int64()),
        (np.dtype('uint8'), pa.uint8()),
        (np.dtype('uint16'), pa.uint16()),
        (np.dtype('uint32'), pa.uint32()),
        (np.dtype('float16'), pa.float16()),
        (np.dtype('float32'), pa.float32()),
        (np.dtype('float64'), pa.float64()),
        (np.dtype('U'), pa.string()),
        (np.dtype('S'), pa.binary()),
        (np.dtype('datetime64[s]'), pa.timestamp('s')),
        (np.dtype('datetime64[ms]'), pa.timestamp('ms')),
        (np.dtype('datetime64[us]'), pa.timestamp('us')),
        (np.dtype('datetime64[ns]'), pa.timestamp('ns'))
    ]

    for dt, pt in cases:
        result = pa.from_numpy_dtype(dt)
        assert result == pt

    # Things convertible to numpy dtypes work
    assert pa.from_numpy_dtype('U') == pa.string()
    assert pa.from_numpy_dtype(np.unicode) == pa.string()
    assert pa.from_numpy_dtype('int32') == pa.int32()
    assert pa.from_numpy_dtype(bool) == pa.bool_()

    with pytest.raises(NotImplementedError):
        pa.from_numpy_dtype(np.dtype('O'))

    with pytest.raises(TypeError):
        pa.from_numpy_dtype('not_convertible_to_dtype')
def _arrow_type_from_numpy_element_dtype(cls, dtype):
    # Scalar element dtype
    arrow_dtype = pa.from_numpy_dtype(dtype)
    return pa.binary(arrow_dtype.bit_width // 8)
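# Illustrative sketch: the bit_width -> byte-width arithmetic used by
# _arrow_type_from_numpy_element_dtype above, shown with a float64 element
# dtype chosen only as an example.
import numpy as np
import pyarrow as pa

elem = pa.from_numpy_dtype(np.dtype("float64"))
print(elem.bit_width // 8)              # 8 bytes per element
print(pa.binary(elem.bit_width // 8))   # fixed_size_binary[8]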
def _write_map_parquet(hsp_map, filepath, clobber=False, nside_io=4):
    """
    Internal method to write a HealSparseMap to a parquet dataset.
    Use the `metadata` property from the map to persist additional
    information in the parquet metadata.

    Parameters
    ----------
    hsp_map : `HealSparseMap`
        HealSparseMap to write to a file.
    filepath : `str`
        Name of dataset to save
    clobber : `bool`, optional
        Clobber existing file?  Not supported.
    nside_io : `int`, optional
        The healpix nside to partition the output map files in parquet.
        Must be less than or equal to nside_coverage, and not greater than 16.

    Raises
    ------
    RuntimeError if file exists.
    ValueError if nside_io is out of range.
    """
    if os.path.isfile(filepath) or os.path.isdir(filepath):
        raise RuntimeError("Filepath %s exists and clobber is not supported." % (filepath))

    if nside_io > hsp_map.nside_coverage:
        raise ValueError("nside_io must be <= nside_coverage.")
    elif nside_io > 16:
        raise ValueError("nside_io must be <= 16")
    elif nside_io < 0:
        raise ValueError("nside_io must be >= 0")

    # Make the path
    os.makedirs(filepath)

    # Create the nside_io paths
    cov_mask = hsp_map.coverage_mask
    cov_pixels = np.where(cov_mask)[0].astype(np.int32)

    bitshift_io = _compute_bitshift(nside_io, hsp_map.nside_coverage)
    cov_pixels_io = np.right_shift(cov_pixels, bitshift_io)

    if hsp_map.is_wide_mask_map:
        wmult = hsp_map.wide_mask_width
    else:
        wmult = 1

    if np.isclose(hsp_map._sentinel, hpg.UNSEEN):
        sentinel_string = 'UNSEEN'
    else:
        sentinel_string = str(hsp_map._sentinel)

    metadata = {
        'healsparse::version': '1',
        'healsparse::nside_sparse': str(hsp_map.nside_sparse),
        'healsparse::nside_coverage': str(hsp_map.nside_coverage),
        'healsparse::nside_io': str(nside_io),
        'healsparse::filetype': 'healsparse',
        'healsparse::primary': '' if hsp_map.primary is None else hsp_map.primary,
        'healsparse::sentinel': sentinel_string,
        'healsparse::widemask': str(hsp_map.is_wide_mask_map),
        'healsparse::wwidth': str(hsp_map._wide_mask_width)
    }

    # Add additional metadata
    if hsp_map.metadata is not None:
        # Use the fits header serialization for compatibility
        hdr_string = _make_header(hsp_map.metadata).tostring()
        metadata['healsparse::header'] = hdr_string

    if not hsp_map.is_rec_array:
        schema = pa.schema([('cov_pix', pa.from_numpy_dtype(np.int32)),
                            ('sparse', pa.from_numpy_dtype(hsp_map.dtype))],
                           metadata=metadata)
    else:
        type_list = [(name, pa.from_numpy_dtype(hsp_map.dtype[name].type))
                     for name in hsp_map.dtype.names]
        type_list[0: 0] = [('cov_pix', pa.from_numpy_dtype(np.int32))]
        schema = pa.schema(type_list, metadata=metadata)

    cov_map = hsp_map._cov_map
    sparse_map = hsp_map._sparse_map.ravel()

    cov_index_map_temp = cov_map[:] + np.arange(
        hpg.nside_to_npixel(hsp_map.nside_coverage),
        dtype=np.int64) * cov_map.nfine_per_cov

    pix_arr = np.zeros(cov_map.nfine_per_cov * wmult, dtype=np.int32)

    last_pix_io = -1
    writer = None
    row_groups = np.zeros_like(cov_pixels)
    for ctr, (pix_io, pix) in enumerate(zip(cov_pixels_io, cov_pixels)):
        # These are always going to be sorted
        if pix_io > last_pix_io:
            last_pix_io = pix_io

            if writer is not None:
                writer.close()
                writer = None

            # Create a new file
            pixpath = os.path.join(filepath, f'iopix={pix_io:03d}')
            os.makedirs(pixpath)

            pixfile = os.path.join(pixpath, f'{pix_io:03d}.parquet')
            writer = parquet.ParquetWriter(pixfile, schema)
            row_group_ctr = 0

        sparsepix = sparse_map[cov_index_map_temp[pix] * wmult:
                               (cov_index_map_temp[pix] + cov_map.nfine_per_cov) * wmult]
        pix_arr[:] = pix
        if not hsp_map.is_rec_array:
            arrays = [pa.array(pix_arr),
                      pa.array(sparsepix)]
        else:
            arrays = [pa.array(sparsepix[name]) for name in hsp_map.dtype.names]
            arrays[0: 0] = [pa.array(pix_arr)]

        tab = pa.Table.from_arrays(arrays, schema=schema)

        writer.write_table(tab)
        row_groups[ctr] = row_group_ctr
        row_group_ctr += 1

    if writer is not None:
        writer.close()

    # And write the coverage pixels and row groups
    tab = pa.Table.from_pydict({'cov_pix': pa.array(cov_pixels),
                                'row_group': pa.array(row_groups)})
    parquet.write_table(tab, os.path.join(filepath, '_coverage.parquet'))

    # And write the metadata
    parquet.write_metadata(schema, os.path.join(filepath, '_common_metadata'))
    parquet.write_metadata(schema, os.path.join(filepath, '_metadata'))
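# Illustrative sketch: how the rec-array branch above builds a pyarrow schema
# from a structured numpy dtype, using a made-up dtype in place of
# hsp_map.dtype.
import numpy as np
import pyarrow as pa

rec_dtype = np.dtype([("a", np.float64), ("b", np.int32)])
type_list = [(name, pa.from_numpy_dtype(rec_dtype[name].type))
             for name in rec_dtype.names]
type_list[0:0] = [("cov_pix", pa.from_numpy_dtype(np.int32))]
print(pa.schema(type_list))  # cov_pix: int32, a: double, b: int32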
def _generate_column(column_params, num_rows):
    # If cardinality is specified, we create a set to sample from.
    # Otherwise, we simply use the given generator to generate each value.
    if column_params.cardinality is not None:
        # Construct set of values to sample from where
        # set size = cardinality
        if (isinstance(column_params.dtype, str)
                and column_params.dtype == "category"):
            vals = pa.array(
                column_params.generator,
                size=column_params.cardinality,
                safe=False,
            )
            return pa.DictionaryArray.from_arrays(
                dictionary=vals,
                indices=np.random.randint(low=0, high=len(vals), size=num_rows),
                mask=np.random.choice(
                    [True, False],
                    size=num_rows,
                    p=[
                        column_params.null_frequency,
                        1 - column_params.null_frequency,
                    ],
                )
                if column_params.null_frequency > 0.0
                else None,
            )

        vals = pa.array(
            column_params.generator,
            size=column_params.cardinality,
            safe=False,
            type=pa.from_numpy_dtype(column_params.dtype)
            if column_params.dtype is not None
            else None,
        )

        # Generate data for current column
        return pa.array(
            np.random.choice(vals, size=num_rows),
            mask=np.random.choice(
                [True, False],
                size=num_rows,
                p=[
                    column_params.null_frequency,
                    1 - column_params.null_frequency,
                ],
            )
            if column_params.null_frequency > 0.0
            else None,
            size=num_rows,
            safe=False,
            type=pa.from_numpy_dtype(column_params.dtype)
            if column_params.dtype is not None
            else None,
        )
    else:
        # Generate data for current column
        return pa.array(
            column_params.generator,
            mask=np.random.choice(
                [True, False],
                size=num_rows,
                p=[
                    column_params.null_frequency,
                    1 - column_params.null_frequency,
                ],
            )
            if column_params.null_frequency > 0.0
            else None,
            size=num_rows,
            safe=False,
        )
def __init__(self, data, dtype=None):
    super(GeometryFixed, self).__init__(data)
    self.numpy_dtype = np.dtype(dtype)
    self.pyarrow_type = pa.from_numpy_dtype(dtype)
def infer_schema(self, data):
    """
    Infer a schema for a given data input. The schema can be used to test
    with schema validator. This function currently supports DataFrame,
    Numpy, Dictionary, List and basic python types::

        data = pandas.DataFrame(...)
        schema = infer_schema(data)

    This function returns None if it can not infer the schema.
    """
    schema = None
    if data is None:
        schema = pa.null()
    elif isinstance(data, dict):
        schema = {'type': dict, 'fields': {}}
        for key, value in data.items():
            schema['fields'][key] = self.infer_schema(value)
    elif isinstance(data, pd.DataFrame):
        schema = {'type': pd.DataFrame, 'fields': {}}
        # sample the table to get the schema
        pa_schema = pa.Table.from_pandas(data[:_SAMPLE_SIZE],
                                         preserve_index=False).schema
        for i, name in enumerate(pa_schema.names):
            schema['fields'][name] = pa_schema.types[i]
    elif isinstance(data, pd.Series):
        schema = {
            'type': pd.Series,
            'item': pa.Array.from_pandas(data).type,
        }
    elif isinstance(data, np.ndarray):
        pa_type = pa.from_numpy_dtype(
            data.dtype) if data.dtype.num != 17 else pa.string()
        if len(data.shape) == 1:
            # 1d array
            schema = {
                'type': np.ndarray,
                'item': pa_type,
            }
        else:
            shape = [
                v if i != 0 else None for i, v in enumerate(data.shape)
            ]
            schema = {
                'type': np.ndarray,
                'item': pa_type,
                'shape': tuple(shape),
            }
    elif isinstance(data, pa.Table):
        schema = data.schema
    elif isinstance(data, (list, tuple)) and len(data) > 0:
        # try to infer type of the data
        current_type = self.infer_schema(data[0])
        for i in range(1, min(len(data), _SAMPLE_SIZE)):
            new_type = self.infer_schema(data[i])
            if new_type != current_type:
                current_type = None
                break  # does not support multiple type yet
        if current_type:
            if isinstance(current_type, pa.DataType):
                schema = pa.list_(current_type)
            else:
                schema = {'type': list, 'item': current_type}
    elif type(data) in _python_mapping:
        schema = _python_mapping[type(data)]()
    else:
        return {'type': type(data)}
    return schema
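# Illustrative sketch: the dtype.num != 17 check above guards against object
# dtype (numpy type number 17), which pa.from_numpy_dtype cannot convert and
# which infer_schema therefore maps to pa.string() instead.
import numpy as np
import pyarrow as pa

print(np.dtype("O").num)                       # 17 (object dtype)
print(pa.from_numpy_dtype(np.dtype("int64")))  # int64 converts directly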
def arrow_type(self):
    if isinstance(self._value_type, ArrowDtype):
        arrow_subdtype = self._value_type.arrow_type
    else:
        arrow_subdtype = pa.from_numpy_dtype(self._value_type)
    return pa.list_(arrow_subdtype)
def to_arrow(self):
    return ArrowIntervalType(pa.from_numpy_dtype(self.subtype), self.closed)
def pandas_read_csv(
    filepath_or_buffer,
    sep=',',
    delimiter=None,
    # Column and Index Locations and Names
    header="infer",
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    prefix=None,
    mangle_dupe_cols=True,
    # General Parsing Configuration
    dtype=None,
    engine=None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace=False,
    skiprows=None,
    skipfooter=0,
    nrows=None,
    # NA and Missing Data Handling
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    verbose=False,
    skip_blank_lines=True,
    # Datetime Handling
    parse_dates=False,
    infer_datetime_format=False,
    keep_date_col=False,
    date_parser=None,
    dayfirst=False,
    cache_dates=True,
    # Iteration
    iterator=False,
    chunksize=None,
    # Quoting, Compression, and File Format
    compression="infer",
    thousands=None,
    decimal=b".",
    lineterminator=None,
    quotechar='"',
    # quoting=csv.QUOTE_MINIMAL,  # not supported
    doublequote=True,
    escapechar=None,
    comment=None,
    encoding=None,
    dialect=None,
    # Error Handling
    error_bad_lines=True,
    warn_bad_lines=True,
    # Internal
    delim_whitespace=False,
    # low_memory=_c_parser_defaults["low_memory"],  # not supported
    memory_map=False,
    float_precision=None,
):
    """Implements pandas.read_csv via pyarrow.csv.read_csv.

    This function has the same interface as pandas.read_csv.
    """
    if delimiter is None:
        delimiter = sep

    autogenerate_column_names = bool(names)

    include_columns = None
    if usecols:
        if type(usecols[0]) == str:
            if names:
                include_columns = [f'f{names.index(col)}' for col in usecols]
            else:
                include_columns = usecols
        elif type(usecols[0]) == int:
            include_columns = [f'f{i}' for i in usecols]
        else:
            # usecols should be all str or int
            assert False

    # try:
    #     keys = [k for k, v in dtype.items() if isinstance(v, pd.CategoricalDtype)]
    #     if keys:
    #         for k in keys:
    #             del dtype[k]
    #         names_list = list(names)
    #         categories = [f"f{names_list.index(k)}" for k in keys]
    # except: pass

    categories = []
    if dtype:
        if names:
            names_list = list(names)
            if isinstance(dtype, dict):
                column_types = {}
                for k, v in dtype.items():
                    column_name = "f{}".format(names_list.index(k))
                    if isinstance(v, pd.CategoricalDtype):
                        categories.append(column_name)
                        column_type = pyarrow.string()
                    else:
                        column_type = pyarrow.from_numpy_dtype(v)
                    column_types[column_name] = column_type
            else:
                pa_dtype = pyarrow.from_numpy_dtype(dtype)
                column_types = {
                    f"f{names_list.index(k)}": pa_dtype
                    for k in names
                }
        elif usecols:
            if isinstance(dtype, dict):
                column_types = {
                    k: pyarrow.from_numpy_dtype(v)
                    for k, v in dtype.items()
                }
            else:
                column_types = {
                    k: pyarrow.from_numpy_dtype(dtype)
                    for k in usecols
                }
        else:
            if isinstance(dtype, dict):
                column_types = {
                    k: pyarrow.from_numpy_dtype(v)
                    for k, v in dtype.items()
                }
            else:
                column_types = pyarrow.from_numpy_dtype(dtype)
    else:
        column_types = None

    try:
        for column in parse_dates:
            name = f"f{column}"
            # TODO: Try to help pyarrow infer date type - set DateType.
            # dtype[name] = pyarrow.from_numpy_dtype(np.datetime64)  # string
            del column_types[name]
    except:
        pass

    parse_options = pyarrow.csv.ParseOptions(
        delimiter=delimiter,
    )
    read_options = pyarrow.csv.ReadOptions(
        skip_rows=skiprows,
        # column_names=column_names,
        autogenerate_column_names=autogenerate_column_names,
    )
    convert_options = pyarrow.csv.ConvertOptions(
        column_types=column_types,
        strings_can_be_null=True,
        include_columns=include_columns,
    )

    table = pyarrow.csv.read_csv(
        filepath_or_buffer,
        read_options=read_options,
        parse_options=parse_options,
        convert_options=convert_options,
    )

    dataframe = table.to_pandas(
        # categories=categories or None,
    )

    if names:
        if usecols and len(names) != len(usecols):
            if isinstance(usecols[0], int):
                dataframe.columns = [names[col] for col in usecols]
            elif isinstance(usecols[0], str):
                dataframe.columns = [name for name in names if name in usecols]
        else:
            dataframe.columns = names

    # fix when PyArrow will support predicted categories
    if isinstance(dtype, dict):
        for column_name, column_type in dtype.items():
            if isinstance(column_type, pd.CategoricalDtype):
                dataframe[column_name] = dataframe[column_name].astype(column_type)

    return dataframe
def csv_reader_get_pyarrow_convert_options(names, usecols, dtype, parse_dates):

    include_columns = None  # default value (include all CSV columns)
    # if names is not given then column names will be defined from the first row of CSV file,
    # otherwise pyarrow autogenerated column names will be used (see ReadOptions), so
    # map pandas usecols to pyarrow include_columns accordingly
    if usecols:
        if type(usecols[0]) == str:
            if names:
                include_columns = [f'f{names.index(col)}' for col in usecols]
            else:
                include_columns = usecols  # no autogenerated names
        elif type(usecols[0]) == int:
            include_columns = [f'f{i}' for i in usecols]
        else:
            assert False, f"Failed building pyarrow ConvertOptions due to usecols param value: {usecols}"

    if dtype:
        # dtype pandas read_csv argument maps to pyarrow column_types dict, but column names
        # must match those that are read from CSV (if names is None) or pyarrows generated names (otherwise)
        if isinstance(dtype, dict):
            if names:
                names_list = list(names)
                column_types = {}
                for k, v in dtype.items():
                    # TO-DO: check this is aligned with include_columns
                    column_name = "f{}".format(names_list.index(k))
                    if isinstance(v, pd.CategoricalDtype):
                        column_type = pa.string()
                    else:
                        column_type = pa.from_numpy_dtype(v)
                    column_types[column_name] = column_type
            else:
                column_types = {k: pa.from_numpy_dtype(v) for k, v in dtype.items()}
        else:
            # single dtype for all columns
            pa_dtype = pa.from_numpy_dtype(dtype)
            if names:
                names_list = list(names)
                column_types = {f"f{names_list.index(k)}": pa_dtype for k in names}
            elif usecols:
                column_types = dict.fromkeys(usecols, pa_dtype)
            else:
                column_types = pa_dtype
    else:
        column_types = None

    # TO-DO: support all possible parse_dates values (now only list of column positions is supported)
    try:
        for column in parse_dates:
            name = f"f{column}"
            # starting from pyarrow=3.0.0 strings are parsed to DateType (converted back to 'object'
            # when using to_pandas), but not TimestampType (that is used to represent np.datetime64)
            # see: pyarrow.from_numpy_dtype(np.datetime64('NaT', 's'))
            # so make pyarrow infer needed type manually
            column_types[name] = pa.timestamp('s')
    except (KeyError, TypeError):
        pass

    convert_options = csv.ConvertOptions(
        column_types=column_types,
        strings_can_be_null=True,
        include_columns=include_columns,
    )

    return convert_options
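# Illustrative sketch: the kind of ConvertOptions the helper above produces,
# constructed directly here with made-up column names and types.
import pyarrow as pa
from pyarrow import csv

opts = csv.ConvertOptions(
    column_types={"f0": pa.int64(), "f1": pa.timestamp("s")},
    strings_can_be_null=True,
    include_columns=["f0", "f1"],
)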
def type(self):
    return pyarrow.from_numpy_dtype(self.dtype)
def __init__(self, array, dtype=None):
    def invalid_array():
        err_msg = (
            "Invalid array with type {typ}\n"
            "A {cls} may be constructed from:\n"
            "    - A 1-d array with length divisible by {n} of interleaved\n"
            "      x y coordinates\n"
            "    - A tuple of {n} 1-d arrays\n"
            "    - A pyarrow.FixedSizeBinaryArray. In this case the dtype\n"
            "      argument must also be specified").format(
            typ=type(array),
            cls=self.__class__.__name__,
            n=self._element_len,
        )
        raise ValueError(err_msg)

    if isinstance(dtype, GeometryDtype):
        dtype = dtype.subtype

    numpy_dtype = None
    pa_type = None
    if isinstance(array, (pa.Array, pa.ChunkedArray)):
        if dtype is None:
            invalid_array()
        numpy_dtype = np.dtype(dtype)
    elif isinstance(array, tuple):
        if len(array) == self._element_len:
            array = [np.asarray(array[i]) for i in range(len(array))]
            if dtype:
                array = [array[i].astype(dtype) for i in range(len(array))]

            # Capture numpy dtype
            numpy_dtype = array[0].dtype

            # Create buffer and FixedSizeBinaryArray
            pa_type = pa.binary(
                self._element_len * pa.from_numpy_dtype(numpy_dtype).bit_width // 8)
            buffers = [
                None,
                pa.py_buffer(np.stack(array, axis=1).tobytes())
            ]
            array = pa.Array.from_buffers(pa_type, len(array[0]), buffers=buffers)
        else:
            invalid_array()
    else:
        array = np.asarray(array)
        if array.dtype.kind == 'O':
            if array.ndim != 1:
                invalid_array()

            # Try to infer dtype
            if dtype is None:
                for i in range(len(array)):
                    el = array[i]
                    if el is None:
                        continue
                    if isinstance(el, GeometryFixed):
                        numpy_dtype = el.numpy_dtype
                    else:
                        el_array = np.asarray(el)
                        numpy_dtype = el_array.dtype
                    break
                if numpy_dtype is None:
                    invalid_array()
            else:
                numpy_dtype = dtype

            # Explicitly set the pyarrow binary type
            pa_type = pa.binary(
                self._element_len * pa.from_numpy_dtype(numpy_dtype).bit_width // 8)

            # Convert individual rows to bytes
            array = array.copy()
            for i in range(len(array)):
                el = array[i]
                if el is None:
                    continue
                if isinstance(el, bytes):
                    # Nothing to do
                    pass
                elif isinstance(el, GeometryFixed):
                    array[i] = el.flat_values.tobytes()
                else:
                    array[i] = np.asarray(el, dtype=numpy_dtype).tobytes()
        else:
            if dtype:
                array = array.astype(dtype)

            # Capture numpy dtype
            numpy_dtype = array.dtype

            pa_type = pa.binary(
                self._element_len * pa.from_numpy_dtype(numpy_dtype).bit_width // 8)

            if array.ndim == 2:
                # Handle 2d array case
                if array.shape[1] != self._element_len:
                    invalid_array()

                # Create buffer and FixedSizeBinaryArray
                buffers = [None, pa.py_buffer(array.tobytes())]
                array = pa.Array.from_buffers(pa_type, array.shape[0], buffers=buffers)
            elif array.ndim == 1 and len(array) % self._element_len == 0:
                buffers = [None, pa.py_buffer(array.tobytes())]
                array = pa.Array.from_buffers(
                    pa_type, len(array) // self._element_len, buffers=buffers)
            else:
                invalid_array()

    self._numpy_dtype = numpy_dtype
    super().__init__(array, pa_type)
def from_numpy(obj, batch_size=None):
    """
    Convert a list of numpy.ndarrays with equal shapes, or a single
    numpy.ndarray whose outer dimension is the batch size, to a pyarrow.Array
    """
    if isinstance(obj, (list, tuple)):
        if batch_size is not None:

            def list_gen():
                for i in range(0, len(obj), batch_size):
                    slc = obj[i:i + batch_size]
                    yield ArrowTensorArray.from_numpy(slc, batch_size=None)

            return list_gen()
        elif np.isscalar(obj[0]):
            return pa.array(obj)
        elif isinstance(obj[0], np.ndarray):
            # continue with batched ndarray
            obj = np.stack(obj, axis=0)

    if isinstance(obj, dict):
        names = list(obj.keys())
        arrs = [
            ArrowTensorArray.from_numpy(obj[k], batch_size=batch_size)
            for k in names
        ]
        batch = pa.RecordBatch.from_arrays(arrs, names)
        return pa.Table.from_batches([batch])
    elif isinstance(obj, np.ndarray):
        # currently require contiguous ndarray
        if not obj.flags.c_contiguous:
            obj = np.ascontiguousarray(obj)
        pa_dtype = pa.from_numpy_dtype(obj.dtype)
        batch_size = obj.shape[0]
        element_shape = obj.shape[1:]
        total_num_elements = obj.size
        num_elements = 1 if len(obj.shape) == 1 else np.prod(element_shape)

        child_buf = pa.py_buffer(obj)
        child_array = pa.Array.from_buffers(pa_dtype, total_num_elements,
                                            [None, child_buf])

        offset_buf = pa.py_buffer(
            np.int32([i * num_elements for i in range(batch_size + 1)]))

        storage = pa.Array.from_buffers(pa.list_(pa_dtype), batch_size,
                                        [None, offset_buf],
                                        children=[child_array])

        typ = ArrowTensorType(element_shape, pa_dtype)
        return pa.ExtensionArray.from_storage(typ, storage)
    elif np.isscalar(obj):
        return pa.array([obj])
    else:

        def iter_gen():
            if batch_size is None:
                for d in obj:
                    yield ArrowTensorArray.from_numpy(d, batch_size=batch_size)
            else:
                batch = []
                for o in obj:
                    batch.append(o)
                    if len(batch) == batch_size:
                        # merge dict
                        if isinstance(batch[0], dict):
                            d = {k: [v] for k, v in batch[0].items()}
                            for i in range(1, len(batch)):
                                for k, v in batch[i].items():
                                    d[k].append(v)
                            for k in d.keys():
                                d[k] = np.stack(d[k], axis=0)
                            batch = d
                        yield ArrowTensorArray.from_numpy(batch, batch_size=None)
                        batch = []

        return iter_gen()
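# Illustrative sketch: the offset computation from_numpy relies on when it
# wraps a batched ndarray as a list array, using a small made-up tensor batch.
import numpy as np
import pyarrow as pa

obj = np.arange(12, dtype=np.float32).reshape(3, 2, 2)
pa_dtype = pa.from_numpy_dtype(obj.dtype)        # float32 storage type
num_elements = int(np.prod(obj.shape[1:]))       # 4 values per tensor
offsets = np.int32([i * num_elements for i in range(obj.shape[0] + 1)])
print(pa_dtype, offsets)                         # float [ 0  4  8 12]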
def get_dataframe(parameters, use_threads):
    # Initialize seeds
    if parameters.seed is not None:
        np.random.seed(parameters.seed)

    # For each column, use a generic Mimesis producer to create an Iterable
    # for generating data
    for i, column_params in enumerate(parameters.column_parameters):
        if column_params.dtype is None:
            column_params.generator = column_params.generator(
                Generic("en", seed=parameters.seed))
        else:
            column_params.generator = column_params.generator()

    # Get schema for each column
    table_fields = []
    for i, column_params in enumerate(parameters.column_parameters):
        if (isinstance(column_params.dtype, str)
                and column_params.dtype == "category"):
            arrow_type = pa.dictionary(
                index_type=pa.int64(),
                value_type=pa.from_numpy_dtype(
                    type(next(iter(column_params.generator)))),
            )
        elif hasattr(column_params.dtype, "to_arrow"):
            arrow_type = column_params.dtype.to_arrow()
        else:
            arrow_type = pa.from_numpy_dtype(
                type(next(iter(column_params.generator)))
                if column_params.dtype is None
                else column_params.dtype)
        table_fields.append(
            pa.field(
                name=str(i),
                type=arrow_type,
                nullable=column_params.null_frequency > 0,
            ))

    schema = pa.schema(table_fields)

    # Initialize column data and which columns should be sorted
    column_data = [None] * len(parameters.column_parameters)
    columns_to_sort = [
        str(i)
        for i, column_params in enumerate(parameters.column_parameters)
        if column_params.is_sorted
    ]

    # Generate data
    if not use_threads:
        for i, column_params in enumerate(parameters.column_parameters):
            column_data[i] = _generate_column(column_params,
                                              parameters.num_rows)
    else:
        pool = Pool(pa.cpu_count())
        column_data = pool.starmap(
            _generate_column,
            [(column_params, parameters.num_rows)
             for i, column_params in enumerate(parameters.column_parameters)],
        )
        pool.close()
        pool.join()

    # Convert to Pandas DataFrame and sort columns appropriately
    tbl = pa.Table.from_arrays(
        column_data,
        schema=schema,
    )
    if columns_to_sort:
        tbl = tbl.to_pandas()
        tbl = tbl.sort_values(columns_to_sort)
        tbl = pa.Table.from_pandas(tbl, schema)
    return tbl
def pandas_read_csv(
    filepath_or_buffer,
    sep=",",
    # Column and Index Locations and Names
    names=None,
    usecols=None,
    # General Parsing Configuration
    dtype=None,
    skiprows=None,
    # Datetime Handling
    parse_dates=False,
):
    """Implements pandas.read_csv via pyarrow.csv.read_csv.

    This function has the same interface as pandas.read_csv.
    """
    # Fallback to pandas
    need_categorical = isinstance(dtype, pd.CategoricalDtype)
    try:
        need_categorical |= any(isinstance(v, pd.CategoricalDtype)
                                for v in dtype.values())
    except:
        pass

    if need_categorical:
        return pd.read_csv(
            filepath_or_buffer,
            sep=sep,
            names=names,
            usecols=usecols,
            dtype=dtype,
            skiprows=skiprows,
            parse_dates=parse_dates
        )

    autogenerate_column_names = bool(names)

    include_columns = None
    # categories = None
    if usecols is not None:
        include_columns = [f'f{i}' for i in usecols]

    read_options = pyarrow.csv.ReadOptions(
        skip_rows=skiprows,
        # column_names=column_names,
        autogenerate_column_names=autogenerate_column_names,
    )
    parse_options = pyarrow.csv.ParseOptions(
        delimiter=sep,
    )

    # try:
    #     keys = [k for k, v in dtype.items() if isinstance(v, pd.CategoricalDtype)]
    #     if keys:
    #         for k in keys:
    #             del dtype[k]
    #         names_list = list(names)
    #         categories = [f"f{names_list.index(k)}" for k in keys]
    # except: pass

    if dtype is not None:
        names_list = list(names)
        if not hasattr(dtype, 'items'):
            dtype = {
                f"f{names_list.index(k)}": pyarrow.from_numpy_dtype(dtype)
                for k in names
            }
        else:
            dtype = {
                f"f{names_list.index(k)}": pyarrow.from_numpy_dtype(v)
                for k, v in dtype.items()
            }

    try:
        for column in parse_dates:
            name = f"f{column}"
            # TODO: Try to help pyarrow infer date type - set DateType.
            # dtype[name] = pyarrow.from_numpy_dtype(np.datetime64)  # string
            del dtype[name]
    except:
        pass

    convert_options = pyarrow.csv.ConvertOptions(
        column_types=dtype,
        strings_can_be_null=True,
        include_columns=include_columns,
    )

    table = pyarrow.csv.read_csv(
        filepath_or_buffer,
        read_options=read_options,
        parse_options=parse_options,
        convert_options=convert_options,
    )

    dataframe = table.to_pandas(
        # categories=categories,
    )

    if names is not None:
        dataframe.columns = names

    return dataframe
def generate(
    path,
    parameters,
    format={"name": "parquet", "row_group_size": 64},
    use_threads=True,
):
    """
    Generate dataset using given parameters and write to given format

    Parameters
    ----------
    path : str or file-like object
        Path to write to
    parameters : Parameters
        Parameters specifying how to randomly generate data
    format : Dict
        Format to write
    """
    # Initialize seeds
    if parameters.seed is not None:
        np.random.seed(parameters.seed)
    column_seeds = np.arange(len(parameters.column_parameters))
    np.random.shuffle(column_seeds)

    # For each column, use a generic Mimesis producer to create an Iterable
    # for generating data
    for i, column_params in enumerate(parameters.column_parameters):
        column_params.generator = column_params.generator(
            Generic("en", seed=column_seeds[i]))

    # Get schema for each column
    schema = pa.schema([
        pa.field(
            name=str(i),
            type=pa.from_numpy_dtype(
                type(next(iter(column_params.generator)))),
            nullable=column_params.null_frequency > 0,
        )
        for i, column_params in enumerate(parameters.column_parameters)
    ])

    # Initialize column data and which columns should be sorted
    column_data = [None] * len(parameters.column_parameters)
    columns_to_sort = [
        str(i)
        for i, column_params in enumerate(parameters.column_parameters)
        if column_params.is_sorted
    ]

    # Generate data
    if not use_threads:
        for i, column_params in enumerate(parameters.column_parameters):
            column_data[i] = _generate_column(column_params,
                                              parameters.num_rows)
    else:
        pool = Pool(pa.cpu_count())
        column_data = pool.starmap(
            _generate_column,
            [(column_params, parameters.num_rows)
             for i, column_params in enumerate(parameters.column_parameters)],
        )
        pool.close()
        pool.join()

    # Convert to Pandas DataFrame and sort columns appropriately
    tbl = pa.Table.from_arrays(
        column_data,
        schema=schema,
    )
    if columns_to_sort:
        tbl = tbl.to_pandas()
        tbl = tbl.sort_values(columns_to_sort)
        tbl = pa.Table.from_pandas(tbl, schema)

    # Write
    _write(tbl, path, format)