Example 1
def test_type_from_numpy_dtype_timestamps():
    cases = [
        (np.dtype('datetime64[s]'), pa.timestamp('s')),
        (np.dtype('datetime64[ms]'), pa.timestamp('ms')),
        (np.dtype('datetime64[us]'), pa.timestamp('us')),
        (np.dtype('datetime64[ns]'), pa.timestamp('ns'))
    ]

    for dt, pt in cases:
        result = pa.from_numpy_dtype(dt)
        assert result == pt
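
Note that a NumPy datetime64 dtype carries only a resolution unit, never a timezone, so from_numpy_dtype always yields a tz-naive timestamp type; that is why helpers such as get_datetimetz_type below build pa.timestamp(unit, tz) themselves for tz-aware pandas data. A minimal sketch of that limitation (assuming a recent pyarrow):

import numpy as np
import pyarrow as pa

# The unit is preserved, but no timezone can be inferred from a NumPy dtype.
t = pa.from_numpy_dtype(np.dtype('datetime64[ns]'))
assert t == pa.timestamp('ns')
assert t.tz is None
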
Example 2
def get_datetimetz_type(values, dtype, type_):
    if values.dtype.type != np.datetime64:
        return values, type_

    if _pandas_api.is_datetimetz(dtype) and type_ is None:
        # If no user type passed, construct a tz-aware timestamp type
        tz = dtype.tz
        unit = dtype.unit
        type_ = pa.timestamp(unit, tz)
    elif type_ is None:
        # Trust the NumPy dtype
        type_ = pa.from_numpy_dtype(values.dtype)

    return values, type_
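
A minimal sketch of how such a helper might be called for a tz-aware pandas column (hypothetical usage; Series.values converts tz-aware data to a UTC datetime64[ns] array, so the datetime64 check above passes):

import pandas as pd

s = pd.Series(pd.date_range('2021-01-01', periods=3, tz='UTC'))
values, type_ = get_datetimetz_type(s.values, s.dtype, None)
# type_ is expected to be timestamp[ns, tz=UTC]; a tz-naive datetime64 column
# would instead fall through to pa.from_numpy_dtype(values.dtype)
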
Example 3
def infer_pd_series_spark_type(s: pd.Series) -> types.DataType:
    """Infer Spark DataType from pandas Series dtype.

    :param s: :class:`pandas.Series` to be inferred
    :return: the inferred Spark data type
    """
    dt = s.dtype
    if dt == np.dtype('object'):
        if len(s) == 0 or s.isnull().all():
            raise ValueError("can not infer schema from empty or null dataset")
        return types.from_arrow_type(pa.Array.from_pandas(s).type)
    elif is_datetime64_dtype(dt) or is_datetime64tz_dtype(dt):
        return types.TimestampType()
    else:
        return types.from_arrow_type(pa.from_numpy_dtype(dt))
Example 4
def get_datetimetz_type(values, dtype, type_):
    from pyarrow.compat import DatetimeTZDtype

    if values.dtype.type != np.datetime64:
        return values, type_

    if isinstance(dtype, DatetimeTZDtype):
        tz = dtype.tz
        unit = dtype.unit
        type_ = pa.timestamp(unit, tz)
    elif type_ is None:
        # Trust the NumPy dtype
        type_ = pa.from_numpy_dtype(values.dtype)

    return values, type_
Example 5
def test_from_numpy_dtype():
    cases = [
        (np.dtype('bool'), pa.bool_()),
        (np.dtype('int8'), pa.int8()),
        (np.dtype('int16'), pa.int16()),
        (np.dtype('int32'), pa.int32()),
        (np.dtype('int64'), pa.int64()),
        (np.dtype('uint8'), pa.uint8()),
        (np.dtype('uint16'), pa.uint16()),
        (np.dtype('uint32'), pa.uint32()),
        (np.dtype('float16'), pa.float16()),
        (np.dtype('float32'), pa.float32()),
        (np.dtype('float64'), pa.float64()),
        (np.dtype('U'), pa.string()),
        (np.dtype('S'), pa.binary()),
        (np.dtype('datetime64[s]'), pa.timestamp('s')),
        (np.dtype('datetime64[ms]'), pa.timestamp('ms')),
        (np.dtype('datetime64[us]'), pa.timestamp('us')),
        (np.dtype('datetime64[ns]'), pa.timestamp('ns'))
    ]

    for dt, pt in cases:
        result = pa.from_numpy_dtype(dt)
        assert result == pt

    # Things convertible to numpy dtypes work
    assert pa.from_numpy_dtype('U') == pa.string()
    assert pa.from_numpy_dtype(np.unicode) == pa.string()
    assert pa.from_numpy_dtype('int32') == pa.int32()
    assert pa.from_numpy_dtype(bool) == pa.bool_()

    with pytest.raises(NotImplementedError):
        pa.from_numpy_dtype(np.dtype('O'))

    with pytest.raises(TypeError):
        pa.from_numpy_dtype('not_convertible_to_dtype')
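
Since NumPy's object dtype is rejected (the NotImplementedError case asserted above), callers that may encounter object arrays typically need an explicit fallback; Example 10 below does this by checking data.dtype.num == 17 and substituting pa.string(). A hedged sketch of the same idea:

import numpy as np
import pyarrow as pa

def arrow_type_or_string(dtype):
    # Object dtype has no single Arrow equivalent; fall back to string.
    try:
        return pa.from_numpy_dtype(dtype)
    except NotImplementedError:
        return pa.string()

assert arrow_type_or_string(np.dtype('int32')) == pa.int32()
assert arrow_type_or_string(np.dtype('O')) == pa.string()
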
Example 6
    def _arrow_type_from_numpy_element_dtype(cls, dtype):
        # Scalar element dtype
        arrow_dtype = pa.from_numpy_dtype(dtype)
        return pa.binary(arrow_dtype.bit_width // 8)
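
The snippet above derives a fixed-size binary type whose byte width matches the NumPy element dtype. A minimal sketch of the same computation, assuming a float64 element dtype:

import numpy as np
import pyarrow as pa

# bit_width is defined for fixed-width Arrow types; 64 bits -> 8 bytes per element
arrow_dtype = pa.from_numpy_dtype(np.dtype('float64'))
fixed = pa.binary(arrow_dtype.bit_width // 8)
assert fixed == pa.binary(8)
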
Example 7
def _write_map_parquet(hsp_map, filepath, clobber=False, nside_io=4):
    """
    Internal method to write a HealSparseMap to a parquet dataset.
    Uses the `metadata` property from the map to persist additional
    information in the parquet metadata.

    Parameters
    ----------
    hsp_map : `HealSparseMap`
        HealSparseMap to write to a file.
    filepath : `str`
        Name of dataset to save
    clobber : `bool`, optional
        Clobber existing file?  Not supported.
    nside_io : `int`, optional
        The healpix nside to partition the output map files in parquet.
        Must be less than or equal to nside_coverage, and not greater than 16.

    Raises
    ------
    RuntimeError if file exists.
    ValueError if nside_io is out of range.
    """
    if os.path.isfile(filepath) or os.path.isdir(filepath):
        raise RuntimeError("Filepath %s exists and clobber is not supported." %
                           (filepath))

    if nside_io > hsp_map.nside_coverage:
        raise ValueError("nside_io must be <= nside_coverage.")
    elif nside_io > 16:
        raise ValueError("nside_io must be <= 16")
    elif nside_io < 0:
        raise ValueError("nside_io must be >= 0")

    # Make the path
    os.makedirs(filepath)

    # Create the nside_io paths
    cov_mask = hsp_map.coverage_mask

    cov_pixels = np.where(cov_mask)[0].astype(np.int32)
    bitshift_io = _compute_bitshift(nside_io, hsp_map.nside_coverage)
    cov_pixels_io = np.right_shift(cov_pixels, bitshift_io)

    if hsp_map.is_wide_mask_map:
        wmult = hsp_map.wide_mask_width
    else:
        wmult = 1

    if np.isclose(hsp_map._sentinel, hpg.UNSEEN):
        sentinel_string = 'UNSEEN'
    else:
        sentinel_string = str(hsp_map._sentinel)

    metadata = {
        'healsparse::version': '1',
        'healsparse::nside_sparse': str(hsp_map.nside_sparse),
        'healsparse::nside_coverage': str(hsp_map.nside_coverage),
        'healsparse::nside_io': str(nside_io),
        'healsparse::filetype': 'healsparse',
        'healsparse::primary':
        '' if hsp_map.primary is None else hsp_map.primary,
        'healsparse::sentinel': sentinel_string,
        'healsparse::widemask': str(hsp_map.is_wide_mask_map),
        'healsparse::wwidth': str(hsp_map._wide_mask_width)
    }

    # Add additional metadata
    if hsp_map.metadata is not None:
        # Use the fits header serialization for compatibility
        hdr_string = _make_header(hsp_map.metadata).tostring()
        metadata['healsparse::header'] = hdr_string

    if not hsp_map.is_rec_array:
        schema = pa.schema([('cov_pix', pa.from_numpy_dtype(np.int32)),
                            ('sparse', pa.from_numpy_dtype(hsp_map.dtype))],
                           metadata=metadata)
    else:
        type_list = [(name, pa.from_numpy_dtype(hsp_map.dtype[name].type))
                     for name in hsp_map.dtype.names]
        type_list[0:0] = [('cov_pix', pa.from_numpy_dtype(np.int32))]
        schema = pa.schema(type_list, metadata=metadata)

    cov_map = hsp_map._cov_map
    sparse_map = hsp_map._sparse_map.ravel()
    cov_index_map_temp = cov_map[:] + np.arange(
        hpg.nside_to_npixel(
            hsp_map.nside_coverage), dtype=np.int64) * cov_map.nfine_per_cov

    pix_arr = np.zeros(cov_map.nfine_per_cov * wmult, dtype=np.int32)

    last_pix_io = -1
    writer = None
    row_groups = np.zeros_like(cov_pixels)
    for ctr, (pix_io, pix) in enumerate(zip(cov_pixels_io, cov_pixels)):
        # These are always going to be sorted
        if pix_io > last_pix_io:
            last_pix_io = pix_io

            if writer is not None:
                writer.close()
                writer = None

            # Create a new file
            pixpath = os.path.join(filepath, f'iopix={pix_io:03d}')
            os.makedirs(pixpath)

            pixfile = os.path.join(pixpath, f'{pix_io:03d}.parquet')
            writer = parquet.ParquetWriter(pixfile, schema)
            row_group_ctr = 0

        sparsepix = sparse_map[cov_index_map_temp[pix] *
                               wmult:(cov_index_map_temp[pix] +
                                      cov_map.nfine_per_cov) * wmult]
        pix_arr[:] = pix
        if not hsp_map.is_rec_array:
            arrays = [pa.array(pix_arr), pa.array(sparsepix)]
        else:
            arrays = [
                pa.array(sparsepix[name]) for name in hsp_map.dtype.names
            ]
            arrays[0:0] = [pa.array(pix_arr)]
        tab = pa.Table.from_arrays(arrays, schema=schema)

        writer.write_table(tab)
        row_groups[ctr] = row_group_ctr
        row_group_ctr += 1

    if writer is not None:
        writer.close()

    # And write the coverage pixels and row groups
    tab = pa.Table.from_pydict({
        'cov_pix': pa.array(cov_pixels),
        'row_group': pa.array(row_groups)
    })
    parquet.write_table(tab, os.path.join(filepath, '_coverage.parquet'))

    # And write the metadata
    parquet.write_metadata(schema, os.path.join(filepath, '_common_metadata'))
    parquet.write_metadata(schema, os.path.join(filepath, '_metadata'))
Example 8
def _generate_column(column_params, num_rows):
    # If cardinality is specified, we create a set to sample from.
    # Otherwise, we simply use the given generator to generate each value.
    if column_params.cardinality is not None:
        # Construct set of values to sample from where
        # set size = cardinality

        if (isinstance(column_params.dtype, str)
                and column_params.dtype == "category"):
            vals = pa.array(
                column_params.generator,
                size=column_params.cardinality,
                safe=False,
            )
            return pa.DictionaryArray.from_arrays(
                dictionary=vals,
                indices=np.random.randint(low=0, high=len(vals),
                                          size=num_rows),
                mask=np.random.choice(
                    [True, False],
                    size=num_rows,
                    p=[
                        column_params.null_frequency,
                        1 - column_params.null_frequency,
                    ],
                ) if column_params.null_frequency > 0.0 else None,
            )

        vals = pa.array(
            column_params.generator,
            size=column_params.cardinality,
            safe=False,
            type=pa.from_numpy_dtype(column_params.dtype)
            if column_params.dtype is not None else None,
        )
        # Generate data for current column
        return pa.array(
            np.random.choice(vals, size=num_rows),
            mask=np.random.choice(
                [True, False],
                size=num_rows,
                p=[
                    column_params.null_frequency,
                    1 - column_params.null_frequency,
                ],
            ) if column_params.null_frequency > 0.0 else None,
            size=num_rows,
            safe=False,
            type=pa.from_numpy_dtype(column_params.dtype)
            if column_params.dtype is not None else None,
        )

    else:
        # Generate data for current column
        return pa.array(
            column_params.generator,
            mask=np.random.choice(
                [True, False],
                size=num_rows,
                p=[
                    column_params.null_frequency,
                    1 - column_params.null_frequency,
                ],
            ) if column_params.null_frequency > 0.0 else None,
            size=num_rows,
            safe=False,
        )
Example 9
    def __init__(self, data, dtype=None):
        super(GeometryFixed, self).__init__(data)
        self.numpy_dtype = np.dtype(dtype)
        self.pyarrow_type = pa.from_numpy_dtype(dtype)
Example 10
    def infer_schema(self, data):
        """
        Infer a schema for a given data input. The schema can be used for testing with a schema validator.
        This function currently supports DataFrame, NumPy, dictionary, list and basic Python types.::

            data = pandas.DataFrame(...)
            schema = infer_schema(data)

        This function returns None if it cannot infer the schema.
        """
        schema = None

        if data is None:
            schema = pa.null()
        elif isinstance(data, dict):
            schema = {'type': dict, 'fields': {}}

            for key, value in data.items():
                schema['fields'][key] = self.infer_schema(value)
        elif isinstance(data, pd.DataFrame):
            schema = {'type': pd.DataFrame, 'fields': {}}

            # sample the table to get the schema
            pa_schema = pa.Table.from_pandas(data[:_SAMPLE_SIZE],
                                             preserve_index=False).schema
            for i, name in enumerate(pa_schema.names):
                schema['fields'][name] = pa_schema.types[i]
        elif isinstance(data, pd.Series):
            schema = {
                'type': pd.Series,
                'item': pa.Array.from_pandas(data).type,
            }
        elif isinstance(data, np.ndarray):
            # dtype.num == 17 is NumPy's object dtype, which from_numpy_dtype
            # cannot map; fall back to a string type in that case
            pa_type = pa.from_numpy_dtype(
                data.dtype) if data.dtype.num != 17 else pa.string()

            if len(data.shape) == 1:  # 1d array
                schema = {
                    'type': np.ndarray,
                    'item': pa_type,
                }
            else:
                shape = [
                    v if i != 0 else None for i, v in enumerate(data.shape)
                ]
                schema = {
                    'type': np.ndarray,
                    'item': pa_type,
                    'shape': tuple(shape),
                }
        elif isinstance(data, pa.Table):
            schema = data.schema
        elif isinstance(data, (list, tuple)) and len(data) > 0:
            # try to infer type of the data
            current_type = self.infer_schema(data[0])
            for i in range(1, min(len(data), _SAMPLE_SIZE)):
                new_type = self.infer_schema(data[i])

                if new_type != current_type:
                    current_type = None
                    break

            # does not support multiple types yet
            if current_type:
                if isinstance(current_type, pa.DataType):
                    schema = pa.list_(current_type)
                else:
                    schema = {'type': list, 'item': current_type}
        elif type(data) in _python_mapping:
            schema = _python_mapping[type(data)]()
        else:
            return {'type': type(data)}

        return schema
Example 11
    def arrow_type(self):
        if isinstance(self._value_type, ArrowDtype):
            arrow_subdtype = self._value_type.arrow_type
        else:
            arrow_subdtype = pa.from_numpy_dtype(self._value_type)
        return pa.list_(arrow_subdtype)
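
A minimal illustration of the same construction with a plain NumPy value type (int64 chosen arbitrarily) instead of an ArrowDtype:

import numpy as np
import pyarrow as pa

# list<int64> built from a NumPy value dtype
list_type = pa.list_(pa.from_numpy_dtype(np.dtype('int64')))
assert list_type == pa.list_(pa.int64())
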
Example 12
    def to_arrow(self):

        return ArrowIntervalType(pa.from_numpy_dtype(self.subtype),
                                 self.closed)
Example 13
def pandas_read_csv(
    filepath_or_buffer,
    sep=',',
    delimiter=None,
    # Column and Index Locations and Names
    header="infer",
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    prefix=None,
    mangle_dupe_cols=True,
    # General Parsing Configuration
    dtype=None,
    engine=None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace=False,
    skiprows=None,
    skipfooter=0,
    nrows=None,
    # NA and Missing Data Handling
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    verbose=False,
    skip_blank_lines=True,
    # Datetime Handling
    parse_dates=False,
    infer_datetime_format=False,
    keep_date_col=False,
    date_parser=None,
    dayfirst=False,
    cache_dates=True,
    # Iteration
    iterator=False,
    chunksize=None,
    # Quoting, Compression, and File Format
    compression="infer",
    thousands=None,
    decimal=b".",
    lineterminator=None,
    quotechar='"',
    # quoting=csv.QUOTE_MINIMAL,  # not supported
    doublequote=True,
    escapechar=None,
    comment=None,
    encoding=None,
    dialect=None,
    # Error Handling
    error_bad_lines=True,
    warn_bad_lines=True,
    # Internal
    delim_whitespace=False,
    # low_memory=_c_parser_defaults["low_memory"],  # not supported
    memory_map=False,
    float_precision=None,
):
    """Implements pandas.read_csv via pyarrow.csv.read_csv.
    This function has the same interface as pandas.read_csv.
    """

    if delimiter is None:
        delimiter = sep

    autogenerate_column_names = bool(names)

    include_columns = None

    if usecols:
        if type(usecols[0]) == str:
            if names:
                include_columns = [f'f{names.index(col)}' for col in usecols]
            else:
                include_columns = usecols
        elif type(usecols[0]) == int:
            include_columns = [f'f{i}' for i in usecols]
        else:
            # usecols should be all str or int
            assert False

    # try:
    #     keys = [k for k, v in dtype.items() if isinstance(v, pd.CategoricalDtype)]
    #     if keys:
    #         for k in keys:
    #             del dtype[k]
    #         names_list = list(names)
    #         categories = [f"f{names_list.index(k)}" for k in keys]
    # except: pass

    categories = []

    if dtype:
        if names:
            names_list = list(names)
            if isinstance(dtype, dict):
                column_types = {}
                for k, v in dtype.items():
                    column_name = "f{}".format(names_list.index(k))
                    if isinstance(v, pd.CategoricalDtype):
                        categories.append(column_name)
                        column_type = pyarrow.string()
                    else:
                        column_type = pyarrow.from_numpy_dtype(v)
                    column_types[column_name] = column_type
            else:
                pa_dtype = pyarrow.from_numpy_dtype(dtype)
                column_types = {
                    f"f{names_list.index(k)}": pa_dtype
                    for k in names
                }
        elif usecols:
            if isinstance(dtype, dict):
                column_types = {
                    k: pyarrow.from_numpy_dtype(v)
                    for k, v in dtype.items()
                }
            else:
                column_types = {
                    k: pyarrow.from_numpy_dtype(dtype)
                    for k in usecols
                }
        else:
            if isinstance(dtype, dict):
                column_types = {
                    k: pyarrow.from_numpy_dtype(v)
                    for k, v in dtype.items()
                }
            else:
                column_types = pyarrow.from_numpy_dtype(dtype)
    else:
        column_types = None

    try:
        for column in parse_dates:
            name = f"f{column}"
            # TODO: Try to help pyarrow infer date type - set DateType.
            # dtype[name] = pyarrow.from_numpy_dtype(np.datetime64) # string
            del column_types[name]
    except:
        pass

    parse_options = pyarrow.csv.ParseOptions(delimiter=delimiter, )

    read_options = pyarrow.csv.ReadOptions(
        skip_rows=skiprows,
        # column_names=column_names,
        autogenerate_column_names=autogenerate_column_names,
    )

    convert_options = pyarrow.csv.ConvertOptions(
        column_types=column_types,
        strings_can_be_null=True,
        include_columns=include_columns,
    )

    table = pyarrow.csv.read_csv(
        filepath_or_buffer,
        read_options=read_options,
        parse_options=parse_options,
        convert_options=convert_options,
    )

    dataframe = table.to_pandas(
        # categories=categories or None,
    )

    if names:
        if usecols and len(names) != len(usecols):
            if isinstance(usecols[0], int):
                dataframe.columns = [names[col] for col in usecols]
            elif isinstance(usecols[0], str):
                dataframe.columns = [name for name in names if name in usecols]
        else:
            dataframe.columns = names

    # fix when PyArrow will support predicted categories
    if isinstance(dtype, dict):
        for column_name, column_type in dtype.items():
            if isinstance(column_type, pd.CategoricalDtype):
                dataframe[column_name] = dataframe[column_name].astype(
                    column_type)

    return dataframe
Example 14
def csv_reader_get_pyarrow_convert_options(names, usecols, dtype, parse_dates):

    include_columns = None  # default value (include all CSV columns)

    # if names is not given then column names will be taken from the first row of the CSV file,
    # otherwise pyarrow's autogenerated column names will be used (see ReadOptions), so
    # map pandas usecols to pyarrow include_columns accordingly
    if usecols:
        if type(usecols[0]) == str:
            if names:
                include_columns = [f'f{names.index(col)}' for col in usecols]
            else:
                include_columns = usecols  # no autogenerated names
        elif type(usecols[0]) == int:
            include_columns = [f'f{i}' for i in usecols]
        else:
            assert False, f"Failed building pyarrow ConvertOptions due to usecols param value: {usecols}"

    if dtype:
        # the pandas read_csv dtype argument maps to the pyarrow column_types dict, but column names
        # must match those read from the CSV (if names is None) or pyarrow's generated names (otherwise)
        if isinstance(dtype, dict):
            if names:
                names_list = list(names)
                column_types = {}
                for k, v in dtype.items():
                    # TO-DO: check this is aligned with include_columns
                    column_name = "f{}".format(names_list.index(k))
                    if isinstance(v, pd.CategoricalDtype):
                        column_type = pa.string()
                    else:
                        column_type = pa.from_numpy_dtype(v)
                    column_types[column_name] = column_type

            else:
                column_types = {k: pa.from_numpy_dtype(v) for k, v in dtype.items()}

        else:  # single dtype for all columns
            pa_dtype = pa.from_numpy_dtype(dtype)
            if names:
                names_list = list(names)
                column_types = {f"f{names_list.index(k)}": pa_dtype for k in names}
            elif usecols:
                column_types = dict.fromkeys(usecols, pa_dtype)
            else:
                column_types = pa_dtype
    else:
        column_types = None

    # TO-DO: support all possible parse_dates values (now only list of column positions is supported)
    try:
        for column in parse_dates:
            name = f"f{column}"
            # starting from pyarrow=3.0.0 strings are parsed to DateType (converted back to 'object'
            # when using to_pandas), but not TimestampType (that is used to represent np.datetime64)
            # see: pyarrow.from_numpy_dtype(np.datetime64('NaT', 's'))
            # so make pyarrow infer needed type manually
            column_types[name] = pa.timestamp('s')
    except (KeyError, TypeError):
        pass

    convert_options = csv.ConvertOptions(
        column_types=column_types,
        strings_can_be_null=True,
        include_columns=include_columns,
    )
    return convert_options
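
A sketch of how the resulting options would typically be paired with pyarrow's CSV reader (hypothetical path and column spec; the 'f0', 'f1', ... names must be autogenerated so that the keys built above line up):

import numpy as np
from pyarrow import csv

convert_options = csv_reader_get_pyarrow_convert_options(
    names=['a', 'b'], usecols=None, dtype={'a': np.int64}, parse_dates=False)

table = csv.read_csv(
    'data.csv',  # hypothetical path
    read_options=csv.ReadOptions(autogenerate_column_names=True),
    convert_options=convert_options,
)
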
Example 15
    def type(self):
        return pyarrow.from_numpy_dtype(self.dtype)
Example 16
    def __init__(self, array, dtype=None):
        def invalid_array():
            err_msg = (
                "Invalid array with type {typ}\n"
                "A {cls} may be constructed from:\n"
                "    - A 1-d array with length divisible by {n} of interleaved\n"
                "      x y coordinates\n"
                "    - A tuple of {n} 1-d arrays\n"
                "    - A pyarrow.FixedSizeBinaryArray. In this case the dtype\n"
                "      argument must also be specified").format(
                    typ=type(array),
                    cls=self.__class__.__name__,
                    n=self._element_len,
                )
            raise ValueError(err_msg)

        if isinstance(dtype, GeometryDtype):
            dtype = dtype.subtype

        numpy_dtype = None
        pa_type = None
        if isinstance(array, (pa.Array, pa.ChunkedArray)):
            if dtype is None:
                invalid_array()
            numpy_dtype = np.dtype(dtype)
        elif isinstance(array, tuple):
            if len(array) == self._element_len:
                array = [np.asarray(array[i]) for i in range(len(array))]
                if dtype:
                    array = [array[i].astype(dtype) for i in range(len(array))]

                # Capture numpy dtype
                numpy_dtype = array[0].dtype

                # Create buffer and FixedSizeBinaryArray
                pa_type = pa.binary(
                    self._element_len *
                    pa.from_numpy_dtype(numpy_dtype).bit_width // 8)
                buffers = [
                    None,
                    pa.py_buffer(np.stack(array, axis=1).tobytes())
                ]
                array = pa.Array.from_buffers(pa_type,
                                              len(array[0]),
                                              buffers=buffers)
            else:
                invalid_array()
        else:
            array = np.asarray(array)
            if array.dtype.kind == 'O':
                if array.ndim != 1:
                    invalid_array()

                # Try to infer dtype
                if dtype is None:
                    for i in range(len(array)):
                        el = array[i]
                        if el is None:
                            continue
                        if isinstance(el, GeometryFixed):
                            numpy_dtype = el.numpy_dtype
                        else:
                            el_array = np.asarray(el)
                            numpy_dtype = el_array.dtype
                        break
                    if numpy_dtype is None:
                        invalid_array()
                else:
                    numpy_dtype = dtype

                # Explicitly set the pyarrow binary type
                pa_type = pa.binary(
                    self._element_len *
                    pa.from_numpy_dtype(numpy_dtype).bit_width // 8)

                # Convert individual rows to bytes
                array = array.copy()
                for i in range(len(array)):
                    el = array[i]
                    if el is None:
                        continue
                    if isinstance(el, bytes):
                        # Nothing to do
                        pass
                    elif isinstance(el, GeometryFixed):
                        array[i] = el.flat_values.tobytes()
                    else:
                        array[i] = np.asarray(el, dtype=numpy_dtype).tobytes()
            else:
                if dtype:
                    array = array.astype(dtype)

                # Capture numpy dtype
                numpy_dtype = array.dtype
                pa_type = pa.binary(
                    self._element_len *
                    pa.from_numpy_dtype(numpy_dtype).bit_width // 8)

                if array.ndim == 2:
                    # Handle 2d array case
                    if array.shape[1] != self._element_len:
                        invalid_array()
                    # Create buffer and FixedSizeBinaryArray
                    buffers = [None, pa.py_buffer(array.tobytes())]
                    array = pa.Array.from_buffers(pa_type,
                                                  array.shape[0],
                                                  buffers=buffers)
                elif array.ndim == 1 and len(array) % self._element_len == 0:
                    buffers = [None, pa.py_buffer(array.tobytes())]
                    array = pa.Array.from_buffers(pa_type,
                                                  len(array) //
                                                  self._element_len,
                                                  buffers=buffers)
                else:
                    invalid_array()

        self._numpy_dtype = numpy_dtype
        super().__init__(array, pa_type)
Example 17
    def from_numpy(obj, batch_size=None):
        """
        Convert a list of numpy.ndarrays with equal shapes or as single
        numpy.ndarray with outer-dim as batch size to a pyarrow.Array
        """
        if isinstance(obj, (list, tuple)):
            if batch_size is not None:

                def list_gen():
                    for i in range(0, len(obj), batch_size):
                        slc = obj[i:i + batch_size]
                        yield ArrowTensorArray.from_numpy(slc, batch_size=None)

                return list_gen()
            elif np.isscalar(obj[0]):
                return pa.array(obj)
            elif isinstance(obj[0], np.ndarray):
                # continue with batched ndarray
                obj = np.stack(obj, axis=0)

        if isinstance(obj, dict):
            names = list(obj.keys())
            arrs = [
                ArrowTensorArray.from_numpy(obj[k], batch_size=batch_size)
                for k in names
            ]
            batch = pa.RecordBatch.from_arrays(arrs, names)
            return pa.Table.from_batches([batch])

        elif isinstance(obj, np.ndarray):
            # currently require contiguous ndarray
            if not obj.flags.c_contiguous:
                obj = np.ascontiguousarray(obj)
            pa_dtype = pa.from_numpy_dtype(obj.dtype)
            batch_size = obj.shape[0]
            element_shape = obj.shape[1:]
            total_num_elements = obj.size
            num_elements = 1 if len(obj.shape) == 1 else np.prod(element_shape)

            child_buf = pa.py_buffer(obj)
            child_array = pa.Array.from_buffers(pa_dtype, total_num_elements,
                                                [None, child_buf])

            offset_buf = pa.py_buffer(
                np.int32([i * num_elements for i in range(batch_size + 1)]))

            storage = pa.Array.from_buffers(pa.list_(pa_dtype),
                                            batch_size, [None, offset_buf],
                                            children=[child_array])

            typ = ArrowTensorType(element_shape, pa_dtype)
            return pa.ExtensionArray.from_storage(typ, storage)

        elif np.isscalar(obj):
            return pa.array([obj])

        else:

            def iter_gen():
                if batch_size is None:
                    for d in obj:
                        yield ArrowTensorArray.from_numpy(
                            d, batch_size=batch_size)
                else:
                    batch = []
                    for o in obj:
                        batch.append(o)
                        if len(batch) == batch_size:
                            # merge dict
                            if isinstance(batch[0], dict):
                                d = {k: [v] for k, v in batch[0].items()}
                                for i in range(1, len(batch)):
                                    for k, v in batch[i].items():
                                        d[k].append(v)
                                for k in d.keys():
                                    d[k] = np.stack(d[k], axis=0)
                                batch = d
                            yield ArrowTensorArray.from_numpy(batch,
                                                              batch_size=None)
                            batch = []

            return iter_gen()
Example 18
def get_dataframe(parameters, use_threads):
    # Initialize seeds
    if parameters.seed is not None:
        np.random.seed(parameters.seed)

    # For each column, use a generic Mimesis producer to create an Iterable
    # for generating data
    for i, column_params in enumerate(parameters.column_parameters):
        if column_params.dtype is None:
            column_params.generator = column_params.generator(
                Generic("en", seed=parameters.seed))
        else:
            column_params.generator = column_params.generator()

    # Get schema for each column
    table_fields = []
    for i, column_params in enumerate(parameters.column_parameters):
        if (isinstance(column_params.dtype, str)
                and column_params.dtype == "category"):
            arrow_type = pa.dictionary(
                index_type=pa.int64(),
                value_type=pa.from_numpy_dtype(
                    type(next(iter(column_params.generator)))),
            )
        elif hasattr(column_params.dtype, "to_arrow"):
            arrow_type = column_params.dtype.to_arrow()
        else:
            arrow_type = pa.from_numpy_dtype(
                type(next(iter(column_params.generator))
                     ) if column_params.dtype is None else column_params.dtype)
        table_fields.append(
            pa.field(
                name=str(i),
                type=arrow_type,
                nullable=column_params.null_frequency > 0,
            ))

    schema = pa.schema(table_fields)

    # Initialize column data and which columns should be sorted
    column_data = [None] * len(parameters.column_parameters)
    columns_to_sort = [
        str(i) for i, column_params in enumerate(parameters.column_parameters)
        if column_params.is_sorted
    ]
    # Generate data
    if not use_threads:
        for i, column_params in enumerate(parameters.column_parameters):
            column_data[i] = _generate_column(column_params,
                                              parameters.num_rows)
    else:
        pool = Pool(pa.cpu_count())
        column_data = pool.starmap(
            _generate_column,
            [(column_params, parameters.num_rows)
             for i, column_params in enumerate(parameters.column_parameters)],
        )
        pool.close()
        pool.join()
    # Convert to Pandas DataFrame and sort columns appropriately
    tbl = pa.Table.from_arrays(
        column_data,
        schema=schema,
    )
    if columns_to_sort:
        tbl = tbl.to_pandas()
        tbl = tbl.sort_values(columns_to_sort)
        tbl = pa.Table.from_pandas(tbl, schema)
    return tbl
Example 19
def pandas_read_csv(
    filepath_or_buffer,
    sep=",",
    # Column and Index Locations and Names
    names=None,
    usecols=None,
    # General Parsing Configuration
    dtype=None,
    skiprows=None,
    # Datetime Handling
    parse_dates=False,
):
    """Implements pandas.read_csv via pyarrow.csv.read_csv.
    This function has the same interface as pandas.read_csv.
    """

    # Fallback to pandas
    need_categorical = isinstance(dtype, pd.CategoricalDtype)
    try:
        need_categorical |= any(isinstance(v, pd.CategoricalDtype) for v in dtype.values())
    except: pass

    if need_categorical:
        return pd.read_csv(
            filepath_or_buffer,
            sep=sep,
            names=names,
            usecols=usecols,
            dtype=dtype,
            skiprows=skiprows,
            parse_dates=parse_dates
        )

    autogenerate_column_names = bool(names)

    include_columns = None

    # categories = None

    if usecols is not None:
        include_columns = [f'f{i}' for i in usecols]

    read_options = pyarrow.csv.ReadOptions(
        skip_rows=skiprows,
        # column_names=column_names,
        autogenerate_column_names=autogenerate_column_names,
    )

    parse_options = pyarrow.csv.ParseOptions(
        delimiter=sep,
    )

    # try:
    #     keys = [k for k, v in dtype.items() if isinstance(v, pd.CategoricalDtype)]
    #     if keys:
    #         for k in keys:
    #             del dtype[k]
    #         names_list = list(names)
    #         categories = [f"f{names_list.index(k)}" for k in keys]
    # except: pass

    if dtype is not None:
        names_list = list(names)
        if not hasattr(dtype, 'items'):
            dtype = { f"f{names_list.index(k)}": pyarrow.from_numpy_dtype(dtype) for k in names }
        else:
            dtype = { f"f{names_list.index(k)}": pyarrow.from_numpy_dtype(v) for k, v in dtype.items() }

    try:
        for column in parse_dates:
            name = f"f{column}"
            # TODO: Try to help pyarrow infer date type - set DateType.
            # dtype[name] = pyarrow.from_numpy_dtype(np.datetime64) # string
            del dtype[name]
    except: pass

    convert_options = pyarrow.csv.ConvertOptions(
        column_types=dtype,
        strings_can_be_null=True,
        include_columns=include_columns,
    )

    table = pyarrow.csv.read_csv(
        filepath_or_buffer,
        read_options=read_options,
        parse_options=parse_options,
        convert_options=convert_options,
    )

    dataframe = table.to_pandas(
        # categories=categories,
    )

    if names is not None:
        dataframe.columns = names

    return dataframe
Example 20
def generate(
    path,
    parameters,
    format={
        "name": "parquet",
        "row_group_size": 64
    },
    use_threads=True,
):
    """
    Generate dataset using given parameters and write to given format

    Parameters
    ----------
    path : str or file-like object
        Path to write to
    parameters : Parameters
        Parameters specifying how to randomly generate data
    format : Dict
        Format to write
    use_threads : bool
        Whether to generate columns in parallel
    """

    # Initialize seeds
    if parameters.seed is not None:
        np.random.seed(parameters.seed)
    column_seeds = np.arange(len(parameters.column_parameters))
    np.random.shuffle(column_seeds)

    # For each column, use a generic Mimesis producer to create an Iterable
    # for generating data
    for i, column_params in enumerate(parameters.column_parameters):
        column_params.generator = column_params.generator(
            Generic("en", seed=column_seeds[i]))

    # Get schema for each column
    schema = pa.schema([
        pa.field(
            name=str(i),
            type=pa.from_numpy_dtype(type(next(iter(
                column_params.generator)))),
            nullable=column_params.null_frequency > 0,
        ) for i, column_params in enumerate(parameters.column_parameters)
    ])

    # Initialize column data and which columns should be sorted
    column_data = [None] * len(parameters.column_parameters)
    columns_to_sort = [
        str(i) for i, column_params in enumerate(parameters.column_parameters)
        if column_params.is_sorted
    ]

    # Generate data
    if not use_threads:
        for i, column_params in enumerate(parameters.column_parameters):
            column_data[i] = _generate_column(column_params,
                                              parameters.num_rows)
    else:
        pool = Pool(pa.cpu_count())
        column_data = pool.starmap(
            _generate_column,
            [(column_params, parameters.num_rows)
             for i, column_params in enumerate(parameters.column_parameters)],
        )
        pool.close()
        pool.join()

    # Convert to Pandas DataFrame and sort columns appropriately
    tbl = pa.Table.from_arrays(
        column_data,
        schema=schema,
    )
    if columns_to_sort:
        tbl = tbl.to_pandas()
        tbl = tbl.sort_values(columns_to_sort)
        tbl = pa.Table.from_pandas(tbl, schema)

    # Write
    _write(tbl, path, format)