Example #1
def _query_table(pa_table: pa.Table, key: Union[int, slice, range, str,
                                                Iterable]) -> pa.Table:
    """
    Query a pyarrow Table to extract the subtable that corresponds to the given key.
    """
    if isinstance(key, int):
        return pa_table.slice(key % pa_table.num_rows, 1)
    if isinstance(key, slice):
        key = range(*key.indices(pa_table.num_rows))
    if isinstance(key, range):
        if _is_range_contiguous(key) and key.start >= 0:
            return pa_table.slice(key.start, key.stop - key.start)
        else:
            pass  # treat as an iterable
    if isinstance(key, str):
        return pa_table.drop([column for column in pa_table.column_names
                              if column != key])
    if isinstance(key, Iterable):
        if len(key) == 0:
            return pa_table.slice(0, 0)
        # don't use pyarrow.Table.take even for pyarrow >=1.0 (see https://issues.apache.org/jira/browse/ARROW-9773)
        return pa.concat_tables(
            pa_table.slice(int(i) % pa_table.num_rows, 1) for i in key)

    _raise_bad_key_type(key)
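A minimal usage sketch (assuming _query_table and the module-level imports and helpers it relies on are in scope); each key type exercises one of the branches above:

import pyarrow as pa

table = pa.table({"id": [0, 1, 2, 3], "text": ["a", "b", "c", "d"]})

_query_table(table, 1)          # int key -> 1-row table (row 1)
_query_table(table, "text")     # str key -> single-column table
_query_table(table, [3, 0, 0])  # iterable key -> rows 3, 0, 0 concatenated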
Example #2
def row_iter(table: pyarrow.Table):
    """Iterator row over row."""
    # pylint: disable=invalid-name
    Row = collections.namedtuple("Row", table.column_names)
    for index in range(table.num_rows):
        row = table.slice(index, 1)
        obj = Row(*(col[0].as_py() for col in row.itercolumns()))
        yield obj
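A short usage sketch for row_iter (assuming the snippet's module imports collections and pyarrow as shown):

import collections
import pyarrow

table = pyarrow.table({"name": ["ada", "bob"], "age": [36, 41]})
for row in row_iter(table):
    # each row is a namedtuple, e.g. Row(name='ada', age=36)
    print(row.name, row.age)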
Example #3
def slice_table(table: pa.Table, max_len: Optional[int]) -> List[pa.Table]:
    """
    Iteratively create 0-copy table slices.
    """
    if max_len is None:
        return [table]
    tables = []
    offset = 0
    records_remaining = len(table)
    while records_remaining > 0:
        records_this_entry = min(max_len, records_remaining)
        tables.append(table.slice(offset, records_this_entry))
        records_remaining -= records_this_entry
        offset += records_this_entry
    return tables
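A hedged usage sketch for slice_table; every returned slice shares the parent table's buffers:

import pyarrow as pa

table = pa.table({"x": list(range(10))})
chunks = slice_table(table, max_len=4)
print([len(t) for t in chunks])  # [4, 4, 2]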
Example #4
def _write_chunk(
    file_path: str,
    boto3_session: Optional[boto3.Session],
    s3_additional_kwargs: Optional[Dict[str, str]],
    compression: Optional[str],
    table: pa.Table,
    offset: int,
    chunk_size: int,
) -> List[str]:
    fs = _get_fs(boto3_session=boto3_session,
                 s3_additional_kwargs=s3_additional_kwargs)
    with _new_writer(file_path=file_path,
                     fs=fs,
                     compression=compression,
                     schema=table.schema) as writer:
        writer.write_table(table.slice(offset, chunk_size))
    return [file_path]
Example #5
def _write_chunk(
    file_path: str,
    boto3_session: Optional[boto3.Session],
    s3_additional_kwargs: Optional[Dict[str, str]],
    compression: Optional[str],
    table: pa.Table,
    offset: int,
    chunk_size: int,
    use_threads: bool,
) -> List[str]:
    with _new_writer(
            file_path=file_path,
            compression=compression,
            schema=table.schema,
            boto3_session=boto3_session,
            s3_additional_kwargs=s3_additional_kwargs,
            use_threads=use_threads,
    ) as writer:
        writer.write_table(table.slice(offset, chunk_size))
    return [file_path]
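The two _write_chunk variants above rely on internal S3 helpers (_get_fs, _new_writer). A minimal local-filesystem sketch of the same slice-and-write pattern, using only pyarrow.parquet:

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"x": list(range(100))})
# write rows [0, 50) of the table as their own Parquet file
with pq.ParquetWriter("chunk_0.parquet", table.schema, compression="snappy") as writer:
    writer.write_table(table.slice(0, 50))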
Example #6
def add_data_points_to_df(
    data_table: pa.Table,
    start_index: int,
    sample_interval_micros: float,
    num_samples_to_add: int = 1,
    point_creation_mode: DataPointCreationMode = DataPointCreationMode.COPY,
) -> pa.Table:
    """
    adds data points to the end of the table, starting from the index specified.
        Note:
            * table must not be empty
            * start_index must be non-negative and less than the length of table
            * num_samples_to_add must be greater than 0
            * sample_interval_micros cannot be 0
            * points are added onto the end and the result is not sorted
        Options for point_creation_mode are:
            * NAN: default values and nans
            * COPY: copies of the start data point
            * INTERPOLATE: interpolated values between start data point and adjacent point

    :param data_table: pyarrow table to add dataless timestamps to
    :param start_index: index of the table to use as starting point for creating new values
    :param sample_interval_micros: sample interval in microseconds of the timestamps; use negative values to
                                    add points before the start_index
    :param num_samples_to_add: the number of timestamps to create, default 1
    :param point_creation_mode: the mode of point creation to use
    :return: updated table with synthetic data points
    """
    if len(data_table) > start_index and len(data_table) > 0 and num_samples_to_add > 0 \
            and sample_interval_micros != 0.:
        start_timestamp = data_table["timestamps"][start_index].as_py()
        # create timestamps for every point that needs to be added
        new_timestamps = start_timestamp + np.arange(
            1, num_samples_to_add + 1) * sample_interval_micros
        if point_creation_mode == DataPointCreationMode.COPY:
            # copy the start point once per new timestamp
            copy_row = data_table.slice(start_index, 1).to_pydict()
            new_dict: Dict[str, List] = {k: [] for k in copy_row.keys()}
            for t in new_timestamps:
                copy_row["timestamps"] = [t]
                for k in copy_row.keys():
                    new_dict[k].append(copy_row[k][0])
            empty_df = pa.Table.from_pydict(new_dict)
        elif point_creation_mode == DataPointCreationMode.INTERPOLATE:
            # use the start point and the next point as the edges for interpolation
            start_point = data_table.slice(start_index, 1).to_pydict()
            end_point = data_table.slice(
                start_index + (1 if sample_interval_micros > 0 else -1),
                1).to_pydict()
            numeric_cols = [
                col for col in data_table.schema.names
                if col not in NON_INTERPOLATED_COLUMNS + NON_NUMERIC_COLUMNS
            ]
            non_numeric_cols = [
                col for col in data_table.schema.names
                if col in NON_NUMERIC_COLUMNS
            ]
            # non-numeric values come from whichever edge point is closest in time
            if np.abs(start_point["timestamps"][0] - new_timestamps[0]) \
                    <= np.abs(end_point["timestamps"][0] - new_timestamps[0]):
                non_numeric_source = start_point
            else:
                non_numeric_source = end_point
            # linear interpolation: start + slope * (new_timestamp - start_timestamp)
            time_diff = end_point["timestamps"][0] - start_point["timestamps"][0]
            interpolated: Dict[str, List] = {}
            for col in numeric_cols:
                slope = (end_point[col][0] - start_point[col][0]) / time_diff
                interpolated[col] = (
                    start_point[col][0] +
                    slope * (new_timestamps - start_point["timestamps"][0])
                ).tolist()
            for col in non_numeric_cols:
                interpolated[col] = [
                    non_numeric_source[col][0] for _ in new_timestamps
                ]
            empty_df = pa.Table.from_pydict(interpolated)
        else:
            # add nans and defaults
            empty_dict: Dict[str, List] = {}
            for k in data_table.schema.names:
                empty_dict[k] = []
            for column_index in data_table.schema.names:
                if column_index == "timestamps":
                    empty_dict[column_index] = new_timestamps
                elif column_index == "location_provider":
                    empty_dict[column_index] = [
                        LocationProvider["UNKNOWN"].value
                        for i in range(num_samples_to_add)
                    ]
                elif column_index == "image_codec":
                    empty_dict[column_index] = [
                        ImageCodec["UNKNOWN"].value
                        for i in range(num_samples_to_add)
                    ]
                elif column_index == "audio_codec":
                    empty_dict[column_index] = [
                        AudioCodec["UNKNOWN"].value
                        for i in range(num_samples_to_add)
                    ]
                elif column_index == "network_type":
                    empty_dict[column_index] = [
                        NetworkType["UNKNOWN_NETWORK"].value
                        for i in range(num_samples_to_add)
                    ]
                elif column_index == "power_state":
                    empty_dict[column_index] = [
                        PowerState["UNKNOWN_POWER_STATE"].value
                        for i in range(num_samples_to_add)
                    ]
                elif column_index == "cell_service":
                    empty_dict[column_index] = [
                        CellServiceState["UNKNOWN"].value
                        for i in range(num_samples_to_add)
                    ]
                elif column_index == "wifi_wake_lock":
                    empty_dict[column_index] = [
                        WifiWakeLock["NONE"].value
                        for i in range(num_samples_to_add)
                    ]
                elif column_index == "screen_state":
                    empty_dict[column_index] = [
                        ScreenState["UNKNOWN_SCREEN_STATE"].value
                        for i in range(num_samples_to_add)
                    ]
                else:
                    empty_dict[column_index] = np.full(num_samples_to_add,
                                                       np.nan).tolist()
            empty_df = pa.Table.from_pydict(empty_dict)
        data_table = pa.concat_tables([data_table, empty_df])

    return data_table
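A hedged usage sketch for add_data_points_to_df, assuming DataPointCreationMode and the enums referenced above come from the surrounding package:

import pyarrow as pa

data = pa.table({"timestamps": [0.0, 1000.0, 2000.0], "value": [1.0, 2.0, 3.0]})
padded = add_data_points_to_df(
    data,
    start_index=2,
    sample_interval_micros=1000.0,
    num_samples_to_add=3,
    point_creation_mode=DataPointCreationMode.NAN,
)
# padded gains three rows with timestamps 3000, 4000, 5000 and NaN values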
Example #7
    def extract_row(self, pa_table: pa.Table) -> pd.DataFrame:
        return pa_table.slice(length=1).to_pandas(types_mapper=pandas_types_mapper)
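For context, a standalone sketch of what the method does (pandas_types_mapper is assumed to be defined elsewhere in the enclosing library; the default type mapping is used here):

import pyarrow as pa

t = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})
first_row = t.slice(length=1).to_pandas()  # 1-row DataFrame with the first row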
Example #8
def make_sorted_groups(sorting_table: pa.Table,
                       input_table: pa.Table) -> SortedGroups:
    if not sorting_table.num_columns:
        # Exactly one output group, even for empty-table input
        return SortedGroups(
            sorted_groups=pa.table({
                "A": [None]
            }).select([]),  # 1-row, 0-col table
            sorted_input_table=
            input_table,  # everything is one group (maybe 0-row)
            group_splits=np.array([], np.int64),
        )

    # pyarrow 3.0.0 can't sort dictionary columns.
    # TODO make sort-dictionary work; nix this conversion
    sorting_table_without_dictionary = pa.table(
        [
            column.cast(pa.utf8())
            if pa.types.is_dictionary(column.type) else column
            for column in sorting_table.columns
        ],
        schema=pa.schema([
            pa.field(field.name, pa.utf8())
            if pa.types.is_dictionary(field.type) else field for field in [
                sorting_table.schema.field(i)
                for i in range(len(sorting_table.schema.names))
            ]
        ]),
    )
    indices = pa.compute.sort_indices(
        sorting_table_without_dictionary,
        sort_keys=[(c, "ascending")
                   for c in sorting_table_without_dictionary.column_names],
    )

    sorted_groups_with_dups_and_nulls = sorting_table.take(indices)
    # Behavior we ought to DEPRECATE: to mimic Pandas, we drop all groups that
    # contain NULL. This is mathematically sound for Pandas' "NA" (because if
    # all these unknown things are the same thing, doesn't that mean we know
    # something about them? -- reductio ad absurdum, QED). But Workbench's NULL
    # is a bit closer to SQL NULL, which means "whatever you say, pal".
    #
    # This null-dropping is for backwards compat. TODO make it optional ... and
    # eventually nix the option and always output NULL groups.
    nonnull_indices = indices.filter(
        find_nonnull_table_mask(sorted_groups_with_dups_and_nulls))

    if input_table.num_columns:
        sorted_input_table = input_table.take(nonnull_indices)
    else:
        # Don't .take() on a zero-column Arrow table: its .num_rows would change
        #
        # All rows are identical, so .slice() gives the table we want
        sorted_input_table = input_table.slice(0, len(nonnull_indices))

    sorted_groups_with_dups = sorting_table.take(nonnull_indices)

    # "is_dup": find each row in sorted_groups_with_dups that is _equal_ to
    # the row before it. (The first value compares the first and second row.)
    #
    # We start assuming all are equal; then we search for inequality
    if len(sorted_groups_with_dups):
        is_dup = pa.array(np.ones(len(sorted_groups_with_dups) - 1),
                          pa.bool_())
        for column in sorted_groups_with_dups.itercolumns():
            chunk = column.chunks[0]
            if pa.types.is_dictionary(chunk.type):
                chunk = chunk.indices
            first = chunk.slice(0, len(column) - 1)
            second = chunk.slice(1)
            # TODO when we support NULL groups:
            # both_null = pa.compute.and_(first.is_null(), second.is_null())
            # both_equal_if_not_null = pa.compute.equal(first, second)
            # both_equal = pa.compute.fill_null(both_equal_if_not_null, False)
            # value_is_dup = pa.compute.or_(both_null, both_equal)
            # ... and for now, it's simply:
            value_is_dup = pa.compute.equal(first, second)
            is_dup = pa.compute.and_(is_dup, value_is_dup)

        group_splits = np.where(~(is_dup.to_numpy(
            zero_copy_only=False)))[0] + 1

        sorted_groups = reencode_dictionaries(
            sorted_groups_with_dups.take(np.insert(group_splits, 0, 0)))
    else:
        sorted_groups = sorted_groups_with_dups
        group_splits = np.array([], np.int64)

    return SortedGroups(
        sorted_groups=sorted_groups,
        sorted_input_table=sorted_input_table,
        group_splits=group_splits,
    )
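A standalone sketch of the core trick in make_sorted_groups, without the Workbench-specific helpers: sort by the grouping column, then find the boundaries where consecutive rows differ:

import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

keys = pa.table({"g": ["b", "a", "b", "a", "c"]})
indices = pc.sort_indices(keys, sort_keys=[("g", "ascending")])
sorted_keys = keys.take(indices)
col = sorted_keys.column("g").chunks[0]  # demo table has a single chunk
# compare each row with the one before it
is_dup = pc.equal(col.slice(0, len(col) - 1), col.slice(1))
group_splits = np.where(~is_dup.to_numpy(zero_copy_only=False))[0] + 1
# group_splits marks where each new group starts in the sorted order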
Example #9
def _postprocess_name_columns(
        table: pyarrow.Table,
        has_header: bool) -> Tuple[pyarrow.Table, List[ParseCsvWarning]]:
    """
    Return `table`, with final column names but still String values.
    """
    warnings = []
    if has_header and table.num_rows > 0:
        n_ascii_cleaned = 0
        first_ascii_cleaned = None
        n_truncated = 0
        first_truncated = None
        n_numbered = 0
        first_numbered = None

        names = []
        for colname in gen_unique_clean_colnames(
                list(("" if c[0] is pyarrow.NULL else c[0].as_py())
                     for c in table.columns),
                settings=settings,
        ):
            names.append(colname.name)
            if colname.is_ascii_cleaned:
                if n_ascii_cleaned == 0:
                    first_ascii_cleaned = colname.name
                n_ascii_cleaned += 1
            if colname.is_truncated:
                if n_truncated == 0:
                    first_truncated = colname.name
                n_truncated += 1
            if colname.is_numbered:
                if n_numbered == 0:
                    first_numbered = colname.name
                n_numbered += 1
            # Unicode can't be fixed, because we assume valid UTF-8 input
            assert not colname.is_unicode_fixed
            # Stay silent if colname.is_default. Users expect us to
            # auto-generate default column names.

        if n_ascii_cleaned:
            warnings.append(
                ParseCsvWarning.CleanedAsciiColumnNames(
                    n_ascii_cleaned, first_ascii_cleaned))
        if n_truncated:
            warnings.append(
                ParseCsvWarning.TruncatedColumnNames(n_truncated,
                                                     first_truncated))
        if n_numbered:
            warnings.append(
                ParseCsvWarning.NumberedColumnNames(n_numbered,
                                                    first_numbered))

        # Remove header (zero-copy: builds new pa.Table with same backing data)
        table = table.slice(1)
    else:
        names = [f"Column {i + 1}" for i in range(len(table.columns))]

    return (
        pyarrow.table({name: table.column(i)
                       for i, name in enumerate(names)}),
        warnings,
    )
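Finally, a hedged sketch of the header-promotion core of the function above, without the name-cleaning helpers (gen_unique_clean_colnames, settings): the first row becomes the column names and is then sliced off zero-copy:

import pyarrow

raw = pyarrow.table({"0": ["name", "ada", "bob"], "1": ["age", "36", "41"]})
names = [col[0].as_py() for col in raw.columns]
body = raw.slice(1)  # zero-copy: drop the header row
parsed = pyarrow.table({name: body.column(i) for i, name in enumerate(names)})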