def _query_table(pa_table: pa.Table, key: Union[int, slice, range, str, Iterable]) -> pa.Table:
    """
    Query a pyarrow Table to extract the subtable that corresponds to the given key.
    """
    if isinstance(key, int):
        return pa_table.slice(key % pa_table.num_rows, 1)
    if isinstance(key, slice):
        key = range(*key.indices(pa_table.num_rows))
    if isinstance(key, range):
        if _is_range_contiguous(key) and key.start >= 0:
            return pa_table.slice(key.start, key.stop - key.start)
        else:
            pass  # treat as an iterable instead
    if isinstance(key, str):
        return pa_table.drop([column for column in pa_table.column_names if column != key])
    if isinstance(key, Iterable):
        if len(key) == 0:
            return pa_table.slice(0, 0)
        # don't use pyarrow.Table.take even for pyarrow >=1.0
        # (see https://issues.apache.org/jira/browse/ARROW-9773)
        return pa.concat_tables(pa_table.slice(int(i) % pa_table.num_rows, 1) for i in key)

    _raise_bad_key_type(key)
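# Usage sketch for _query_table (assumes the module's _is_range_contiguous and
# _raise_bad_key_type helpers are in scope; behavior follows the branches above):
import pyarrow as pa

tbl = pa.table({"a": [1, 2, 3, 4], "b": ["w", "x", "y", "z"]})
_query_table(tbl, 0)            # first row
_query_table(tbl, -1)           # last row -- negative ints wrap via the modulo
_query_table(tbl, slice(1, 3))  # rows 1-2 as a single zero-copy slice
_query_table(tbl, "a")          # one-column subtable
_query_table(tbl, [0, 3])       # per-row slices concatenated with pa.concat_tables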
def row_iter(table: pyarrow.Table):
    """Iterate over the table one row at a time."""
    # pylint: disable=invalid-name
    Row = collections.namedtuple("Row", table.column_names)
    for index in range(table.num_rows):
        row = table.slice(index, 1)
        obj = Row(*(col[0].as_py() for col in row.itercolumns()))
        yield obj
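# Usage sketch for row_iter (minimal, self-contained):
import collections
import pyarrow

people = pyarrow.table({"name": ["ada", "grace"], "age": [36, 45]})
for person in row_iter(people):
    print(person.name, person.age)  # namedtuple access, one 1-row slice per iteration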
def slice_table(table: pa.Table, max_len: Optional[int]) -> List[pa.Table]:
    """
    Iteratively create 0-copy table slices.
    """
    if max_len is None:
        return [table]
    tables = []
    offset = 0
    records_remaining = len(table)
    while records_remaining > 0:
        records_this_entry = min(max_len, records_remaining)
        tables.append(table.slice(offset, records_this_entry))
        records_remaining -= records_this_entry
        offset += records_this_entry
    return tables
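# Usage sketch for slice_table: ten rows split into zero-copy chunks of at most four.
import pyarrow as pa

big = pa.table({"x": list(range(10))})
parts = slice_table(big, max_len=4)
print([len(t) for t in parts])  # [4, 4, 2] -- each part is a view, not a copy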
def _write_chunk(
    file_path: str,
    boto3_session: Optional[boto3.Session],
    s3_additional_kwargs: Optional[Dict[str, str]],
    compression: Optional[str],
    table: pa.Table,
    offset: int,
    chunk_size: int,
) -> List[str]:
    fs = _get_fs(boto3_session=boto3_session, s3_additional_kwargs=s3_additional_kwargs)
    with _new_writer(file_path=file_path, fs=fs, compression=compression, schema=table.schema) as writer:
        writer.write_table(table.slice(offset, chunk_size))
    return [file_path]
def _write_chunk(
    file_path: str,
    boto3_session: Optional[boto3.Session],
    s3_additional_kwargs: Optional[Dict[str, str]],
    compression: Optional[str],
    table: pa.Table,
    offset: int,
    chunk_size: int,
    use_threads: bool,
) -> List[str]:
    with _new_writer(
        file_path=file_path,
        compression=compression,
        schema=table.schema,
        boto3_session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs,
        use_threads=use_threads,
    ) as writer:
        writer.write_table(table.slice(offset, chunk_size))
    return [file_path]
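# Sketch of how a caller might derive the (offset, chunk_size) pairs fed to the
# two _write_chunk variants above; _chunk_offsets is a hypothetical helper, not
# part of either module.
def _chunk_offsets(num_rows: int, chunk_size: int):
    """Yield (offset, length) pairs covering num_rows in chunk_size steps."""
    for offset in range(0, num_rows, chunk_size):
        yield offset, min(chunk_size, num_rows - offset)

print(list(_chunk_offsets(10, 4)))  # [(0, 4), (4, 4), (8, 2)]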
def add_data_points_to_df(
    data_table: pa.Table,
    start_index: int,
    sample_interval_micros: float,
    num_samples_to_add: int = 1,
    point_creation_mode: DataPointCreationMode = DataPointCreationMode.COPY,
) -> pa.Table:
    """
    Adds data points to the end of the table, starting from the index specified.

    Note:
        * table must not be empty
        * start_index must be non-negative and less than the length of table
        * num_samples_to_add must be greater than 0
        * sample_interval_micros cannot be 0
        * points are added onto the end and the result is not sorted

    Options for point_creation_mode are:
        * NAN: default values and nans
        * COPY: copies of the start data point
        * INTERPOLATE: interpolated values between start data point and adjacent point

    :param data_table: pyarrow table to add dataless timestamps to
    :param start_index: index of the table to use as starting point for creating new values
    :param sample_interval_micros: sample interval in microseconds of the timestamps;
                                   use negative values to add points before the start_index
    :param num_samples_to_add: the number of timestamps to create, default 1
    :param point_creation_mode: the mode of point creation to use
    :return: updated table with synthetic data points
    """
    if len(data_table) > start_index and len(data_table) > 0 and num_samples_to_add > 0 \
            and sample_interval_micros != 0.:
        start_timestamp = data_table["timestamps"][start_index].as_py()
        # create timestamps for every point that needs to be added
        new_timestamps = start_timestamp + np.arange(1, num_samples_to_add + 1) * sample_interval_micros
        if point_creation_mode == DataPointCreationMode.COPY:
            # repeat the start point once per new timestamp, then overwrite the timestamps
            copy_row = data_table.slice(start_index, 1).to_pydict()
            new_rows = {k: v * num_samples_to_add for k, v in copy_row.items()}
            new_rows["timestamps"] = new_timestamps.tolist()
            empty_df = pa.Table.from_pydict(new_rows)
        elif point_creation_mode == DataPointCreationMode.INTERPOLATE:
            # use the start point and the next point as the edges for interpolation;
            # a pandas Series keeps the column-wise arithmetic below simple
            start_point = data_table.slice(start_index, 1).to_pandas().iloc[0]
            numeric_start = start_point[[col for col in data_table.schema.names
                                         if col not in NON_INTERPOLATED_COLUMNS + NON_NUMERIC_COLUMNS]]
            non_numeric_start = start_point[[col for col in data_table.schema.names
                                             if col in NON_NUMERIC_COLUMNS]]
            end_point = data_table.slice(
                start_index + (1 if sample_interval_micros > 0 else -1), 1).to_pandas().iloc[0]
            numeric_end = end_point[[col for col in data_table.schema.names
                                     if col not in NON_INTERPOLATED_COLUMNS + NON_NUMERIC_COLUMNS]]
            non_numeric_end = end_point[[col for col in data_table.schema.names
                                         if col in NON_NUMERIC_COLUMNS]]
            # non-numeric values come from whichever edge point is closer in time
            if np.abs(start_point["timestamps"] - new_timestamps[0]) \
                    <= np.abs(end_point["timestamps"] - new_timestamps[0]):
                non_numeric_diff = non_numeric_start
            else:
                non_numeric_diff = non_numeric_end
            # linear interpolation; note this arithmetic assumes num_samples_to_add == 1
            numeric_diff = numeric_end - numeric_start
            numeric_diff = \
                (numeric_diff / numeric_diff["timestamps"]) * \
                (new_timestamps - numeric_start) + numeric_start
            # merge the two row dicts, keeping the original column order so the
            # schema lines up with data_table for pa.concat_tables below
            merged = {**{k: [v] for k, v in numeric_diff.items()},
                      **{k: [v] for k, v in non_numeric_diff.items()}}
            empty_df = pa.Table.from_pydict(
                {k: merged[k] for k in data_table.schema.names if k in merged})
        else:
            # add nans and defaults
            empty_dict: Dict[str, List] = {}
            for column_index in data_table.schema.names:
                if column_index == "timestamps":
                    empty_dict[column_index] = new_timestamps
                elif column_index == "location_provider":
                    empty_dict[column_index] = [LocationProvider["UNKNOWN"].value
                                                for _ in range(num_samples_to_add)]
                elif column_index == "image_codec":
                    empty_dict[column_index] = [ImageCodec["UNKNOWN"].value
                                                for _ in range(num_samples_to_add)]
                elif column_index == "audio_codec":
                    empty_dict[column_index] = [AudioCodec["UNKNOWN"].value
                                                for _ in range(num_samples_to_add)]
                elif column_index == "network_type":
                    empty_dict[column_index] = [NetworkType["UNKNOWN_NETWORK"].value
                                                for _ in range(num_samples_to_add)]
                elif column_index == "power_state":
                    empty_dict[column_index] = [PowerState["UNKNOWN_POWER_STATE"].value
                                                for _ in range(num_samples_to_add)]
                elif column_index == "cell_service":
                    empty_dict[column_index] = [CellServiceState["UNKNOWN"].value
                                                for _ in range(num_samples_to_add)]
                elif column_index == "wifi_wake_lock":
                    empty_dict[column_index] = [WifiWakeLock["NONE"].value
                                                for _ in range(num_samples_to_add)]
                elif column_index == "screen_state":
                    empty_dict[column_index] = [ScreenState["UNKNOWN_SCREEN_STATE"].value
                                                for _ in range(num_samples_to_add)]
                else:
                    empty_dict[column_index] = np.full(num_samples_to_add, np.nan).tolist()
            empty_df = pa.Table.from_pydict(empty_dict)
        data_table = pa.concat_tables([data_table, empty_df])
    return data_table
def extract_row(self, pa_table: pa.Table) -> pd.DataFrame:
    return pa_table.slice(length=1).to_pandas(types_mapper=pandas_types_mapper)
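# Standalone note on the slice(length=1) idiom above: offset defaults to 0,
# so this takes just the first row.
import pyarrow as pa

tbl = pa.table({"a": [10, 20, 30]})
print(tbl.slice(length=1).to_pydict())  # {'a': [10]}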
def make_sorted_groups(sorting_table: pa.Table, input_table: pa.Table) -> SortedGroups:
    if not sorting_table.num_columns:
        # Exactly one output group, even for empty-table input
        return SortedGroups(
            sorted_groups=pa.table({"A": [None]}).select([]),  # 1-row, 0-col table
            sorted_input_table=input_table,  # everything is one group (maybe 0-row)
            group_splits=np.array([], np.int64),
        )

    # pyarrow 3.0.0 can't sort dictionary columns.
    # TODO make sort-dictionary work; nix this conversion
    sorting_table_without_dictionary = pa.table(
        [
            column.cast(pa.utf8()) if pa.types.is_dictionary(column.type) else column
            for column in sorting_table.columns
        ],
        schema=pa.schema([
            pa.field(field.name, pa.utf8())
            if pa.types.is_dictionary(field.type)
            else field
            for field in [
                sorting_table.schema.field(i)
                for i in range(len(sorting_table.schema.names))
            ]
        ]),
    )
    indices = pa.compute.sort_indices(
        sorting_table_without_dictionary,
        sort_keys=[(c, "ascending") for c in sorting_table_without_dictionary.column_names],
    )

    sorted_groups_with_dups_and_nulls = sorting_table.take(indices)

    # Behavior we ought to DEPRECATE: to mimic Pandas, we drop all groups that
    # contain NULL. This is mathematically sound for Pandas' "NA" (because if
    # all these unknown things are the same thing, doesn't that mean we know
    # something about them? -- reductio ad absurdum, QED). But Workbench's NULL
    # is a bit closer to SQL NULL, which means "whatever you say, pal".
    #
    # This null-dropping is for backwards compat. TODO make it optional ... and
    # eventually nix the option and always output NULL groups.
    nonnull_indices = indices.filter(
        find_nonnull_table_mask(sorted_groups_with_dups_and_nulls))

    if input_table.num_columns:
        sorted_input_table = input_table.take(nonnull_indices)
    else:
        # Don't .take() on a zero-column Arrow table: its .num_rows would change
        #
        # All rows are identical, so .slice() gives the table we want
        sorted_input_table = input_table.slice(0, len(nonnull_indices))

    sorted_groups_with_dups = sorting_table.take(nonnull_indices)

    # "is_dup": find each row in sorted_groups_with_dups that is _equal_ to
    # the row before it. (The first value compares the first and second row.)
    #
    # We start assuming all are equal; then we search for inequality
    if len(sorted_groups_with_dups):
        is_dup = pa.array(np.ones(len(sorted_groups_with_dups) - 1, dtype=np.bool_))
        for column in sorted_groups_with_dups.itercolumns():
            chunk = column.chunks[0]
            if pa.types.is_dictionary(chunk.type):
                chunk = chunk.indices
            first = chunk.slice(0, len(column) - 1)
            second = chunk.slice(1)
            # TODO when we support NULL groups:
            # both_null = pa.compute.and_(first.is_null(), second.is_null())
            # both_equal_if_not_null = pa.compute.equal(first, second)
            # both_equal = pa.compute.fill_null(both_equal_if_not_null, False)
            # value_is_dup = pa.compute.or_(both_null, both_equal)
            # ... and for now, it's simply:
            value_is_dup = pa.compute.equal(first, second)
            is_dup = pa.compute.and_(is_dup, value_is_dup)
        group_splits = np.where(~(is_dup.to_numpy(zero_copy_only=False)))[0] + 1
        sorted_groups = reencode_dictionaries(
            sorted_groups_with_dups.take(np.insert(group_splits, 0, 0)))
    else:
        sorted_groups = sorted_groups_with_dups
        group_splits = np.array([], np.int64)

    return SortedGroups(
        sorted_groups=sorted_groups,
        sorted_input_table=sorted_input_table,
        group_splits=group_splits,
    )
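# Standalone illustration of the is_dup/group_splits trick used above: compare
# each row of a sorted key column with the next one, then split where they differ.
# (All names here are local to the sketch.)
import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

keys = pa.array(["a", "a", "b", "b", "b", "c"])
first = keys.slice(0, len(keys) - 1)
second = keys.slice(1)
is_dup = pc.equal(first, second)  # does row i equal row i+1?
group_splits = np.where(~is_dup.to_numpy(zero_copy_only=False))[0] + 1
print(group_splits)  # [2 5]: groups are rows [0:2], [2:5], [5:6]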
def _postprocess_name_columns(
    table: pyarrow.Table, has_header: bool
) -> Tuple[pyarrow.Table, List[ParseCsvWarning]]:
    """
    Return `table`, with final column names but still String values.
    """
    warnings = []
    if has_header and table.num_rows > 0:
        n_ascii_cleaned = 0
        first_ascii_cleaned = None
        n_truncated = 0
        first_truncated = None
        n_numbered = 0
        first_numbered = None

        names = []
        for colname in gen_unique_clean_colnames(
            list(("" if c[0].as_py() is None else c[0].as_py()) for c in table.columns),
            settings=settings,
        ):
            names.append(colname.name)
            if colname.is_ascii_cleaned:
                if n_ascii_cleaned == 0:
                    first_ascii_cleaned = colname.name
                n_ascii_cleaned += 1
            if colname.is_truncated:
                if n_truncated == 0:
                    first_truncated = colname.name
                n_truncated += 1
            if colname.is_numbered:
                if n_numbered == 0:
                    first_numbered = colname.name
                n_numbered += 1
            # Unicode can't be fixed, because we assume valid UTF-8 input
            assert not colname.is_unicode_fixed
            # Stay silent if colname.is_default. Users expect us to
            # auto-generate default column names.

        if n_ascii_cleaned:
            warnings.append(
                ParseCsvWarning.CleanedAsciiColumnNames(n_ascii_cleaned, first_ascii_cleaned))
        if n_truncated:
            warnings.append(
                ParseCsvWarning.TruncatedColumnNames(n_truncated, first_truncated))
        if n_numbered:
            warnings.append(
                ParseCsvWarning.NumberedColumnNames(n_numbered, first_numbered))

        # Remove header (zero-copy: builds new pa.Table with same backing data)
        table = table.slice(1)
    else:
        names = [f"Column {i + 1}" for i in range(len(table.columns))]

    return (
        pyarrow.table({name: table.column(i) for i, name in enumerate(names)}),
        warnings,
    )
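# Standalone illustration of the header-strip step above: slice(1) drops row 0
# without copying the column buffers, and the remaining columns are re-keyed by
# the parsed header names.
import pyarrow

raw = pyarrow.table({"0": ["name", "ada", "grace"], "1": ["age", "36", "45"]})
body = raw.slice(1)  # zero-copy: same backing data, first row skipped
named = pyarrow.table({"name": body.column(0), "age": body.column(1)})
print(named.to_pydict())  # {'name': ['ada', 'grace'], 'age': ['36', '45']}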