def group_by_pk_hash_bucket(
    table: pa.Table, num_buckets: int, primary_keys: List[str]
) -> np.ndarray:
    # generate the primary key digest column
    all_pk_column_fields = []
    for pk_name in primary_keys:
        # casting a primary key column to numpy also ensures no nulls exist
        column_fields = table[pk_name].to_numpy()
        all_pk_column_fields.append(column_fields)
    hash_column_generator = hash_pk_bytes_generator(all_pk_column_fields)
    table = sc.append_pk_hash_column(table, hash_column_generator)

    # drop primary key columns to free up memory
    table = table.drop(primary_keys)

    # group hash bucket record indices
    hash_bucket_to_indices = np.empty([num_buckets], dtype="object")
    for record_index, digest in enumerate(sc.pk_hash_column_np(table)):
        hash_bucket = pk_digest_to_hash_bucket_index(digest, num_buckets)
        if hash_bucket_to_indices[hash_bucket] is None:
            hash_bucket_to_indices[hash_bucket] = []
        hash_bucket_to_indices[hash_bucket].append(record_index)

    # generate the ordered record number column
    hash_bucket_to_table = np.empty([num_buckets], dtype="object")
    for hash_bucket, indices in enumerate(hash_bucket_to_indices):
        if indices:
            hash_bucket_to_table[hash_bucket] = sc.append_record_idx_col(
                table.take(indices),
                indices,
            )
    return hash_bucket_to_table
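
# The helpers above (sc.*, hash_pk_bytes_generator, pk_digest_to_hash_bucket_index) are
# module-internal. As a minimal self-contained sketch of the same pattern -- bucket rows by a
# digest of their primary key, then materialize each bucket with table.take() -- the following
# uses only hashlib, numpy, and pyarrow. toy_group_by_pk_hash_bucket and the sha1-over-repr
# digest are illustrative assumptions, not the library's implementation.
import hashlib
from typing import List

import numpy as np
import pyarrow as pa


def toy_group_by_pk_hash_bucket(
    table: pa.Table, num_buckets: int, primary_keys: List[str]
) -> np.ndarray:
    # derive a bucket index for each row from a digest of its primary key values
    pk_columns = [table[pk].to_numpy() for pk in primary_keys]
    bucket_to_indices = np.empty([num_buckets], dtype="object")
    for row_idx, pk in enumerate(zip(*pk_columns)):
        digest = hashlib.sha1(repr(pk).encode()).digest()
        bucket = int.from_bytes(digest, "big") % num_buckets
        if bucket_to_indices[bucket] is None:
            bucket_to_indices[bucket] = []
        bucket_to_indices[bucket].append(row_idx)
    # materialize one sub-table per non-empty bucket via table.take
    bucket_to_table = np.empty([num_buckets], dtype="object")
    for bucket, indices in enumerate(bucket_to_indices):
        if indices:
            bucket_to_table[bucket] = table.take(indices)
    return bucket_to_table


# usage: rows with the same key hash always land in the same bucket
t = pa.table({"id": [1, 2, 3, 4], "v": ["a", "b", "c", "d"]})
for i, sub in enumerate(toy_group_by_pk_hash_bucket(t, 2, ["id"])):
    if sub is not None:
        print(f"bucket {i}: {sub.num_rows} rows")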
def drop_duplicates_by_primary_key_hash(table: pa.Table) -> pa.Table:
    # TODO: drop all primary key occurrences for DELETE delta types
    value_to_last_row_idx = {}
    row_idx = 0
    for chunk in sc.pk_hash_column(table).iterchunks():
        for val in chunk.to_numpy(zero_copy_only=False):
            value_to_last_row_idx[val] = row_idx
            row_idx += 1
    return table.take(list(value_to_last_row_idx.values()))
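
# The dedupe above keeps only the last row seen per primary-key hash (last-writer-wins):
# a dict maps each value to the index of its latest occurrence, and table.take() pulls
# exactly those rows. A minimal self-contained sketch of that pattern on a plain column;
# the column name "pk_hash" is an assumption for illustration.
import pyarrow as pa


def toy_drop_duplicates_last_wins(table: pa.Table, key: str = "pk_hash") -> pa.Table:
    # map each key value to the row index of its last occurrence, then take those rows
    value_to_last_row_idx = {}
    for row_idx, val in enumerate(table[key].to_pylist()):
        value_to_last_row_idx[val] = row_idx
    return table.take(list(value_to_last_row_idx.values()))


# usage: row 2 ("a", 3) wins over row 0 ("a", 1)
t = pa.table({"pk_hash": ["a", "b", "a"], "v": [1, 2, 3]})
print(toy_drop_duplicates_last_wins(t).to_pydict())  # {'pk_hash': ['a', 'b'], 'v': [3, 2]}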
def fill_gaps(
    arrow_df: pa.Table,
    gaps: List[Tuple[float, float]],
    sample_interval_micros: float,
    copy: bool = False,
) -> Tuple[pa.Table, List[Tuple[float, float]]]:
    """
    fills gaps in the table by synthesizing timestamps at the known sample interval;
    new data points are either np.nan or copies of the gap's edge values

    :param arrow_df: pyarrow table with data; first column is "timestamps"
    :param gaps: list of tuples of known non-inclusive start and end timestamps of the gaps
    :param sample_interval_micros: known sample interval of the data points
    :param copy: if True, copy the edge data points into the gap, otherwise fill with np.nan,
                 default False
    :return: table without gaps and the list of gaps
    """
    # extract the necessary information to compute gap size and gap timestamps
    data_time_stamps = arrow_df["timestamps"].to_numpy()
    if len(data_time_stamps) > 1:
        data_duration = data_time_stamps[-1] - data_time_stamps[0]
        expected_samples = (
            np.floor(data_duration / sample_interval_micros)
            + (
                1
                if data_duration % sample_interval_micros
                >= sample_interval_micros * DEFAULT_GAP_UPPER_LIMIT
                else 0
            )
        ) + 1
        if expected_samples > len(data_time_stamps):
            pcm = DataPointCreationMode["COPY"] if copy else DataPointCreationMode["NAN"]
            # make it safe to alter the gap values
            my_gaps = check_gap_list(gaps, data_time_stamps[0], data_time_stamps[-1])
            for gap in my_gaps:
                # if timestamps exist around a gap, snap the gap edges to those timestamps
                before_start = np.argwhere([t <= gap[0] for t in data_time_stamps])
                after_end = np.argwhere([t >= gap[1] for t in data_time_stamps])
                if len(before_start) > 0:
                    before_start = before_start[-1][0]
                    gap = (data_time_stamps[before_start], gap[1])
                else:
                    before_start = None
                if len(after_end) > 0:
                    after_end = after_end[0][0]
                    gap = (gap[0], data_time_stamps[after_end])
                else:
                    after_end = None
                num_new_points = int((gap[1] - gap[0]) / sample_interval_micros) - 1
                if before_start is not None:
                    # fill forward from the last point before the gap
                    arrow_df = add_data_points_to_df(
                        arrow_df, before_start, sample_interval_micros, num_new_points, pcm
                    )
                elif after_end is not None:
                    # fill backward from the first point after the gap
                    arrow_df = add_data_points_to_df(
                        arrow_df, after_end, -sample_interval_micros, num_new_points, pcm
                    )
            indic = pc.sort_indices(arrow_df, sort_keys=[("timestamps", "ascending")])
            return arrow_df.take(indic), gaps
    return arrow_df, gaps
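
# fill_gaps depends on redvox-internal helpers (check_gap_list, add_data_points_to_df,
# DataPointCreationMode, DEFAULT_GAP_UPPER_LIMIT). A minimal self-contained sketch of the
# core idea -- synthesize the missing timestamps strictly inside a (non-inclusive) gap,
# append NaN rows, then restore order with sort_indices + take -- assuming all
# non-timestamp columns are float. toy_fill_gap_with_nan is illustrative, not the library API.
from typing import Tuple

import numpy as np
import pyarrow as pa
import pyarrow.compute as pc


def toy_fill_gap_with_nan(
    table: pa.Table, gap: Tuple[float, float], sample_interval_micros: float
) -> pa.Table:
    # number of sample points missing strictly between the gap's edges
    num_new = int((gap[1] - gap[0]) / sample_interval_micros) - 1
    new_ts = gap[0] + sample_interval_micros * np.arange(1, num_new + 1)
    # NaN rows for every non-timestamp column, keeping the original column order
    filler = pa.table({
        name: new_ts if name == "timestamps" else np.full(num_new, np.nan)
        for name in table.column_names
    })
    combined = pa.concat_tables([table, filler])
    order = pc.sort_indices(combined, sort_keys=[("timestamps", "ascending")])
    return combined.take(order)


# usage: three samples are missing between 1e6 and 5e6 microseconds
t = pa.table({"timestamps": [0.0, 1e6, 5e6], "value": [1.0, 2.0, 3.0]})
print(toy_fill_gap_with_nan(t, (1e6, 5e6), 1e6)["timestamps"].to_pylist())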
def _sort_table_on_real_then_date(table: pa.Table) -> pa.Table:
    indices = pc.sort_indices(
        table, sort_keys=[("REAL", "ascending"), ("DATE", "ascending")]
    )
    sorted_table = table.take(indices)
    return sorted_table
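
# usage sketch with fabricated values; assumes _sort_table_on_real_then_date above is in
# scope. Rows are ordered by REAL first, then DATE within each REAL.
import pyarrow as pa

t = pa.table({
    "REAL": [2, 1, 1],
    "DATE": ["2020-03-01", "2020-02-01", "2020-01-01"],
    "value": [30.0, 20.0, 10.0],
})
print(_sort_table_on_real_then_date(t).to_pydict()["value"])  # [10.0, 20.0, 30.0]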
def make_sorted_groups(sorting_table: pa.Table, input_table: pa.Table) -> SortedGroups:
    if not sorting_table.num_columns:
        # Exactly one output group, even for empty-table input
        return SortedGroups(
            sorted_groups=pa.table({"A": [None]}).select([]),  # 1-row, 0-col table
            sorted_input_table=input_table,  # everything is one group (maybe 0-row)
            group_splits=np.array([], dtype=np.int64),
        )

    # pyarrow 3.0.0 can't sort dictionary columns.
    # TODO make sort-dictionary work; nix this conversion
    sorting_table_without_dictionary = pa.table(
        [
            column.cast(pa.utf8()) if pa.types.is_dictionary(column.type) else column
            for column in sorting_table.columns
        ],
        schema=pa.schema(
            [
                pa.field(field.name, pa.utf8())
                if pa.types.is_dictionary(field.type)
                else field
                for field in [
                    sorting_table.schema.field(i)
                    for i in range(len(sorting_table.schema.names))
                ]
            ]
        ),
    )
    indices = pa.compute.sort_indices(
        sorting_table_without_dictionary,
        sort_keys=[
            (c, "ascending") for c in sorting_table_without_dictionary.column_names
        ],
    )

    sorted_groups_with_dups_and_nulls = sorting_table.take(indices)

    # Behavior we ought to DEPRECATE: to mimic Pandas, we drop all groups that
    # contain NULL. This is mathematically sound for Pandas' "NA" (because if
    # all these unknown things are the same thing, doesn't that mean we know
    # something about them? -- reductio ad absurdum, QED). But Workbench's NULL
    # is a bit closer to SQL NULL, which means "whatever you say, pal".
    #
    # This null-dropping is for backwards compat. TODO make it optional ... and
    # eventually nix the option and always output NULL groups.
    nonnull_indices = indices.filter(
        find_nonnull_table_mask(sorted_groups_with_dups_and_nulls)
    )

    if input_table.num_columns:
        sorted_input_table = input_table.take(nonnull_indices)
    else:
        # Don't .take() on a zero-column Arrow table: its .num_rows would change
        #
        # All rows are identical, so .slice() gives the table we want
        sorted_input_table = input_table.slice(0, len(nonnull_indices))

    sorted_groups_with_dups = sorting_table.take(nonnull_indices)

    # "is_dup": find each row in sorted_groups_with_dups that is _equal_ to
    # the row before it. (The first value compares the first and second row.)
    #
    # We start assuming all are equal; then we search for inequality
    if len(sorted_groups_with_dups):
        is_dup = pa.array(np.ones(len(sorted_groups_with_dups) - 1), pa.bool_())
        for column in sorted_groups_with_dups.itercolumns():
            chunk = column.chunks[0]
            if pa.types.is_dictionary(chunk.type):
                chunk = chunk.indices
            first = chunk.slice(0, len(column) - 1)
            second = chunk.slice(1)
            # TODO when we support NULL groups:
            # both_null = pa.compute.and_(first.is_null(), second.is_null())
            # both_equal_if_not_null = pa.compute.equal(first, second)
            # both_equal = pa.compute.fill_null(both_equal_if_not_null, False)
            # value_is_dup = pa.compute.or_(both_null, both_equal)
            # ... and for now, it's simply:
            value_is_dup = pa.compute.equal(first, second)
            is_dup = pa.compute.and_(is_dup, value_is_dup)

        group_splits = np.where(~(is_dup.to_numpy(zero_copy_only=False)))[0] + 1

        sorted_groups = reencode_dictionaries(
            sorted_groups_with_dups.take(np.insert(group_splits, 0, 0))
        )
    else:
        sorted_groups = sorted_groups_with_dups
        group_splits = np.array([], dtype=np.int64)

    return SortedGroups(
        sorted_groups=sorted_groups,
        sorted_input_table=sorted_input_table,
        group_splits=group_splits,
    )
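
# The heart of make_sorted_groups is the shifted self-comparison that finds group
# boundaries: slice the sorted keys against themselves, mark rows equal to their
# predecessor, and split where equality breaks. A self-contained sketch of just that
# step on a single pre-sorted column:
import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

sorted_keys = pa.array(["a", "a", "b", "b", "b", "c"])
first = sorted_keys.slice(0, len(sorted_keys) - 1)  # rows 0..n-2
second = sorted_keys.slice(1)                       # rows 1..n-1
is_dup = pc.equal(first, second)                    # True where a row equals its predecessor
group_splits = np.where(~is_dup.to_numpy(zero_copy_only=False))[0] + 1
print(group_splits)  # [2 5] -> groups are rows [0:2], [2:5], [5:6]
# first row of each group, via the same take(np.insert(...)) trick as above
print(sorted_keys.take(np.insert(group_splits, 0, 0)))  # ["a", "b", "c"]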