Beispiel #1
0
def pyarrow_transform(batch: pa.Table) -> pa.Table:
    """Keep the "Versicolor" rows and replace sepal length by a normalized copy.

    Rows whose ``variety`` equals ``"Versicolor"`` are retained. A new
    ``normalized.sepal.length`` column is appended, holding each retained
    row's ``sepal.length`` divided by the maximum ``sepal.length`` among the
    retained rows (the maximum is taken after filtering, not over the whole
    input). The original ``sepal.length`` column is then dropped.

    Args:
        batch: Table with at least ``variety`` and ``sepal.length`` columns.

    Returns:
        The filtered table with ``normalized.sepal.length`` added and
        ``sepal.length`` removed.
    """
    is_versicolor = pac.equal(batch["variety"], "Versicolor")
    versicolor_rows = batch.filter(is_versicolor)

    sepal_length = versicolor_rows["sepal.length"]
    normalized = pac.divide(sepal_length, pac.max(sepal_length))

    with_normalized = versicolor_rows.append_column(
        "normalized.sepal.length", normalized
    )
    return with_normalized.drop(["sepal.length"])
def add_page_pings_enabled_col(table: pa.Table) -> pa.Table:
    """Append a ``page_pings_enabled`` column to ``table``.

    Page views with page pings enabled have the 'heartbeat' context added, so
    a row is flagged when its ``contexts`` value contains the heartbeat schema
    prefix. The context also tells us how many seconds there are between each
    page ping; for now that value is simply hard coded to 30s, but it can be
    extracted from the heartbeat context if needed.

    Args:
        table: Table with a ``contexts`` column (assumed to hold string
            values -- TODO confirm against the producer of this table).

    Returns:
        The table returned by ``append_column``, with the
        ``page_pings_enabled`` column added.
    """
    # Match the schema prefix literally: with the pandas default regex=True,
    # the '.' in 'dk.jyllands-posten' would match any character.
    page_pings_enabled = table.column('contexts').to_pandas().str.contains(
        'iglu:dk.jyllands-posten/heartbeat/jsonschema/', regex=False)

    if hasattr(pa, 'Column'):
        # Older pyarrow (< 0.15): append_column takes a single Column object.
        # noinspection PyCallByClass,PyTypeChecker
        return table.append_column(
            pa.Column.from_array('page_pings_enabled', page_pings_enabled))

    # pyarrow >= 0.15 removed pa.Column; append_column takes (name, values).
    return table.append_column(
        'page_pings_enabled', pa.array(page_pings_enabled))
Beispiel #3
0
def append_is_source_col(table: pa.Table, booleans) -> pa.Table:
    """Append the "is source" column to ``table``.

    The column is described by ``_IS_SOURCE_COLUMN_FIELD`` and its values are
    built from ``booleans`` by ``get_is_source_column_array``.

    Args:
        table: Table to extend.
        booleans: Input handed to ``get_is_source_column_array``
            (presumably one flag per row -- verify against callers).

    Returns:
        The table returned by ``append_column``.
    """
    return table.append_column(
        _IS_SOURCE_COLUMN_FIELD,
        get_is_source_column_array(booleans),
    )
Beispiel #4
0
def append_delta_type_col(table: pa.Table, delta_types) -> pa.Table:
    """Append the delta-type column to ``table``.

    The column is described by ``_DELTA_TYPE_COLUMN_FIELD`` and its values are
    built from ``delta_types`` by ``get_delta_type_column_array``.

    Args:
        table: Table to extend.
        delta_types: Input handed to ``get_delta_type_column_array``
            (presumably one entry per row -- verify against callers).

    Returns:
        The table returned by ``append_column``.
    """
    return table.append_column(
        _DELTA_TYPE_COLUMN_FIELD,
        get_delta_type_column_array(delta_types),
    )
Beispiel #5
0
def append_dedupe_task_idx_col(table: pa.Table, dedupe_task_indices) -> pa.Table:
    """Append the dedupe-task-index column to ``table``.

    The column is described by ``_DEDUPE_TASK_IDX_COLUMN_FIELD`` and its
    values are built from ``dedupe_task_indices`` by
    ``get_dedupe_task_idx_column_array``.

    Args:
        table: Table to extend.
        dedupe_task_indices: Input handed to
            ``get_dedupe_task_idx_column_array`` (presumably one index per
            row -- verify against callers).

    Returns:
        The table returned by ``append_column``.
    """
    return table.append_column(
        _DEDUPE_TASK_IDX_COLUMN_FIELD,
        get_dedupe_task_idx_column_array(dedupe_task_indices),
    )
Beispiel #6
0
def append_record_idx_col(table: pa.Table, ordered_record_indices) -> pa.Table:
    """Append the ordered-record-index column to ``table``.

    The column is described by ``_ORDERED_RECORD_IDX_COLUMN_FIELD`` and its
    values are built from ``ordered_record_indices`` by
    ``get_record_index_column_array``.

    Args:
        table: Table to extend.
        ordered_record_indices: Input handed to
            ``get_record_index_column_array`` (presumably one index per
            row -- verify against callers).

    Returns:
        The table returned by ``append_column``.
    """
    return table.append_column(
        _ORDERED_RECORD_IDX_COLUMN_FIELD,
        get_record_index_column_array(ordered_record_indices),
    )
Beispiel #7
0
def append_pk_hash_column(table: pa.Table, pk_hashes) -> pa.Table:
    """Append the primary-key-hash column to ``table``.

    The column is described by ``_PK_HASH_COLUMN_FIELD`` and its values are
    built from ``pk_hashes`` by ``get_pk_hash_column_array``.

    Args:
        table: Table to extend.
        pk_hashes: Input handed to ``get_pk_hash_column_array``
            (presumably one hash per row -- verify against callers).

    Returns:
        The table returned by ``append_column``.
    """
    return table.append_column(
        _PK_HASH_COLUMN_FIELD,
        get_pk_hash_column_array(pk_hashes),
    )
Beispiel #8
0
def append_file_idx_column(
        table: pa.Table,
        ordered_file_indices) -> pa.Table:
    """Append the ordered-file-index column to ``table``.

    The column is described by ``_ORDERED_FILE_IDX_COLUMN_FIELD`` and its
    values are built from ``ordered_file_indices`` by
    ``get_file_index_column_array``.

    Args:
        table: Table to extend.
        ordered_file_indices: Input handed to
            ``get_file_index_column_array`` (presumably one index per
            row -- verify against callers).

    Returns:
        The table returned by ``append_column``.
    """
    return table.append_column(
        _ORDERED_FILE_IDX_COLUMN_FIELD,
        get_file_index_column_array(ordered_file_indices),
    )
Beispiel #9
0
def append_stream_position_column(
        table: pa.Table,
        stream_positions) -> pa.Table:
    """Append the partition-stream-position column to ``table``.

    The column is described by ``_PARTITION_STREAM_POSITION_COLUMN_FIELD`` and
    its values are built from ``stream_positions`` by
    ``get_stream_position_column_array``.

    Args:
        table: Table to extend.
        stream_positions: Input handed to
            ``get_stream_position_column_array`` (presumably one position
            per row -- verify against callers).

    Returns:
        The table returned by ``append_column``.
    """
    return table.append_column(
        _PARTITION_STREAM_POSITION_COLUMN_FIELD,
        get_stream_position_column_array(stream_positions),
    )
def add_brand(table: pa.Table) -> pa.Table:
    """Append a categorical ``brand`` column derived from ``app_id``.

    Rows whose ``app_id`` ends with ``'jyllands-posten.dk'`` are labelled
    ``'jp'``; all other rows are labelled ``'erhvervsmedier'``.

    Args:
        table: Table with an ``app_id`` column (assumed to hold string
            values -- TODO confirm).

    Returns:
        The table returned by ``append_column``, with the ``brand`` column
        added.
    """
    is_jp = table.column('app_id').to_pandas().str.endswith(
        'jyllands-posten.dk')
    # NOTE(review): a missing app_id presumably yields NaN in ``is_jp``, which
    # np.where would treat as truthy (i.e. 'jp') -- confirm this is intended.
    brand = pd.Categorical(np.where(is_jp, 'jp', 'erhvervsmedier'))

    if hasattr(pa, 'Column'):
        # Older pyarrow (< 0.15): append_column takes a single Column object.
        # noinspection PyCallByClass,PyTypeChecker
        return table.append_column(pa.Column.from_array('brand', brand))

    # pyarrow >= 0.15 removed pa.Column; append_column takes (name, values).
    return table.append_column('brand', pa.array(brand))