def pyarrow_transform(batch: pa.Table) -> pa.Table:
    """Keep the "Versicolor" rows and swap sepal length for a normalized copy.

    Rows whose ``variety`` equals ``"Versicolor"`` are retained. A
    ``normalized.sepal.length`` column is appended, holding each retained
    row's ``sepal.length`` divided by the maximum ``sepal.length`` among the
    retained rows, and the original ``sepal.length`` column is then dropped.

    Args:
        batch: Table providing at least ``variety`` and ``sepal.length``.

    Returns:
        The filtered table with ``normalized.sepal.length`` added and
        ``sepal.length`` removed.
    """
    is_versicolor = pac.equal(batch["variety"], "Versicolor")
    versicolor_rows = batch.filter(is_versicolor)

    # The maximum is taken after filtering, so it covers only the kept rows.
    sepal_length = versicolor_rows["sepal.length"]
    normalized = pac.divide(sepal_length, pac.max(sepal_length))

    with_normalized = versicolor_rows.append_column(
        "normalized.sepal.length",
        normalized,
    )
    return with_normalized.drop(["sepal.length"])
def add_page_pings_enabled_col(table: pa.Table) -> pa.Table:
    """Append a ``page_pings_enabled`` column computed from ``contexts``.

    A row is flagged when its ``contexts`` value contains the heartbeat
    schema URI prefix ``iglu:dk.jyllands-posten/heartbeat/jsonschema/``.

    Args:
        table: Table with a string-like ``contexts`` column.

    Returns:
        The result of appending the ``page_pings_enabled`` column to
        ``table``.
    """
    # Page views with page pings enabled have the 'heartbeat' context added.
    # The context also tells us how many seconds there are between each page
    # ping. For now, we just hard code that value to 30s but it can be
    # extracted from the heartbeat context if needed.
    contexts = table.column('contexts').to_pandas()

    # The schema URI is a literal, not a pattern: without regex=False the '.'
    # in 'dk.jyllands-posten' would match any character.
    page_pings_enabled = contexts.str.contains(
        'iglu:dk.jyllands-posten/heartbeat/jsonschema/', regex=False)

    # NOTE(review): pa.Column appears to have been removed in newer pyarrow
    # releases - confirm the pinned pyarrow version before upgrading.
    # noinspection PyCallByClass,PyTypeChecker
    return table.append_column(
        pa.Column.from_array('page_pings_enabled', page_pings_enabled))
def append_is_source_col(table: pa.Table, booleans) -> pa.Table:
    """Append the is-source column to ``table``.

    Args:
        table: Table to extend.
        booleans: Values handed to ``get_is_source_column_array`` to build
            the column data.

    Returns:
        The result of ``table.append_column`` for the field
        ``_IS_SOURCE_COLUMN_FIELD``.
    """
    is_source_values = get_is_source_column_array(booleans)
    return table.append_column(_IS_SOURCE_COLUMN_FIELD, is_source_values)
def append_delta_type_col(table: pa.Table, delta_types) -> pa.Table:
    """Append the delta-type column to ``table``.

    Args:
        table: Table to extend.
        delta_types: Values handed to ``get_delta_type_column_array`` to
            build the column data.

    Returns:
        The result of ``table.append_column`` for the field
        ``_DELTA_TYPE_COLUMN_FIELD``.
    """
    delta_type_values = get_delta_type_column_array(delta_types)
    return table.append_column(_DELTA_TYPE_COLUMN_FIELD, delta_type_values)
def append_dedupe_task_idx_col(table: pa.Table, dedupe_task_indices) -> pa.Table:
    """Append the dedupe-task-index column to ``table``.

    Args:
        table: Table to extend.
        dedupe_task_indices: Values handed to
            ``get_dedupe_task_idx_column_array`` to build the column data.

    Returns:
        The result of ``table.append_column`` for the field
        ``_DEDUPE_TASK_IDX_COLUMN_FIELD``.
    """
    task_idx_values = get_dedupe_task_idx_column_array(dedupe_task_indices)
    return table.append_column(_DEDUPE_TASK_IDX_COLUMN_FIELD, task_idx_values)
def append_record_idx_col(table: pa.Table, ordered_record_indices) -> pa.Table:
    """Append the ordered-record-index column to ``table``.

    Args:
        table: Table to extend.
        ordered_record_indices: Values handed to
            ``get_record_index_column_array`` to build the column data.

    Returns:
        The result of ``table.append_column`` for the field
        ``_ORDERED_RECORD_IDX_COLUMN_FIELD``.
    """
    record_idx_values = get_record_index_column_array(ordered_record_indices)
    return table.append_column(
        _ORDERED_RECORD_IDX_COLUMN_FIELD,
        record_idx_values,
    )
def append_pk_hash_column(table: pa.Table, pk_hashes) -> pa.Table:
    """Append the primary-key-hash column to ``table``.

    Args:
        table: Table to extend.
        pk_hashes: Values handed to ``get_pk_hash_column_array`` to build
            the column data.

    Returns:
        The result of ``table.append_column`` for the field
        ``_PK_HASH_COLUMN_FIELD``.
    """
    pk_hash_values = get_pk_hash_column_array(pk_hashes)
    return table.append_column(_PK_HASH_COLUMN_FIELD, pk_hash_values)
def append_file_idx_column(table: pa.Table, ordered_file_indices) -> pa.Table:
    """Append the ordered-file-index column to ``table``.

    Args:
        table: Table to extend.
        ordered_file_indices: Values handed to
            ``get_file_index_column_array`` to build the column data.

    Returns:
        The result of ``table.append_column`` for the field
        ``_ORDERED_FILE_IDX_COLUMN_FIELD``.
    """
    file_idx_values = get_file_index_column_array(ordered_file_indices)
    return table.append_column(_ORDERED_FILE_IDX_COLUMN_FIELD, file_idx_values)
def append_stream_position_column(table: pa.Table, stream_positions) -> pa.Table:
    """Append the partition-stream-position column to ``table``.

    Args:
        table: Table to extend.
        stream_positions: Values handed to
            ``get_stream_position_column_array`` to build the column data.

    Returns:
        The result of ``table.append_column`` for the field
        ``_PARTITION_STREAM_POSITION_COLUMN_FIELD``.
    """
    stream_position_values = get_stream_position_column_array(stream_positions)
    return table.append_column(
        _PARTITION_STREAM_POSITION_COLUMN_FIELD,
        stream_position_values,
    )
def add_brand(table: pa.Table) -> pa.Table:
    """Append a categorical ``brand`` column derived from ``app_id``.

    Rows whose ``app_id`` ends with ``jyllands-posten.dk`` are labelled
    ``'jp'``; the remaining rows are labelled ``'erhvervsmedier'``.

    Args:
        table: Table with a string-like ``app_id`` column.

    Returns:
        The result of appending the ``brand`` column to ``table``.
    """
    app_ids = table.column('app_id').to_pandas()
    is_jp = app_ids.str.endswith('jyllands-posten.dk')

    # NOTE(review): if app_id can be null, the suffix check presumably yields
    # a missing value for that row - confirm which brand it should receive.
    brand_labels = np.where(is_jp, 'jp', 'erhvervsmedier')
    brand = pd.Categorical(brand_labels)

    # noinspection PyCallByClass,PyTypeChecker
    return table.append_column(pa.Column.from_array('brand', brand))