def test_by_time_period_from_7670_to_8400():
    """Check ``filter_by_time_period`` on the BOSTED fixture for days 7670-8400.

    The expected table holds nine rows with all four columns. Rows whose
    ``stop_epoch_days`` is ``np.nan`` have no stop value in the expected
    output.

    NOTE(review): the trailing ``None, True`` arguments are passed
    positionally; presumably "no population filter" and "include
    attributes" -- confirm against ``filter_by_time_period``'s signature.
    """
    expected_columns = {
        'unit_id': [
            1000000002, 1000000004, 1000000003,
            1000000001, 1000000001, 1000000003,
            1000000003, 1000000001, 1000000002
        ],
        'value': ["8", "2", "12", "3", "16", "2", "12", "3", "8"],
        'start_epoch_days': [
            1461, 3287, 4018,
            5479, 7851, 7701,
            7957, 8126, 8066
        ],
        'stop_epoch_days': [
            8065, 7710, 7700,
            7850, 8125, 7956,
            np.nan, np.nan, np.nan
        ]
    }
    expected = Table.from_pydict(expected_columns)
    print_expected(expected)

    actual = filter_by_time_period(
        TEST_BOSTED_PARQUET_DIR, 7670, 8400, None, True
    )
    print_actual(actual)

    # Compare as pandas frames; dtype differences are deliberately ignored.
    expected_frame = expected.to_pandas()
    actual_frame = actual.to_pandas()
    assert_frame_equal(expected_frame, actual_frame, check_dtype=False)
def _send_data(self, to: ActorVirtualIdentity, data_payload: DataPayload) -> None:
    """
    Forward a data payload to the target actor through the proxy client.
    Intended for internal use only.

    An OutputDataFrame is converted to an Arrow table and sent with a header
    marked ``is_end=False``; an EndOfUpstream is sent as a header marked
    ``is_end=True`` with no table.

    :param to: The target actor's ActorVirtualIdentity
    :param data_payload: The payload to send, either an OutputDataFrame or
        an EndOfUpstream
    :raises TypeError: if the payload is neither of the supported types
    """
    if isinstance(data_payload, OutputDataFrame):
        # Building the table from a column-oriented dictionary is the fastest
        # known way to assemble it from row-like records, see
        # https://stackoverflow.com/questions/57939092/fastest-way-to-construct-pyarrow-table-row-by-row
        column_names = data_payload.schema.names
        columns = {}
        for column_name in column_names:
            columns[column_name] = [row[column_name] for row in data_payload.frame]
        arrow_table = Table.from_pydict(columns, schema=data_payload.schema)

        header = PythonDataHeader(tag=to, is_end=False)
        self._proxy_client.send_data(bytes(header), arrow_table)
        return

    if isinstance(data_payload, EndOfUpstream):
        # End-of-stream marker: only the header travels, there is no table.
        header = PythonDataHeader(tag=to, is_end=True)
        self._proxy_client.send_data(bytes(header), None)
        return

    raise TypeError(f"Unexpected payload {data_payload}")
def test_by_time_excluding_attributes():
    """Check ``filter_by_time`` on the BOSTED fixture at day 7669.

    Only the parquet directory and the day are passed, and the expected
    table carries just the ``unit_id`` and ``value`` columns.
    """
    expected_columns = {
        'unit_id': [1000000002, 1000000004, 1000000003, 1000000001],
        'value': ["8", "2", "12", "3"]
    }
    expected = Table.from_pydict(expected_columns)
    print_expected(expected)

    actual = filter_by_time(TEST_BOSTED_PARQUET_DIR, 7669)
    print_actual(actual)

    # Compare as pandas frames; dtype differences are deliberately ignored.
    expected_frame = expected.to_pandas()
    actual_frame = actual.to_pandas()
    assert_frame_equal(expected_frame, actual_frame, check_dtype=False)
def test_by_time():
    """Check ``filter_by_time`` on the BOSTED fixture at day 7669.

    The expected table carries all four columns, including the start and
    stop epoch days.

    NOTE(review): the trailing ``None, True`` arguments are passed
    positionally; presumably "no population filter" and "include
    attributes" -- confirm against ``filter_by_time``'s signature.
    """
    expected_columns = {
        'unit_id': [1000000002, 1000000004, 1000000003, 1000000001],
        'value': ["8", "2", "12", "3"],
        'start_epoch_days': [1461, 3287, 4018, 5479],
        'stop_epoch_days': [8065, 7710, 7700, 7850]
    }
    expected = Table.from_pydict(expected_columns)
    print_expected(expected)

    actual = filter_by_time(TEST_BOSTED_PARQUET_DIR, 7669, None, True)
    print_actual(actual)

    # Compare as pandas frames; dtype differences are deliberately ignored.
    expected_frame = expected.to_pandas()
    actual_frame = actual.to_pandas()
    assert_frame_equal(expected_frame, actual_frame, check_dtype=False)
def test_by_fixed_including_population_filter_on_single_parquet():
    """Check ``filter_by_fixed`` on a single parquet file with a population filter.

    The filter lists two unit ids, and the expected table contains exactly
    those two rows with all four columns.

    NOTE(review): the trailing ``True`` argument is passed positionally;
    presumably it requests the attribute columns -- confirm against
    ``filter_by_fixed``'s signature.
    """
    parquet_file = TEST_PERSON_INCOME_PARQUET_FILE
    population_filter = [11111113785911, 11111111190644]
    expected = Table.from_pydict({
        'unit_id': [11111113785911, 11111111190644],
        'value': ["16354872", "11331198"],
        'start_epoch_days': [13879, 6209],
        'stop_epoch_days': [14244, 6573]
    })
    print_expected(expected)

    actual = filter_by_fixed(parquet_file, population_filter, True)
    # Fail with a clear message when no table comes back. A truthiness guard
    # would test the row count rather than presence, and could let the
    # comparison be skipped or crash later on `.to_pandas()`.
    assert actual is not None, "filter_by_fixed returned no table"
    print_actual(actual)

    assert_frame_equal(
        expected.to_pandas(),
        actual.to_pandas(),
        check_dtype=False
    )
def test_by_fixed_including_attributes_and_population_filter():
    """Check ``filter_by_fixed`` on the BOSTED fixture with a population filter.

    The filter lists two unit ids, and the expected table holds five rows
    for those units with all four columns. Rows whose ``stop_epoch_days``
    is ``np.nan`` have no stop value in the expected output.

    NOTE(review): the trailing ``True`` argument is passed positionally;
    presumably it requests the attribute columns -- confirm against
    ``filter_by_fixed``'s signature.
    """
    population_filter = [1000000002, 1000000003]
    expected_columns = {
        'unit_id': [
            1000000002, 1000000003, 1000000003, 1000000003, 1000000002
        ],
        'value': ["8", "12", "2", "12", "8"],
        'start_epoch_days': [1461, 4018, 7701, 7957, 8066],
        'stop_epoch_days': [8065, 7700, 7956, np.nan, np.nan]
    }
    expected = Table.from_pydict(expected_columns)
    print_expected(expected)

    actual = filter_by_fixed(TEST_BOSTED_PARQUET_DIR, population_filter, True)
    print_actual(actual)

    # Compare as pandas frames; dtype differences are deliberately ignored.
    expected_frame = expected.to_pandas()
    actual_frame = actual.to_pandas()
    assert_frame_equal(expected_frame, actual_frame, check_dtype=False)
# Demonstration of several ways to construct a `Table`, printing each result.
# `ctx` is defined outside this excerpt and is passed as the first argument
# to every `Table` constructor used here.

# --- 1. From a plain dictionary mapping column name -> list of values.
print("Initialize From a dictionary")
data_dictionary = {
    'col-1': [1, 2, 3, 4],
    'col-2': [5, 6, 7, 8],
    'col-3': [9, 10, 11, 12]
}
tb: Table = Table.from_pydict(ctx, data_dictionary)
print(tb)

# --- 2. From an `ATable` built from the same dictionary contents.
# NOTE(review): `ATable` is presumably an alias for pyarrow's Table (the
# printed label says "PyArrow Table") -- confirm against the file's imports.
print("Initialize From a PyArrow Table")
data_dictionary = {
    'col-1': [1, 2, 3, 4],
    'col-2': [5, 6, 7, 8],
    'col-3': [9, 10, 11, 12]
}
atb: ATable = ATable.from_pydict(data_dictionary)
tb: Table = Table.from_arrow(ctx, atb)
# Print both the intermediate table and the converted one.
print(atb)
print(tb)

# --- 3. From a list of NumPy arrays plus a matching list of column names.
print("Initialize From Numpy")
nd_array_list = [
    np.array([1, 2, 3, 4]),
    np.array([5, 6, 7, 8]),
    np.array([9, 10, 11, 12])
]
columns = ['c1', 'c2', 'c3']
tb: Table = Table.from_numpy(ctx, columns, nd_array_list)
print(tb)

# --- 4. Label for the pandas-based construction; the code for it lies
# beyond this excerpt.
print("Initialize From Pandas")