def test_get_partition_difference_datetime():
    """Check ``get_diff_partition_values`` against a datetime-typed partition.

    Five datetimes are generated. The first four are exposed as partition
    values through the mocked file listing, and the difference is then
    checked in both directions (default and ``reverse=True``).
    """
    bucket = 'safebucket'
    key = 'dataset'
    partition = 'burgertime'

    # Use one reference time and distinct second offsets so that no two
    # generated datetimes can coincide. Independent random draws could
    # occasionally collide, which changes the expected differences and makes
    # the test fail intermittently.
    reference = datetime.datetime.now().replace(microsecond=0)
    offsets = random.sample(range(100 * 24 * 60 * 60), 5)
    rando_values = [
        reference - datetime.timedelta(seconds=offset) for offset in offsets
    ]

    # Every value except the last one is present as a partition in "S3".
    s3_paths = [
        f"{key}/{partition}={x.strftime('%Y-%m-%d %H:%M:%S')}/12345.parquet"
        for x in rando_values[:-1]
    ]

    with patch("s3parq.fetch_parq.get_all_files_list") as get_all_files_list:
        with patch("s3parq.fetch_parq._get_partitions_and_types"
                   ) as _get_partitions_and_types:
            get_all_files_list.return_value = s3_paths
            _get_partitions_and_types.return_value = {"burgertime": "datetime"}

            # partition values not in list values
            deltas = fetch_parq.get_diff_partition_values(
                bucket, key, partition, rando_values[:-2])
            assert deltas == [rando_values[-2]]

            # list values not in partition values
            deltas = fetch_parq.get_diff_partition_values(
                bucket, key, partition, rando_values, reverse=True)
            assert deltas == [rando_values[-1]]
def test_get_partition_difference_int_comparison_none():
    """Check the partition diff when the caller supplies no comparison values.

    The mocked listing exposes the integer partitions 1 through 4. With an
    empty comparison list, the forward diff is expected to be all of those
    partition values and the reversed diff is expected to be empty.
    """
    bucket, key, partition = 'safebucket', 'dataset', 'hamburger'
    rando_values = sorted([1, 2, 3, 4, 5])
    published = rando_values[:-1]
    s3_paths = [f"{key}/{partition}={value}/12345.parquet" for value in published]

    with patch("s3parq.fetch_parq.get_all_files_list") as files_mock, \
            patch("s3parq.fetch_parq._get_partitions_and_types") as types_mock:
        files_mock.return_value = s3_paths
        types_mock.return_value = {"hamburger": "integer"}

        # values when there is no sent comparisons
        forward = fetch_parq.get_diff_partition_values(
            bucket, key, partition, [])
        forward.sort()
        assert forward == published

        # values when there is no sent comparisons and reversed
        backward = fetch_parq.get_diff_partition_values(
            bucket, key, partition, [], True)
        backward.sort()
        assert backward == []
def test_get_partition_difference_int_when_none(self):
    """Check the partition diff when the mocked bucket listing is empty.

    With no files listed, the forward diff is expected to be empty and the
    reversed diff is expected to equal the comparison values passed in.

    NOTE(review): this function takes ``self``, so it is presumably meant to
    live on a test class - confirm. It also patches ``_get_all_files_list``,
    whereas other tests in this file patch ``get_all_files_list``; confirm
    which name ``s3parq.fetch_parq`` actually exposes.
    """
    bucket = 'safebucket'
    key = 'dataset'
    partition = 'hamburger'
    rando_values = [1, 2, 3, 4, 5]
    rando_values.sort()
    comparison = rando_values[:-2]

    with patch("s3parq.fetch_parq._get_all_files_list"
               ) as _get_all_files_list:
        with patch("s3parq.fetch_parq._get_partitions_and_types"
                   ) as _get_partitions_and_types:
            _get_all_files_list.return_value = []
            # Deliberately not a partition/type mapping. Presumably this value
            # is never consulted when the listing is empty - TODO confirm.
            _get_partitions_and_types.return_value = 777.09

            # values when there is no bucket data
            deltas = fetch_parq.get_diff_partition_values(
                bucket, key, partition, comparison)
            deltas.sort()
            assert deltas == []

            # values when there is no bucket data and reversed
            deltas = fetch_parq.get_diff_partition_values(
                bucket, key, partition, comparison, True)
            deltas.sort()
            assert deltas == comparison
def test_get_partition_difference_string(self):
    """Check ``get_diff_partition_values`` against a string-typed partition.

    Ten strings are produced by ``self.rand_string()``. All but the last are
    exposed as partition values through the mocked file listing, and the
    difference is then checked in both directions.

    NOTE(review): this patches ``_get_all_files_list``, whereas other tests
    in this file patch ``get_all_files_list``; confirm which name
    ``s3parq.fetch_parq`` actually exposes.
    """
    bucket, key, partition = 'safebucket', 'dataset', 'hamburger'
    rando_values = [self.rand_string() for _ in range(10)]
    s3_paths = [
        f"{key}/{partition}={value}/12345.parquet"
        for value in rando_values[:-1]
    ]

    with patch("s3parq.fetch_parq._get_all_files_list") as files_mock, \
            patch("s3parq.fetch_parq._get_partitions_and_types") as types_mock:
        files_mock.return_value = s3_paths
        types_mock.return_value = {"hamburger": "string"}

        # partition values not in list values
        only_in_partitions = fetch_parq.get_diff_partition_values(
            bucket, key, partition, rando_values[:-2])
        assert only_in_partitions == [rando_values[-2]]

        # list values not in partition values
        only_in_list = fetch_parq.get_diff_partition_values(
            bucket, key, partition, rando_values, True)
        assert only_in_list == [rando_values[-1]]
def test_get_partition_difference_string_when_none():
    """Check the string partition diff when the mocked bucket listing is empty.

    With no files listed, the forward diff is expected to be empty and the
    reversed diff is expected to equal the comparison values passed in.
    """
    bucket = 'safebucket'
    key = 'dataset'
    partition = 'hamburger'
    # Sorted up front so the sorted results can be compared directly.
    rando_values = [setup_random_string() for x in range(10)]
    rando_values.sort()
    comparison = rando_values[:-2]

    with patch("s3parq.fetch_parq.get_all_files_list") as get_all_files_list:
        get_all_files_list.return_value = []

        # values when there is no bucket data
        deltas = fetch_parq.get_diff_partition_values(
            bucket, key, partition, comparison)
        deltas.sort()
        assert deltas == []

        # values when there is no bucket data and reversed
        deltas = fetch_parq.get_diff_partition_values(
            bucket, key, partition, comparison, True)
        deltas.sort()
        assert deltas == comparison