def test_find_partitions_with_limit_hive_format(self):
    """Partitioner.partitions_on_disk() with limit days set should work on
    hive-formatted partitions"""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()

    now = pendulum.now()
    written = []
    # Write ten daily hive-style partitions, newest first (1..10 days ago).
    for days_ago in range(1, 11):
        stamp = now.subtract(days=days_ago)
        year = stamp.strftime("%Y")
        month = stamp.strftime("%m")
        day = stamp.strftime("%d")
        hour = "03"
        part = Partition(
            [year, month, day, hour],
            f"s3://{self.bucket}/{self.table}/year={year}/month={month}/day={day}/hour={hour}/")
        self.helper.write_partition_to_s3(part)
        written.append(part)

    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    found = partitioner.partitions_on_disk(limit_days=7)

    # Only the seven most recent days should come back.
    found.should.have.length_of(7)
    set(found).should.equal(set(written[0:7]))
def test_find_partitions_in_s3(self):
    """Every partition written to S3 should be reported by partitions_on_disk()."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    expected = self.helper.create_many_partitions(count=10)

    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    discovered = partitioner.partitions_on_disk()

    set(discovered).should.equal(set(expected))
def test_find_partitions_with_limit_hive_format_capital_keys(self):
    """Partitioner.partitions_on_disk() with limit days set should work on
    hive-formatted partitions where the keys are not lowercase"""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database()
    # Capitalized key names are the point of this test.
    self.helper.make_table(partition_keys=[
        {"Name": "Year", "Type": "int"},
        {"Name": "Month", "Type": "int"},
        {"Name": "Day", "Type": "int"},
        {"Name": "Hour", "Type": "int"},
    ])

    today = pendulum.now()
    partitions = []
    # Write ten daily partitions with capitalized hive keys, 1..10 days ago.
    for i in range(1, 11):
        partition_date = today.subtract(days=i)
        year = partition_date.strftime("%Y")
        month = partition_date.strftime("%m")
        day = partition_date.strftime("%d")
        hour = "03"
        partition = Partition(
            [year, month, day, hour],
            f"s3://{self.bucket}/{self.table}/Year={year}/Month={month}/Day={day}/Hour={hour}/")
        self.helper.write_partition_to_s3(partition)
        partitions.append(partition)

    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    found_partitions = partitioner.partitions_on_disk(limit_days=7)

    # NOTE: leftover debug print() calls were removed here — they only
    # polluted test output and asserted nothing.
    found_partitions.should.have.length_of(7)
    set(found_partitions).should.equal(set(partitions[0:7]))
def test_find_partitions_with_limit_no_hour_partition(self):
    """Partitioner.partitions_on_disk, limit_days set, on a table partitioned
    by day, should work"""
    self.s3.create_bucket(Bucket=self.bucket)
    db_input = self.helper.create_database_input()
    self.glue.create_database(**db_input)

    # Build a table partitioned only down to the day (no hour key).
    table_input = self.helper.create_table_input(
        location=f"s3://{self.bucket}/{self.table}/")
    table_input["TableInput"]["PartitionKeys"] = [
        {"Name": "year", "Type": "string"},
        {"Name": "month", "Type": "string"},
        {"Name": "day", "Type": "string"},
    ]
    self.glue.create_table(**table_input)

    now = pendulum.now()
    created = []
    for days_ago in range(1, 11):
        stamp = now.subtract(days=days_ago)
        year = stamp.strftime("%Y")
        month = stamp.strftime("%m")
        day = stamp.strftime("%d")
        part = Partition(
            [year, month, day],
            f"s3://{self.bucket}/{self.table}/{year}/{month}/{day}/")
        self.helper.write_partition_to_s3(part)
        created.append(part)

    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    found = partitioner.partitions_on_disk(limit_days=4)

    # Only the four most recent days should be returned.
    found.should.have.length_of(4)
    set(found).should.equal(set(created[0:4]))
def test_partitions_to_create(self):
    """partitions_to_create() should return only partitions not yet in the catalog."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()

    existing = self.helper.create_many_partitions(count=10, write=True)
    missing = self.helper.create_many_partitions(count=3, write=True)

    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    # Register only the first batch; the second stays uncatalogued.
    partitioner.create_partitions(existing)

    on_disk = partitioner.partitions_on_disk()
    pending = partitioner.partitions_to_create(on_disk)
    set(pending).should.equal(set(missing))
def test_create_partitions(self):
    """create_found_partitions() should print a report and register every
    new partition it finds on disk."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    new_partitions = self.helper.create_many_partitions(count=10)
    new_partitions.sort()

    expected_output = f"Running Partitioner for {self.database}.{self.table}\n\tLooking for partitions in s3://{self.bucket}/{self.table}/\n\tFound 10 new partitions to create\n\t"
    expected_output += ", ".join(map(str, new_partitions))

    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    with captured_output() as (out, err):
        create_found_partitions(partitioner, dry_run=False)

    out.getvalue().strip().should.equal(expected_output)

    registered = partitioner.partitions_on_disk()
    set(registered).should.equal(set(new_partitions))
def test_find_partitions_in_s3_with_hive_formatted_partitions(self):
    """Partitions laid out with hive-style key=value path segments should be
    discovered by partitions_on_disk()."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()

    # Build the partitions by hand (rather than via the helper) so the
    # S3 prefixes use the hive key=value layout.
    partitions = []
    for i in range(1, 11):
        partition = Partition(
            ["2019", "01", f"{i:02d}", "03"],
            f"s3://{self.bucket}/{self.table}/year=2019/month=01/day={i:02d}/hour=03/")
        self.helper.write_partition_to_s3(partition)
        partitions.append(partition)

    # NOTE: a leftover debug print and a commented-out helper call were
    # removed here — neither contributed to the assertion.
    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    found_partitions = partitioner.partitions_on_disk()
    set(found_partitions).should.equal(set(partitions))
def test_create_partitions(self):
    """The create-partitions CLI command should print a report and register
    every new partition found on disk."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    cli = Cli()

    new_partitions = self.helper.create_many_partitions(count=10)
    new_partitions.sort()

    expected_output = f"Running Partitioner for {self.database}.{self.table}\n\tLooking for partitions in s3://{self.bucket}/{self.table}/\n\tFound 10 new partitions to create\n\t"
    expected_output += ", ".join(map(str, new_partitions))

    out, err = self.get_cmd_output(
        cli, ["create-partitions", self.database, self.table])
    out.should.equal(expected_output)

    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    registered = partitioner.partitions_on_disk()
    set(registered).should.equal(set(new_partitions))
def test_find_partitions_single_key(self):
    """Partitioner.partitions_on_disk should work with single-key tables,
    in hive-format"""
    self.s3.create_bucket(Bucket=self.bucket)
    db_input = self.helper.create_database_input()
    self.glue.create_database(**db_input)

    table_input = self.helper.create_table_input()
    table_input["TableInput"]["PartitionKeys"] = [
        {"Name": "dt", "Type": "string"},
    ]
    self.glue.create_table(**table_input)

    # Seed one hive-style partition containing a single object.
    prefix = table_input["TableInput"]["StorageDescriptor"]["Location"]
    location = f"{prefix}/dt=2019-01-02/"
    s3_key = f"{location}object.json"
    bucket, path = s3_key[len("s3://"):].split("/", 1)
    self.s3.put_object(
        Body='{"foo": "bar"}',
        Bucket=bucket,
        Key=path,
    )

    expected = [Partition(["2019-01-02"], location)]
    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    found_partitions = partitioner.partitions_on_disk()
    set(found_partitions).should.equal(set(expected))
def test_create_partitions_on_disk_with_bad_table_location(self):
    """A table location missing its trailing slash should still produce
    well-formed partition locations."""
    self.s3.create_bucket(Bucket=self.bucket)
    database_input = self.helper.create_database_input()
    self.glue.create_database(**database_input)

    # no trailing slash for location is on purpose and what this
    # test is checking against
    table_input = self.helper.create_table_input(
        location=f"s3://{self.bucket}/{self.table}")
    self.glue.create_table(**table_input)

    partition = self.helper.create_partition_data()
    subpath = "/".join(partition.values)
    full_location = f"s3://{self.bucket}/{self.table}/{subpath}/"

    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    found = partitioner.partitions_on_disk()

    found.should.have.length_of(1)
    found[0].location.should.equal(full_location)
def test_find_partitions_with_limit_days_and_prefix(self):
    """Partitioner.partitions_on_disk() with limit_days and prefix_partitions
    should find preceding partitions with hive-format names"""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database()
    self.helper.make_table(partition_keys=[
        {"Name": "region", "Type": "string"},
        {"Name": "year", "Type": "int"},
        {"Name": "month", "Type": "int"},
        {"Name": "day", "Type": "int"},
    ])

    today = pendulum.now()
    partitions = []
    # Write ten days of partitions for two regions, 1..10 days ago.
    for i in range(1, 11):
        partition_date = today.subtract(days=i)
        year = partition_date.strftime("%Y")
        month = partition_date.strftime("%m")
        day = partition_date.strftime("%d")
        partition_east = Partition(
            ["us-east-1", year, month, day],
            f"s3://{self.bucket}/{self.table}/region=us-east-1/year={year}/month={month}/day={day}/")
        # BUG FIX: the west partition previously reused the us-east-1 path,
        # so its values disagreed with its on-disk location and both regions
        # were written under the same prefix — the prefix filter below was
        # never exercised against a distinct second prefix.
        partition_west = Partition(
            ["us-west-2", year, month, day],
            f"s3://{self.bucket}/{self.table}/region=us-west-2/year={year}/month={month}/day={day}/")
        self.helper.write_partition_to_s3(partition_east)
        self.helper.write_partition_to_s3(partition_west)
        partitions.append(partition_east)
        partitions.append(partition_west)

    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    found_partitions = partitioner.partitions_on_disk(
        limit_days=4, prefix_partitions=["us-east-1"])

    # Four days, restricted to the us-east-1 prefix only.
    found_partitions.should.have.length_of(4)

    to_be_found = []
    for i in range(1, 5):
        partition_date = today.subtract(days=i)
        year = partition_date.strftime("%Y")
        month = partition_date.strftime("%m")
        day = partition_date.strftime("%d")
        to_be_found.append(Partition(
            ["us-east-1", year, month, day],
            f"s3://{self.bucket}/{self.table}/region=us-east-1/year={year}/month={month}/day={day}/"))

    set(found_partitions).should.equal(set(to_be_found))