def test_delete_all_partitions_dry_run(self):
    """delete-all-partitions with --dry-run prints every partition it
    would remove but leaves all of them in place."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    cli = Cli()

    partitions = sorted(self.helper.create_many_partitions(count=10))
    partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    partitioner.create_partitions(partitions)

    # The command lists each partition on its own tab-indented line.
    expected_out = "Deleting the following partitions:" + "".join(
        f"\n\t{partition}" for partition in partitions)

    out, err = self.get_cmd_output(
        cli,
        ["delete-all-partitions", self.database, self.table, "--dry-run"])
    out.should.equal(expected_out)

    # Dry run: nothing may actually be deleted.
    found_partitions = partitioner.existing_partitions()
    found_partitions.should.have.length_of(len(partitions))
def test_create_partitions_limit_days(self):
    """create-partitions with --limit-days=7 only registers partitions
    dated within the last seven days (3 of the 10 on disk are older)."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    cli = Cli()

    today = pendulum.now()
    partitions = []
    for days_ago in range(1, 11):
        stamp = today.subtract(days=days_ago)
        year = stamp.strftime("%Y")
        month = stamp.strftime("%m")
        day = stamp.strftime("%d")
        hour = "03"
        new_partition = Partition(
            [year, month, day, hour],
            f"s3://{self.bucket}/{self.table}/{year}/{month}/{day}/{hour}/")
        self.helper.write_partition_to_s3(new_partition)
        partitions.append(new_partition)
    partitions.sort()

    # Oldest three (days 8-10 ago) fall outside the 7-day window.
    expected_output = (
        f"Running Partitioner for {self.database}.{self.table}\n"
        f"\tLooking for partitions in s3://{self.bucket}/{self.table}/\n"
        f"\tFound 7 new partitions to create\n\t")
    expected_output += ", ".join(map(str, partitions[3:]))

    out, err = self.get_cmd_output(
        cli,
        ["create-partitions", self.database, self.table, "--limit-days=7"])
    out.should.equal(expected_output)

    partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    found = partitioner.existing_partitions()
    found.should.have.length_of(7)
    set(found).should.equal(set(partitions[3:]))
def test_delete_bad_partitions(self):
    """delete-bad-partitions removes partitions whose location lies
    outside the table's prefix, and reports each one it deletes."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    cli = Cli()

    # Partitions written under a different prefix than the table's path.
    partitions = sorted(
        self.helper.create_many_partitions(count=10, prefix="not-this-table"))
    partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    partitioner.create_partitions(partitions)

    expected_out = (
        "Found 10 partitions to delete\n"
        "Deleting the following partitions:") + "".join(
        f"\n\t{partition}" for partition in partitions)

    out, err = self.get_cmd_output(
        cli, ["delete-bad-partitions", self.database, self.table])
    out.should.equal(expected_out)

    # All bad partitions must be gone afterwards.
    partitioner.existing_partitions().should.have.length_of(0)
def get_partitioner(self, args):
    """Construct a Partitioner for args.database/args.table.

    On GlutilError, print the error message augmented with a hint
    matched to the error_type, then exit the process with status 1.
    """
    try:
        return Partitioner(args.database, args.table, aws_profile=args.profile)
    except GlutilError as e:
        message = e.message
        # error_type is a single value, so the checks are exclusive.
        if e.error_type == "ProfileNotFound":
            message += f"\n\tConfirm that {args.profile} is a locally configured aws profile."
        elif e.error_type == "AccessDenied":
            if args.profile:
                message += f"\n\tConfirm that {args.profile} has the glue:GetTable permission."
            else:
                message += "\n\tDid you mean to run this with a profile specified?"
        elif e.error_type == "EntityNotFound":
            message += f"\n\tConfirm {args.table} exists, and you have the ability to access it."
        print(message)
        sys.exit(1)
def test_partitions_to_create(self):
    """partitions_to_create() returns only the on-disk partitions that
    are not already registered in the catalog."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()

    # Ten partitions already in the catalog, three only on disk.
    already_created = self.helper.create_many_partitions(count=10, write=True)
    to_create = self.helper.create_many_partitions(count=3, write=True)

    partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    partitioner.create_partitions(already_created)

    on_disk = partitioner.partitions_on_disk()
    wants_to_create = partitioner.partitions_to_create(on_disk)
    set(wants_to_create).should.equal(set(to_create))
def test_find_partitions_with_limit_bad_partition_keys(self):
    """Partitioner.partitions_on_disk, limit_days set, on a single-partition table raises an error"""
    self.s3.create_bucket(Bucket=self.bucket)

    # Build the database and a table with only one partition key ("dt"),
    # which is incompatible with limit_days filtering.
    db_input = self.helper.create_database_input()
    self.glue.create_database(**db_input)
    table_input = self.helper.create_table_input(
        location=f"s3://{self.bucket}/{self.table}/")
    table_input["TableInput"]["PartitionKeys"] = [{"Name": "dt", "Type": "string"}]
    self.glue.create_table(**table_input)

    partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    partitioner.partitions_on_disk.when.called_with(
        limit_days=4).should.have.raised(TypeError)
def test_update_partition_storage_descriptors(self):
    """Partitioner.update_storage_descriptors() updates the storage descriptors of all partitions"""
    self.helper.make_database_and_table()
    partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    partitioner.create_partitions(
        self.helper.create_many_partitions(write=False))

    # Change the table's columns, then expect partitions to follow suit.
    columns = [
        {"Name": name, "Type": "string"}
        for name in ("foo", "bar", "only-in-this-test")
    ]

    table = partitioner.glue.get_table(
        DatabaseName=self.database, Name=self.table)["Table"]
    # get_table returns read-only fields that update_table rejects.
    for key in ("DatabaseName", "CreateTime", "CreatedBy",
                "IsRegisteredWithLakeFormation", "CatalogId"):
        table.pop(key, None)
    table["StorageDescriptor"]["Columns"] = columns
    partitioner.glue.update_table(DatabaseName=self.database, TableInput=table)

    errors = partitioner.update_partition_storage_descriptors()
    errors.should.have.length_of(0)

    for partition in partitioner.existing_partitions():
        partition.raw["StorageDescriptor"]["Columns"].should.equal(columns)
def test_delete_partitions(self):
    """delete_partitions() issues batch_delete_partition with the values
    of every partition it was asked to remove."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    self.helper.create_partition_data()
    partition = self.helper.create_partition_data()

    partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    partitioner.create_partitions([partition])

    # Stub the glue call so we can inspect the exact request payload.
    batch_delete_stub = MagicMock(return_value=[])
    partitioner.glue.batch_delete_partition = batch_delete_stub

    to_delete = partitioner.existing_partitions()
    partitioner.delete_partitions(to_delete)

    batch_delete_stub.assert_called_with(
        DatabaseName=self.database,
        TableName=self.table,
        PartitionsToDelete=[{"Values": to_delete[0].values}])
def test_find_partitions_with_limit_days_and_prefix(self):
    """Partitioner.partitions_on_disk() with limit_days and prefix_partitions
    should find preceding partitions with hive-format names.

    Fix: the us-west-2 partitions were previously created with a
    region=us-east-1 S3 location (copy-paste bug), so all data landed
    under the filtered prefix and prefix_partitions was never actually
    exercised against out-of-prefix data. The west partitions now live
    under region=us-west-2, so the filter must genuinely exclude them.
    """
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database()
    self.helper.make_table(partition_keys=[
        {"Name": "region", "Type": "string"},
        {"Name": "year", "Type": "int"},
        {"Name": "month", "Type": "int"},
        {"Name": "day", "Type": "int"},
    ])

    today = pendulum.now()

    def location(region, year, month, day):
        # Hive-style partition path under the table's S3 prefix.
        return (f"s3://{self.bucket}/{self.table}/"
                f"region={region}/year={year}/month={month}/day={day}/")

    partitions = []
    for i in range(1, 11):
        partition_date = today.subtract(days=i)
        year = partition_date.strftime("%Y")
        month = partition_date.strftime("%m")
        day = partition_date.strftime("%d")
        for region in ("us-east-1", "us-west-2"):
            partition = Partition(
                [region, year, month, day],
                location(region, year, month, day))
            self.helper.write_partition_to_s3(partition)
            partitions.append(partition)

    partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    found_partitions = partitioner.partitions_on_disk(
        limit_days=4, prefix_partitions=["us-east-1"])

    # Only us-east-1 partitions from the last 4 days qualify.
    found_partitions.should.have.length_of(4)

    to_be_found = []
    for i in range(1, 5):
        partition_date = today.subtract(days=i)
        year = partition_date.strftime("%Y")
        month = partition_date.strftime("%m")
        day = partition_date.strftime("%d")
        to_be_found.append(
            Partition(["us-east-1", year, month, day],
                      location("us-east-1", year, month, day)))

    set(found_partitions).should.equal(set(to_be_found))