def test_create_partition_batches_by_one_hundred(self):
    """Partitioner.create_partitions should send Glue at most 100
    partition inputs per batch_create_partition call."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    partitions = sorted(self.helper.create_many_partitions(count=150))

    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    batch_mock = MagicMock(return_value=[])
    partitioner.glue.batch_create_partition = batch_mock

    partitioner.create_partitions(partitions)

    # 150 partitions -> two calls: the first 100, then the remaining 50.
    expected_calls = [
        call(DatabaseName=self.database,
             TableName=self.table,
             PartitionInputList=[
                 partitioner._partition_input(p) for p in chunk
             ])
        for chunk in (partitions[:100], partitions[100:])
    ]
    batch_mock.call_count.should.equal(2)
    batch_mock.assert_has_calls(expected_calls)
def test_create_partitions_error_output(self):
    """Technically this should _never_ happen, but on the off chance that
    batch_get_partition ever returns bad values we'll leave it in"""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()

    partitions = self.helper.create_many_partitions(count=10)
    partitions.sort()

    # The first partition gets pre-created below, so the batch create is
    # expected to report an AlreadyExistsException for it.
    expected_output = (
        f"Running Partitioner for {self.database}.{self.table}\n"
        f"\tLooking for partitions in s3://{self.bucket}/{self.table}/\n"
        "\tFound 10 new partitions to create\n\t"
        + ", ".join(str(p) for p in partitions)
        + "\nOne or more errors occurred when attempting to create partitions"
        f"\nError on {partitions[0].values}: AlreadyExistsException"
    )

    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    partitioner.create_partitions([partitions[0]])
    partitioner.partitions_to_create = MagicMock(return_value=partitions)

    with captured_output() as (out, err):
        create_found_partitions(partitioner)

    out.getvalue().strip().should.equal(expected_output)
    self.exit_mock.assert_called_with(1)

    # Despite the error, every partition should now exist in the catalog.
    fresh_partitioner = Partitioner(self.database, self.table,
                                    aws_region=self.region)
    set(fresh_partitioner.existing_partitions()).should.equal(set(partitions))
def test_delete_bad_partitions_dry_run(self):
    """`delete-bad-partitions --dry-run` lists the doomed partitions but
    leaves every one of them in place."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    cli = Cli()

    # Partitions written under a different table prefix count as "bad".
    bad_partitions = sorted(
        self.helper.create_many_partitions(count=10, prefix="not-this-table"))

    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    partitioner.create_partitions(bad_partitions)

    expected_out = (
        "Found 10 partitions to delete\nDeleting the following partitions:"
        + "".join(f"\n\t{p}" for p in bad_partitions)
    )

    out, err = self.get_cmd_output(
        cli,
        ["delete-bad-partitions", self.database, self.table, "--dry-run"])
    out.should.equal(expected_out)

    # Dry run: nothing was actually deleted.
    partitioner.existing_partitions().should.have.length_of(10)
def test_update_partitions_error_output(self):
    """Errors returned by update_partition_locations are echoed by the
    CLI, which then exits non-zero."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    cli = Cli()

    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    partition = self.helper.create_partition_data()
    partition.location = "s3://old-bucket/old-table/"
    partitioner.create_partitions([partition])

    failure = {
        "PartitionValues": partition.values,
        "ErrorDetail": {
            "ErrorCode": "PartitionNotFound",
            "ErrorMessage": "Partition not found",
        },
    }
    partitioner.update_partition_locations = MagicMock(return_value=[failure])
    cli.get_partitioner = MagicMock(return_value=partitioner)

    expected_output = (
        f"Found 1 moved partitions\n\t{partition}"
        "\nOne or more errors occurred when attempting to update partitions"
        f"\nError on {partition.values}: PartitionNotFound"
    )

    out, err = self.get_cmd_output(
        cli, ["update-partitions", self.database, self.table])
    out.should.equal(expected_output)
    self.exit_mock.assert_called_with(1)
def test_delete_missing_partitions(self):
    """Partitions whose S3 data has vanished are removed from the catalog."""
    self.helper.make_database_and_table()
    cli = Cli()
    self.s3.create_bucket(Bucket=self.bucket)

    partitions = sorted(self.helper.create_many_partitions(count=10))
    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    partitioner.create_partitions(partitions)

    # Wipe the backing objects so every partition is now "missing".
    bucket = boto3.resource("s3").Bucket(self.bucket)
    for obj in bucket.objects.all():
        obj.delete()

    expected_out = "Found 10 partitions to delete:" + "".join(
        f"\n\t{p}" for p in partitions)

    out, err = self.get_cmd_output(
        cli, ["delete-missing-partitions", self.database, self.table])
    out.should.equal(expected_out)

    partitioner.existing_partitions().should.have.length_of(0)
def test_update_partitions(self):
    """Moved partitions are detected and pointed back at their current
    S3 locations."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    cli = Cli()

    partitions = sorted(self.helper.create_many_partitions(10))
    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    partitioner.create_partitions(partitions)

    # Point the first five partitions at an old bucket to simulate a move.
    expected_output = "Found 5 moved partitions"
    moved = partitions[0:5]
    for partition in moved:
        suffix = "/".join(partition.values)
        partition.location = f"s3://old-bucket/old-table/{suffix}/"
        expected_output += f"\n\t{partition}"
    partitioner.update_partition_locations(moved)

    out, err = self.get_cmd_output(
        cli, ["update-partitions", self.database, self.table])
    out.should.equal(expected_output)

    # Every moved partition should now point back at the real bucket.
    found_map = PartitionMap(partitioner.existing_partitions())
    for partition in moved:
        match = found_map.get(partition)
        match.should_not.be.false
        match.location.startswith(
            f"s3://{self.bucket}/{self.table}/").should.be.true
def test_delete_partitions_in_groups_of_twenty_five(self):
    """Partitioner.delete_partitions should chunk Glue delete calls
    25 partitions at a time."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    partitions = sorted(self.helper.create_many_partitions(count=30))

    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    partitioner.create_partitions(partitions)

    delete_mock = MagicMock(return_value=[])
    partitioner.glue.batch_delete_partition = delete_mock

    partitioner.delete_partitions(partitioner.existing_partitions())

    # 30 partitions -> one call with 25, one with the remaining 5.
    expected_calls = [
        call(DatabaseName=self.database,
             TableName=self.table,
             PartitionsToDelete=[{"Values": p.values} for p in chunk])
        for chunk in (partitions[:25], partitions[25:])
    ]
    delete_mock.call_count.should.equal(2)
    delete_mock.assert_has_calls(expected_calls)
def test_delete_bad_partitions_error_output(self):
    """Errors from delete_partitions are echoed by the CLI, which then
    exits non-zero."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    cli = Cli()

    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    partition = self.helper.create_partition_data(prefix="not-this-table")
    partitioner.create_partitions([partition])

    failure = {
        "PartitionValues": partition.values,
        "ErrorDetail": {
            "ErrorCode": "PartitionNotFound",
            "ErrorMessage": "Partition not found",
        },
    }
    partitioner.delete_partitions = MagicMock(return_value=[failure])
    cli.get_partitioner = MagicMock(return_value=partitioner)

    expected_output = (
        "Found 1 partitions to delete\n"
        f"Deleting the following partitions:\n\t{partition}"
        "\nOne or more errors occurred when attempting to delete partitions"
        f"\nError on {partition.values}: PartitionNotFound"
    )

    out, err = self.get_cmd_output(
        cli, ["delete-bad-partitions", self.database, self.table])
    out.should.equal(expected_output)
    self.exit_mock.assert_called_with(1)
def test_find_partitions_in_glue_catalog(self):
    """existing_partitions returns exactly what was registered."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    partition = self.helper.create_partition_data()

    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    partitioner.create_partitions([partition])

    found = partitioner.existing_partitions()
    found.should.have.length_of(1)
    found[0].values.should.equal(partition.values)
    found[0].location.should.equal(partition.location)
def test_create_partition_when_partition_exists(self):
    """create_partitions surfaces AlreadyExistsException errors returned
    by Glue instead of swallowing them."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    partition = self.helper.create_partition_data()

    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    glue_response = {
        "Errors": [{
            "PartitionValues": partition.values,
            "ErrorDetail": {
                "ErrorCode": "AlreadyExistsException",
                "ErrorMessage": "Partition already exists",
            },
        }],
    }
    batch_mock = MagicMock(return_value=glue_response)
    partitioner.glue.batch_create_partition = batch_mock

    errors = partitioner.create_partitions([partition])

    batch_mock.assert_called_once()
    errors.should.have.length_of(1)
    errors[0]["PartitionValues"].should.equal(partition.values)
    errors[0]["ErrorDetail"]["ErrorCode"].should.equal(
        "AlreadyExistsException")
def test_update_partitions_no_partitions(self):
    """When every partition is already correctly located the CLI says so."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    cli = Cli()

    partitions = sorted(self.helper.create_many_partitions(10))
    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    partitioner.create_partitions(partitions)

    # All partitions are correctly located, so nothing needs updating.
    out, err = self.get_cmd_output(
        cli, ["update-partitions", self.database, self.table])
    out.should.equal("No partitions to update")
def test_partitions_to_create(self):
    """partitions_to_create returns only the partitions found on disk
    that are not yet registered in the catalog."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()

    registered = self.helper.create_many_partitions(count=10, write=True)
    unregistered = self.helper.create_many_partitions(count=3, write=True)

    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    partitioner.create_partitions(registered)

    on_disk = partitioner.partitions_on_disk()
    set(partitioner.partitions_to_create(on_disk)).should.equal(
        set(unregistered))
def test_create_partitions_nothing_new(self):
    """`create-partitions` reports zero new partitions when the catalog
    is already up to date."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    cli = Cli()

    partitions = sorted(self.helper.create_many_partitions(count=10))
    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    partitioner.create_partitions(partitions)

    expected_output = (
        f"Running Partitioner for {self.database}.{self.table}\n"
        f"\tLooking for partitions in s3://{self.bucket}/{self.table}/\n"
        "\tFound 0 new partitions to create"
    )

    out, err = self.get_cmd_output(
        cli, ["create-partitions", self.database, self.table])
    out.should.equal(expected_output)
def test_create_partitions_nothing_new_direct(self):
    """create_found_partitions prints 'Found 0 new partitions' when the
    catalog already contains every partition found on disk.

    Renamed from ``test_create_partitions_nothing_new``: a test method with
    that exact name already exists earlier in this file, and if both live in
    the same class the later definition silently replaces the earlier one,
    so one of the two tests would never run. The new name keeps both
    discoverable by pytest.
    """
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()

    partitions = self.helper.create_many_partitions(count=10)
    partitions.sort()

    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    partitioner.create_partitions(partitions)

    expected_output = (
        f"Running Partitioner for {self.database}.{self.table}\n"
        f"\tLooking for partitions in s3://{self.bucket}/{self.table}/\n"
        "\tFound 0 new partitions to create"
    )

    with captured_output() as (out, err):
        create_found_partitions(partitioner)

    output = out.getvalue().strip()
    output.should.equal(expected_output)
def test_update_partition_storage_descriptors(self):
    """Partitioner.update_storage_descriptors() updates the storage
    descriptors of all partitions"""
    self.helper.make_database_and_table()
    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    partitioner.create_partitions(
        self.helper.create_many_partitions(write=False))

    # Fetch the table, swap in a new column set, and push it back.
    new_columns = [
        {"Name": "foo", "Type": "string"},
        {"Name": "bar", "Type": "string"},
        {"Name": "only-in-this-test", "Type": "string"},
    ]
    table = partitioner.glue.get_table(
        DatabaseName=self.database, Name=self.table)["Table"]
    # Strip the read-only fields Glue rejects in a TableInput.
    for key in ("DatabaseName", "CreateTime", "CreatedBy",
                "IsRegisteredWithLakeFormation", "CatalogId"):
        table.pop(key, None)
    table["StorageDescriptor"]["Columns"] = new_columns
    partitioner.glue.update_table(DatabaseName=self.database,
                                  TableInput=table)

    errors = partitioner.update_partition_storage_descriptors()
    errors.should.have.length_of(0)

    # Every partition should now carry the table's new column set.
    for partition in partitioner.existing_partitions():
        partition.raw["StorageDescriptor"]["Columns"].should.equal(
            new_columns)
def test_delete_missing_partitions_no_partitions(self):
    """With all partition data still present the command deletes nothing."""
    self.helper.make_database_and_table()
    cli = Cli()
    self.s3.create_bucket(Bucket=self.bucket)

    partitions = sorted(self.helper.create_many_partitions(count=10))
    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    partitioner.create_partitions(partitions)

    out, err = self.get_cmd_output(
        cli, ["delete-missing-partitions", self.database, self.table])
    out.should.equal("Found 0 partitions to delete:")

    # Everything should still be registered in the catalog.
    partitioner.existing_partitions().should.have.length_of(10)
def test_create_partitions(self):
    """create_partitions forwards the partition input list to Glue's
    batch_create_partition."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    partition = self.helper.create_partition_data()

    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    batch_mock = MagicMock(return_value=[])
    partitioner.glue.batch_create_partition = batch_mock

    partitioner.create_partitions([partition])

    batch_mock.assert_called_with(
        DatabaseName=self.database,
        TableName=self.table,
        PartitionInputList=[partitioner._partition_input(partition)])
def test_delete_all_partitions(self):
    """`delete-all-partitions` removes every partition and lists each one."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    cli = Cli()

    partitions = sorted(self.helper.create_many_partitions(count=10))
    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    partitioner.create_partitions(partitions)

    expected_out = "Deleting the following partitions:" + "".join(
        f"\n\t{p}" for p in partitions)

    out, err = self.get_cmd_output(
        cli, ["delete-all-partitions", self.database, self.table])
    out.should.equal(expected_out)

    partitioner.existing_partitions().should.have.length_of(0)
def test_delete_partitions(self):
    """delete_partitions passes each partition's values to Glue's
    batch_delete_partition."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    # The first partition's data is written but never registered; only the
    # second is created in the catalog.
    self.helper.create_partition_data()
    partition = self.helper.create_partition_data()

    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)
    partitioner.create_partitions([partition])

    delete_mock = MagicMock(return_value=[])
    partitioner.glue.batch_delete_partition = delete_mock

    to_delete = partitioner.existing_partitions()
    partitioner.delete_partitions(to_delete)

    delete_mock.assert_called_with(
        DatabaseName=self.database,
        TableName=self.table,
        PartitionsToDelete=[{"Values": to_delete[0].values}])
def test_create_partition_already_exists_in_multiple_batches(self):
    """AlreadyExists errors are collected across every 100-partition batch,
    not just the first one."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    partitions = sorted(self.helper.create_many_partitions(count=150))

    partitioner = Partitioner(self.database, self.table,
                              aws_region=self.region)

    # prime partitions list with two partitions, one in each group
    already_exists = [partitions[5], partitions[115]]
    errors = partitioner.create_partitions(already_exists)
    errors.should.be.empty

    # now attempt to create them as part of a large batch
    errors = partitioner.create_partitions(partitions)
    errors.should.have.length_of(2)
    for error, partition in zip(errors, already_exists):
        error["PartitionValues"].should.equal(partition.values)
        error["ErrorDetail"]["ErrorCode"].should.equal(
            "AlreadyExistsException")