def test_delete_partitions_in_groups_of_twenty_five(self):
    """delete_partitions() batches calls to batch_delete_partition in
    chunks of 25, so 30 partitions require exactly two API calls."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    partitions = sorted(self.helper.create_many_partitions(count=30))

    partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    partitioner.create_partitions(partitions)

    # Stub out the Glue API so we can inspect how deletion was batched.
    delete_mock = MagicMock(return_value=[])
    partitioner.glue.batch_delete_partition = delete_mock

    partitioner.delete_partitions(partitioner.existing_partitions())

    first_batch = [{"Values": p.values} for p in partitions[:25]]
    second_batch = [{"Values": p.values} for p in partitions[25:]]
    expected_calls = [
        call(
            DatabaseName=self.database,
            TableName=self.table,
            PartitionsToDelete=first_batch,
        ),
        call(
            DatabaseName=self.database,
            TableName=self.table,
            PartitionsToDelete=second_batch,
        ),
    ]

    delete_mock.call_count.should.equal(2)
    delete_mock.assert_has_calls(expected_calls)
def test_create_partitions_error_output(self):
    """Errors returned while creating partitions are reported and exit(1).

    Technically this should _never_ happen, but on the off chance that
    batch_get_partition ever returns bad values we keep this covered.
    """
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    partitions = self.helper.create_many_partitions(count=10)
    partitions.sort()

    # Build the exact console output we expect, including the error line
    # for the partition that already exists.
    expected_output = f"Running Partitioner for {self.database}.{self.table}\n\tLooking for partitions in s3://{self.bucket}/{self.table}/\n\tFound 10 new partitions to create\n\t"
    expected_output += ", ".join(map(str, partitions))
    expected_output += f"\nOne or more errors occurred when attempting to create partitions\nError on {partitions[0].values}: AlreadyExistsException"

    partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    # Pre-create one partition so recreating it raises AlreadyExistsException.
    partitioner.create_partitions([partitions[0]])

    # Force the partitioner to attempt creating every partition again.
    to_create_mock = MagicMock(return_value=partitions)
    partitioner.partitions_to_create = to_create_mock

    with captured_output() as (out, err):
        create_found_partitions(partitioner)

    out.getvalue().strip().should.equal(expected_output)
    self.exit_mock.assert_called_with(1)

    # Despite the error, every partition should now exist in the catalog.
    fresh_partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    set(fresh_partitioner.existing_partitions()).should.equal(set(partitions))
def test_update_partitions(self):
    """The update-partitions CLI command reports and repairs partitions
    whose catalog locations point at an old bucket."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    cli = Cli()

    partitions = self.helper.create_many_partitions(10)
    partitions.sort()

    partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    partitioner.create_partitions(partitions)

    # Point the first five partitions at a stale bucket/table location.
    expected_output = "Found 5 moved partitions"
    moved = partitions[0:5]
    for moved_partition in moved:
        suffix = "/".join(moved_partition.values)
        moved_partition.location = f"s3://old-bucket/old-table/{suffix}/"
        expected_output += f"\n\t{moved_partition}"
    partitioner.update_partition_locations(moved)

    out, err = self.get_cmd_output(
        cli, ["update-partitions", self.database, self.table])
    out.should.equal(expected_output)

    # Every moved partition should now resolve back to the current bucket.
    found_map = PartitionMap(partitioner.existing_partitions())
    for moved_partition in moved:
        found = found_map.get(moved_partition)
        found.should_not.be.false
        found.location.startswith(
            f"s3://{self.bucket}/{self.table}/").should.be.true
def test_delete_missing_partitions(self):
    """delete-missing-partitions removes catalog entries whose backing
    S3 objects have been deleted."""
    self.helper.make_database_and_table()
    cli = Cli()
    self.s3.create_bucket(Bucket=self.bucket)

    partitions = self.helper.create_many_partitions(count=10)
    partitions.sort()

    partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    partitioner.create_partitions(partitions)

    # Empty the bucket so every catalog partition is now dangling.
    for s3_object in boto3.resource("s3").Bucket(self.bucket).objects.all():
        s3_object.delete()

    expected_out = "Found 10 partitions to delete:"
    for partition in partitions:
        expected_out += f"\n\t{partition}"

    out, err = self.get_cmd_output(
        cli, ["delete-missing-partitions", self.database, self.table])
    out.should.equal(expected_out)

    partitioner.existing_partitions().should.have.length_of(0)
def test_delete_bad_partitions_dry_run(self):
    """delete-bad-partitions --dry-run lists mis-located partitions but
    leaves the catalog untouched."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    cli = Cli()

    # These partitions live under a prefix that does not match the table,
    # which makes them "bad" from the partitioner's point of view.
    partitions = self.helper.create_many_partitions(
        count=10, prefix="not-this-table")
    partitions.sort()

    partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    partitioner.create_partitions(partitions)

    expected_out = "Found 10 partitions to delete\nDeleting the following partitions:"
    for partition in partitions:
        expected_out += f"\n\t{str(partition)}"

    out, err = self.get_cmd_output(
        cli, ["delete-bad-partitions", self.database, self.table, "--dry-run"])
    out.should.equal(expected_out)

    # Dry run: nothing was actually deleted.
    partitioner.existing_partitions().should.have.length_of(10)
def test_find_partitions_in_glue_catalog(self):
    """existing_partitions() returns the partition that was just created,
    with matching values and location."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    created = self.helper.create_partition_data()

    partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    partitioner.create_partitions([created])

    found = partitioner.existing_partitions()
    found.should.have.length_of(1)
    found[0].values.should.equal(created.values)
    found[0].location.should.equal(created.location)
def test_update_partition_storage_descriptors(self):
    """Partitioner.update_storage_descriptors() pushes the table's column
    definitions down to the storage descriptor of every partition."""
    self.helper.make_database_and_table()
    partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    partitioner.create_partitions(
        self.helper.create_many_partitions(write=False))

    # Replace the table's columns so the partitions are now out of date.
    new_columns = [
        {"Name": "foo", "Type": "string"},
        {"Name": "bar", "Type": "string"},
        {"Name": "only-in-this-test", "Type": "string"},
    ]
    table = partitioner.glue.get_table(
        DatabaseName=self.database, Name=self.table)["Table"]
    # get_table returns read-only fields that update_table rejects.
    read_only_keys = [
        "DatabaseName", "CreateTime", "CreatedBy",
        "IsRegisteredWithLakeFormation", "CatalogId",
    ]
    for read_only_key in read_only_keys:
        if read_only_key in table:
            del table[read_only_key]
    table["StorageDescriptor"]["Columns"] = new_columns
    partitioner.glue.update_table(DatabaseName=self.database, TableInput=table)

    errors = partitioner.update_partition_storage_descriptors()
    errors.should.have.length_of(0)

    for partition in partitioner.existing_partitions():
        partition.raw["StorageDescriptor"]["Columns"].should.equal(new_columns)
def test_delete_missing_partitions_no_partitions(self):
    """delete-missing-partitions is a no-op when every catalog partition
    still has its data in S3."""
    self.helper.make_database_and_table()
    cli = Cli()
    self.s3.create_bucket(Bucket=self.bucket)

    partitions = self.helper.create_many_partitions(count=10)
    partitions.sort()

    partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    partitioner.create_partitions(partitions)

    out, err = self.get_cmd_output(
        cli, ["delete-missing-partitions", self.database, self.table])
    out.should.equal("Found 0 partitions to delete:")

    # All ten partitions survive.
    partitioner.existing_partitions().should.have.length_of(10)
def test_create_partitions_dry_run(self):
    """create_found_partitions(dry_run=True) prints what it would create
    without writing anything to the catalog."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    partitions = self.helper.create_many_partitions(count=10)
    partitions.sort()

    expected_output = f"Running Partitioner for {self.database}.{self.table}\n\tLooking for partitions in s3://{self.bucket}/{self.table}/\n\tFound 10 new partitions to create\n\t"
    expected_output += ", ".join(map(str, partitions))

    partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    with captured_output() as (out, err):
        create_found_partitions(partitioner, dry_run=True)

    out.getvalue().strip().should.equal(expected_output)

    # Dry run: no partitions were actually created.
    partitioner.existing_partitions().should.have.length_of(0)
def test_create_partitions_dry_run_cli(self):
    """create-partitions --dry-run (CLI) prints the partitions it would
    create without writing anything to the catalog.

    Renamed from ``test_create_partitions_dry_run``: that name was a
    duplicate of the non-CLI test, so Python silently shadowed one of
    the two definitions and only one ever ran.
    """
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    cli = Cli()
    partitions = self.helper.create_many_partitions(count=10)
    partitions.sort()

    expected_output = f"Running Partitioner for {self.database}.{self.table}\n\tLooking for partitions in s3://{self.bucket}/{self.table}/\n\tFound 10 new partitions to create\n\t"
    expected_output += ", ".join(map(str, partitions))

    out, err = self.get_cmd_output(
        cli, ["create-partitions", self.database, self.table, "--dry-run"])
    out.should.equal(expected_output)

    # Dry run: no partitions were actually created.
    partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    partitioner.existing_partitions().should.have.length_of(0)
def test_delete_all_partitions(self):
    """delete-all-partitions lists and removes every partition in the
    catalog for the table."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    cli = Cli()

    partitions = self.helper.create_many_partitions(count=10)
    partitions.sort()

    partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    partitioner.create_partitions(partitions)

    expected_out = "Deleting the following partitions:"
    for partition in partitions:
        expected_out += f"\n\t{str(partition)}"

    out, err = self.get_cmd_output(
        cli, ["delete-all-partitions", self.database, self.table])
    out.should.equal(expected_out)

    partitioner.existing_partitions().should.have.length_of(0)
def test_delete_partitions(self):
    """delete_partitions() issues a single batch_delete_partition call
    with the values of the partition being removed."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    # NOTE(review): the first create_partition_data() call's return value
    # is unused; presumably it only seeds S3 data — confirm with helper.
    self.helper.create_partition_data()
    partition = self.helper.create_partition_data()

    partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    partitioner.create_partitions([partition])

    # Stub out the Glue API so the call arguments can be inspected.
    delete_mock = MagicMock(return_value=[])
    partitioner.glue.batch_delete_partition = delete_mock

    to_delete = partitioner.existing_partitions()
    partitioner.delete_partitions(to_delete)

    delete_mock.assert_called_with(
        DatabaseName=self.database,
        TableName=self.table,
        PartitionsToDelete=[{"Values": to_delete[0].values}],
    )
def test_create_partitions_limit_days(self):
    """create-partitions --limit-days=7 only creates partitions dated
    within the last seven days."""
    self.s3.create_bucket(Bucket=self.bucket)
    self.helper.make_database_and_table()
    cli = Cli()

    # Write one partition per day for the past ten days, all at hour 03.
    today = pendulum.now()
    partitions = []
    for days_ago in range(1, 11):
        partition_date = today.subtract(days=days_ago)
        year = partition_date.strftime("%Y")
        month = partition_date.strftime("%m")
        day = partition_date.strftime("%d")
        hour = "03"
        partition = Partition(
            [year, month, day, hour],
            f"s3://{self.bucket}/{self.table}/{year}/{month}/{day}/{hour}/")
        self.helper.write_partition_to_s3(partition)
        partitions.append(partition)
    partitions.sort()

    # Sorted ascending by date, the oldest three fall outside the window.
    expected_output = f"Running Partitioner for {self.database}.{self.table}\n\tLooking for partitions in s3://{self.bucket}/{self.table}/\n\tFound 7 new partitions to create\n\t"
    expected_output += ", ".join(map(str, partitions[3:]))

    out, err = self.get_cmd_output(
        cli, ["create-partitions", self.database, self.table, "--limit-days=7"])
    out.should.equal(expected_output)

    partitioner = Partitioner(self.database, self.table, aws_region=self.region)
    found = partitioner.existing_partitions()
    found.should.have.length_of(7)
    set(found).should.equal(set(partitions[3:]))