Example #1
0
    def test_delete_all_partitions_dry_run(self):
        """delete-all-partitions --dry-run lists the partitions but deletes none of them."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()
        cli = Cli()

        partitions = self.helper.create_many_partitions(count=10)
        partitions.sort()
        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        partitioner.create_partitions(partitions)

        # Header line, then every partition on its own tab-indented line.
        expected_out = "\n\t".join(
            ["Deleting the following partitions:", *map(str, partitions)])

        command = [
            "delete-all-partitions", self.database, self.table, "--dry-run"
        ]
        out, _ = self.get_cmd_output(cli, command)
        out.should.equal(expected_out)

        # A dry run must leave every partition in place.
        remaining = partitioner.existing_partitions()
        remaining.should.have.length_of(len(partitions))
Example #2
0
    def test_create_partitions_limit_days(self):
        """create-partitions --limit-days=7 creates only 7 of the 10 partitions written to S3."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()
        cli = Cli()

        now = pendulum.now()
        hour = "03"

        # Write one partition for each of the ten days before today.
        partitions = []
        for days_ago in range(1, 11):
            stamp = now.subtract(days=days_ago)
            year, month, day = stamp.strftime("%Y %m %d").split()
            location = f"s3://{self.bucket}/{self.table}/{year}/{month}/{day}/{hour}/"
            partition = Partition([year, month, day, hour], location)
            self.helper.write_partition_to_s3(partition)
            partitions.append(partition)

        # After sorting, everything past the first three entries is expected
        # to be picked up by the seven-day limit.
        partitions.sort()
        expected_partitions = partitions[3:]

        expected_output = (
            f"Running Partitioner for {self.database}.{self.table}\n\t"
            f"Looking for partitions in s3://{self.bucket}/{self.table}/\n\t"
            "Found 7 new partitions to create\n\t"
        ) + ", ".join(str(p) for p in expected_partitions)

        command = [
            "create-partitions", self.database, self.table, "--limit-days=7"
        ]
        out, _ = self.get_cmd_output(cli, command)
        out.should.equal(expected_output)

        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        found = partitioner.existing_partitions()
        found.should.have.length_of(7)
        set(found).should.equal(set(expected_partitions))
Example #3
0
    def test_delete_bad_partitions(self):
        """delete-bad-partitions removes partitions created under a different table's prefix."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()
        cli = Cli()

        # These partitions are generated with a prefix other than this table's.
        partitions = self.helper.create_many_partitions(
            count=10, prefix="not-this-table")
        partitions.sort()

        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        partitioner.create_partitions(partitions)

        # Two header lines, then every partition on its own tab-indented line.
        header = "Found 10 partitions to delete\nDeleting the following partitions:"
        expected_out = "\n\t".join([header, *map(str, partitions)])

        command = ["delete-bad-partitions", self.database, self.table]
        out, _ = self.get_cmd_output(cli, command)
        out.should.equal(expected_out)

        remaining = partitioner.existing_partitions()
        remaining.should.have.length_of(0)
Example #4
0
    def get_partitioner(self, args):
        """Return a Partitioner for ``args.database`` / ``args.table``.

        The partitioner is built with ``args.profile`` as its AWS profile.
        If construction raises a GlutilError, the error's message is printed,
        extended with a troubleshooting hint for the ProfileNotFound,
        AccessDenied and EntityNotFound error types, and the process exits
        with status 1.
        """
        try:
            return Partitioner(
                args.database, args.table, aws_profile=args.profile)
        except GlutilError as err:
            message = err.message
            kind = err.error_type

            if kind == "ProfileNotFound":
                message += f"\n\tConfirm that {args.profile} is a locally configured aws profile."
            elif kind == "AccessDenied":
                # The hint depends on whether a profile was supplied at all.
                message += (
                    f"\n\tConfirm that {args.profile} has the glue:GetTable permission."
                    if args.profile else
                    "\n\tDid you mean to run this with a profile specified?"
                )
            elif kind == "EntityNotFound":
                message += f"\n\tConfirm {args.table} exists, and you have the ability to access it."

            print(message)
            sys.exit(1)
Example #5
0
    def test_partitions_to_create(self):
        """Partitioner.partitions_to_create() returns the on-disk partitions that were not already created."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()

        # Both batches are written out; only the first is registered below.
        registered = self.helper.create_many_partitions(count=10, write=True)
        unregistered = self.helper.create_many_partitions(count=3, write=True)

        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        partitioner.create_partitions(registered)

        on_disk = partitioner.partitions_on_disk()
        pending = partitioner.partitions_to_create(on_disk)

        set(pending).should.equal(set(unregistered))
Example #6
0
    def test_find_partitions_with_limit_bad_partition_keys(self):
        """Partitioner.partitions_on_disk, limit_days set,
            on a single-partition table raises an error"""
        self.s3.create_bucket(Bucket=self.bucket)
        self.glue.create_database(**self.helper.create_database_input())

        # Replace the table's partition keys with a single "dt" string key.
        table_location = f"s3://{self.bucket}/{self.table}/"
        table_input = self.helper.create_table_input(location=table_location)
        table_input["TableInput"]["PartitionKeys"] = [
            {"Name": "dt", "Type": "string"},
        ]
        self.glue.create_table(**table_input)

        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)

        limited_lookup = partitioner.partitions_on_disk.when.called_with(
            limit_days=4)
        limited_lookup.should.have.raised(TypeError)
Example #7
0
    def test_update_partition_storage_descriptors(self):
        """Partitioner.update_partition_storage_descriptors() updates the storage descriptors of all partitions"""
        self.helper.make_database_and_table()

        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        new_partitions = self.helper.create_many_partitions(write=False)
        partitioner.create_partitions(new_partitions)

        # Column set the table is switched to after its partitions exist.
        columns = [
            {"Name": column_name, "Type": "string"}
            for column_name in ("foo", "bar", "only-in-this-test")
        ]

        # Fetch the table and drop these keys (when present) before the
        # definition is handed back to update_table as a TableInput.
        response = partitioner.glue.get_table(
            DatabaseName=self.database, Name=self.table)
        table = response["Table"]
        dropped_keys = (
            "DatabaseName",
            "CreateTime",
            "CreatedBy",
            "IsRegisteredWithLakeFormation",
            "CatalogId",
        )
        for key in dropped_keys:
            table.pop(key, None)

        table["StorageDescriptor"]["Columns"] = columns
        partitioner.glue.update_table(
            DatabaseName=self.database, TableInput=table)

        errors = partitioner.update_partition_storage_descriptors()
        errors.should.have.length_of(0)

        # Every partition should now carry the table's new column list.
        for updated in partitioner.existing_partitions():
            updated.raw["StorageDescriptor"]["Columns"].should.equal(columns)
Example #8
0
    def test_delete_partitions(self):
        """Partitioner.delete_partitions() calls glue's batch_delete_partition with the partition's values."""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()
        # Return value deliberately unused; only the second partition below
        # is registered with the partitioner.
        self.helper.create_partition_data()

        registered = self.helper.create_partition_data()
        partitioner = Partitioner(
            self.database, self.table, aws_region=self.region)
        partitioner.create_partitions([registered])

        # Swap the glue client's delete call for a stub so its arguments can
        # be inspected.
        delete_stub = MagicMock(return_value=[])
        partitioner.glue.batch_delete_partition = delete_stub

        to_delete = partitioner.existing_partitions()
        partitioner.delete_partitions(to_delete)

        expected_payload = [{"Values": to_delete[0].values}]
        delete_stub.assert_called_with(
            DatabaseName=self.database,
            TableName=self.table,
            PartitionsToDelete=expected_payload)
Example #9
0
    def test_find_partitions_with_limit_days_and_prefix(self):
        """Partitioner.partitions_on_disk() with limit_days and prefix_partitions should find preceding partitions with hive-format names"""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database()
        self.helper.make_table(partition_keys=[
            {
                "Name": "region",
                "Type": "string"
            },
            {
                "Name": "year",
                "Type": "int"
            },
            {
                "Name": "month",
                "Type": "int"
            },
            {
                "Name": "day",
                "Type": "int"
            },
        ])

        today = pendulum.now()

        def make_partition(region, days_ago):
            """Build the hive-format Partition for ``region``, ``days_ago`` days before today."""
            partition_date = today.subtract(days=days_ago)
            year = partition_date.strftime("%Y")
            month = partition_date.strftime("%m")
            day = partition_date.strftime("%d")
            # The location is derived from the same region as the values, so
            # a us-west-2 partition can no longer point at the us-east-1
            # prefix (the copy/paste slip in the previous version).
            location = (
                f"s3://{self.bucket}/{self.table}/"
                f"region={region}/year={year}/month={month}/day={day}/")
            return Partition([region, year, month, day], location)

        # Write ten days of data for both regions, so the prefix filter has
        # partitions to exclude.
        for days_ago in range(1, 11):
            for region in ("us-east-1", "us-west-2"):
                self.helper.write_partition_to_s3(
                    make_partition(region, days_ago))

        partitioner = Partitioner(self.database,
                                  self.table,
                                  aws_region=self.region)
        found_partitions = partitioner.partitions_on_disk(
            limit_days=4, prefix_partitions=["us-east-1"])
        found_partitions.should.have.length_of(4)

        # Only the four most recent us-east-1 partitions should be returned.
        to_be_found = [
            make_partition("us-east-1", days_ago) for days_ago in range(1, 5)
        ]

        set(found_partitions).should.equal(set(to_be_found))