Example #1
    def test_delete_partitions_in_groups_of_twenty_five(self):
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()

        partitions = sorted(self.helper.create_many_partitions(count=30))

        partitioner = Partitioner(self.database,
                                  self.table,
                                  aws_region=self.region)
        partitioner.create_partitions(partitions)

        mock = MagicMock(return_value=[])
        partitioner.glue.batch_delete_partition = mock

        existing_partitions = partitioner.existing_partitions()
        partitioner.delete_partitions(existing_partitions)

        first_list = [{"Values": p.values} for p in partitions[:25]]
        second_list = [{"Values": p.values} for p in partitions[25:]]
        calls = [
            call(DatabaseName=self.database,
                 TableName=self.table,
                 PartitionsToDelete=first_list),
            call(DatabaseName=self.database,
                 TableName=self.table,
                 PartitionsToDelete=second_list),
        ]

        mock.call_count.should.equal(2)
        mock.assert_has_calls(calls)
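This test exercises the batching behavior: Glue's BatchDeletePartition API accepts at most 25 partitions per request, so larger lists have to be split into chunks. A minimal sketch of that chunking, assuming a boto3 Glue client and partition objects with a `.values` attribute (the `delete_in_batches` helper is illustrative, not the library's actual internals):

import boto3

BATCH_SIZE = 25  # BatchDeletePartition accepts at most 25 partitions per call

def delete_in_batches(glue, database, table, partitions):
    """Delete partitions in chunks of 25, collecting any per-item errors."""
    errors = []
    for start in range(0, len(partitions), BATCH_SIZE):
        chunk = partitions[start:start + BATCH_SIZE]
        response = glue.batch_delete_partition(
            DatabaseName=database,
            TableName=table,
            PartitionsToDelete=[{"Values": p.values} for p in chunk])
        errors.extend(response.get("Errors", []))
    return errors

# Usage sketch:
# glue = boto3.client("glue", region_name="us-east-1")
# delete_in_batches(glue, "my_database", "my_table", partitions)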
Example #2
    def test_create_partitions_error_output(self):
        """ Technically this should _never_ happen, but on the off chance that
        batch_get_partition ever returns bad values we'll leave it in"""
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()

        partitions = self.helper.create_many_partitions(count=10)
        partitions.sort()

        expected_output = f"Running Partitioner for {self.database}.{self.table}\n\tLooking for partitions in s3://{self.bucket}/{self.table}/\n\tFound 10 new partitions to create\n\t"
        expected_output += ", ".join(map(str, partitions))
        expected_output += f"\nOne or more errors occurred when attempting to create partitions\nError on {partitions[0].values}: AlreadyExistsException"

        partitioner = Partitioner(self.database,
                                  self.table,
                                  aws_region=self.region)
        partitioner.create_partitions([partitions[0]])
        mock = MagicMock(return_value=partitions)
        partitioner.partitions_to_create = mock

        with captured_output() as (out, err):
            create_found_partitions(partitioner)
        output = out.getvalue().strip()
        output.should.equal(expected_output)
        self.exit_mock.assert_called_with(1)

        fresh_partitioner = Partitioner(self.database,
                                        self.table,
                                        aws_region=self.region)
        exists = fresh_partitioner.existing_partitions()

        set(exists).should.equal(set(partitions))
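`captured_output` is a helper from this test suite. For readers who want to reproduce these examples, one common way to write such a context manager looks like the following (an assumption about its shape, not the suite's exact code):

import sys
from contextlib import contextmanager
from io import StringIO

@contextmanager
def captured_output():
    """Temporarily replace stdout/stderr with StringIO buffers."""
    new_out, new_err = StringIO(), StringIO()
    old_out, old_err = sys.stdout, sys.stderr
    try:
        sys.stdout, sys.stderr = new_out, new_err
        yield new_out, new_err
    finally:
        sys.stdout, sys.stderr = old_out, old_err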
Example #3
    def test_update_partitions(self):
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()
        cli = Cli()

        partitions = self.helper.create_many_partitions(10)
        partitions.sort()
        partitioner = Partitioner(self.database,
                                  self.table,
                                  aws_region=self.region)
        partitioner.create_partitions(partitions)

        expected_output = "Found 5 moved partitions"
        partitions_to_move = partitions[0:5]
        for p in partitions_to_move:
            subpath = "/".join(p.values)
            new_location = f"s3://old-bucket/old-table/{subpath}/"
            p.location = new_location
            expected_output += f"\n\t{p}"

        partitioner.update_partition_locations(partitions_to_move)

        out, err = self.get_cmd_output(
            cli, ["update-partitions", self.database, self.table])
        out.should.equal(expected_output)

        found_map = PartitionMap(partitioner.existing_partitions())
        for partition in partitions_to_move:
            matching = found_map.get(partition)
            matching.should_not.be.false
            matching.location.startswith(
                f"s3://{self.bucket}/{self.table}/").should.be.true
Example #4
    def test_delete_missing_partitions(self):
        self.helper.make_database_and_table()
        cli = Cli()

        self.s3.create_bucket(Bucket=self.bucket)
        partitions = self.helper.create_many_partitions(count=10)
        partitions.sort()

        partitioner = Partitioner(self.database,
                                  self.table,
                                  aws_region=self.region)
        partitioner.create_partitions(partitions)

        s3resource = boto3.resource("s3")
        bucket = s3resource.Bucket(self.bucket)
        for obj in bucket.objects.all():
            obj.delete()

        expected_out = "Found 10 partitions to delete:"
        for partition in partitions:
            expected_out += f"\n\t{partition}"

        out, err = self.get_cmd_output(
            cli, ["delete-missing-partitions", self.database, self.table])
        out.should.equal(expected_out)

        found_partitions = partitioner.existing_partitions()
        found_partitions.should.have.length_of(0)
Example #5
    def test_delete_bad_partitions_dry_run(self):
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()
        cli = Cli()

        partitions = self.helper.create_many_partitions(
            count=10, prefix="not-this-table")
        partitions.sort()

        partitioner = Partitioner(self.database,
                                  self.table,
                                  aws_region=self.region)
        partitioner.create_partitions(partitions)

        expected_out = "Found 10 partitions to delete\nDeleting the following partitions:"
        for partition in partitions:
            expected_out += f"\n\t{str(partition)}"

        out, err = self.get_cmd_output(
            cli,
            ["delete-bad-partitions", self.database, self.table, "--dry-run"])
        out.should.equal(expected_out)

        found_partitions = partitioner.existing_partitions()
        found_partitions.should.have.length_of(10)
Example #6
    def test_find_partitions_in_glue_catalog(self):
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()

        partition = self.helper.create_partition_data()
        partitioner = Partitioner(self.database,
                                  self.table,
                                  aws_region=self.region)
        partitioner.create_partitions([partition])

        existing_partitions = partitioner.existing_partitions()
        existing_partitions.should.have.length_of(1)
        existing_partitions[0].values.should.equal(partition.values)
        existing_partitions[0].location.should.equal(partition.location)
Example #7
    def test_update_partition_storage_descriptors(self):
        """Partitioner.update_storage_descriptors() updates the storage descriptors of all partitions"""
        self.helper.make_database_and_table()

        partitioner = Partitioner(self.database,
                                  self.table,
                                  aws_region=self.region)
        partitioner.create_partitions(
            self.helper.create_many_partitions(write=False))

        # get and update table
        columns = [
            {
                "Name": "foo",
                "Type": "string"
            },
            {
                "Name": "bar",
                "Type": "string"
            },
            {
                "Name": "only-in-this-test",
                "Type": "string"
            },
        ]

        table = partitioner.glue.get_table(DatabaseName=self.database,
                                           Name=self.table)["Table"]
        for key in [
                "DatabaseName", "CreateTime", "CreatedBy",
                "IsRegisteredWithLakeFormation", "CatalogId"
        ]:
            if key in table:
                del table[key]

        table["StorageDescriptor"]["Columns"] = columns
        partitioner.glue.update_table(DatabaseName=self.database,
                                      TableInput=table)

        errors = partitioner.update_partition_storage_descriptors()
        errors.should.have.length_of(0)

        for partition in partitioner.existing_partitions():
            partition.raw["StorageDescriptor"]["Columns"].should.equal(columns)
Example #8
    def test_delete_missing_partitions_no_partitions(self):
        self.helper.make_database_and_table()
        cli = Cli()

        self.s3.create_bucket(Bucket=self.bucket)
        partitions = self.helper.create_many_partitions(count=10)
        partitions.sort()

        partitioner = Partitioner(self.database,
                                  self.table,
                                  aws_region=self.region)
        partitioner.create_partitions(partitions)

        out, err = self.get_cmd_output(
            cli, ["delete-missing-partitions", self.database, self.table])
        out.should.equal("Found 0 partitions to delete:")

        catalog_partitions = partitioner.existing_partitions()
        catalog_partitions.should.have.length_of(10)
Example #9
    def test_create_partitions_dry_run(self):
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()

        partitions = self.helper.create_many_partitions(count=10)
        partitions.sort()

        expected_output = f"Running Partitioner for {self.database}.{self.table}\n\tLooking for partitions in s3://{self.bucket}/{self.table}/\n\tFound 10 new partitions to create\n\t"
        expected_output += ", ".join(map(str, partitions))

        partitioner = Partitioner(self.database,
                                  self.table,
                                  aws_region=self.region)
        with captured_output() as (out, err):
            create_found_partitions(partitioner, dry_run=True)
        output = out.getvalue().strip()
        output.should.equal(expected_output)

        found = partitioner.existing_partitions()
        found.should.have.length_of(0)
Example #10
    def test_create_partitions_dry_run(self):
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()
        cli = Cli()

        partitions = self.helper.create_many_partitions(count=10)
        partitions.sort()

        expected_output = f"Running Partitioner for {self.database}.{self.table}\n\tLooking for partitions in s3://{self.bucket}/{self.table}/\n\tFound 10 new partitions to create\n\t"
        expected_output += ", ".join(map(str, partitions))

        out, err = self.get_cmd_output(
            cli, ["create-partitions", self.database, self.table, "--dry-run"])
        out.should.equal(expected_output)

        partitioner = Partitioner(self.database,
                                  self.table,
                                  aws_region=self.region)
        found = partitioner.existing_partitions()
        found.should.have.length_of(0)
Example #11
    def test_delete_all_partitions(self):
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()
        cli = Cli()

        partitions = self.helper.create_many_partitions(count=10)
        partitions.sort()
        partitioner = Partitioner(self.database,
                                  self.table,
                                  aws_region=self.region)
        partitioner.create_partitions(partitions)

        expected_out = "Deleting the following partitions:"
        for partition in partitions:
            expected_out += f"\n\t{str(partition)}"

        out, err = self.get_cmd_output(
            cli, ["delete-all-partitions", self.database, self.table])
        out.should.equal(expected_out)

        found_partitions = partitioner.existing_partitions()
        found_partitions.should.have.length_of(0)
Example #12
    def test_delete_partitions(self):
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()
        self.helper.create_partition_data()

        partition = self.helper.create_partition_data()
        partitioner = Partitioner(self.database,
                                  self.table,
                                  aws_region=self.region)
        partitioner.create_partitions([partition])

        mock = MagicMock(return_value=[])
        partitioner.glue.batch_delete_partition = mock

        to_delete = partitioner.existing_partitions()
        partitioner.delete_partitions(to_delete)

        mock.assert_called_with(DatabaseName=self.database,
                                TableName=self.table,
                                PartitionsToDelete=[{
                                    "Values":
                                    to_delete[0].values
                                }])
Example #13
    def test_create_partitions_limit_days(self):
        self.s3.create_bucket(Bucket=self.bucket)
        self.helper.make_database_and_table()
        cli = Cli()

        today = pendulum.now()

        partitions = []
        for i in range(1, 11):
            partition_date = today.subtract(days=i)
            year = partition_date.strftime("%Y")
            month = partition_date.strftime("%m")
            day = partition_date.strftime("%d")
            hour = "03"

            partition = Partition([
                year, month, day, hour
            ], f"s3://{self.bucket}/{self.table}/{year}/{month}/{day}/{hour}/")
            self.helper.write_partition_to_s3(partition)
            partitions.append(partition)

        partitions.sort()

        expected_output = f"Running Partitioner for {self.database}.{self.table}\n\tLooking for partitions in s3://{self.bucket}/{self.table}/\n\tFound 7 new partitions to create\n\t"
        expected_output += ", ".join(map(str, partitions[3:]))

        out, err = self.get_cmd_output(
            cli,
            ["create-partitions", self.database, self.table, "--limit-days=7"])
        out.should.equal(expected_output)

        partitioner = Partitioner(self.database,
                                  self.table,
                                  aws_region=self.region)
        found = partitioner.existing_partitions()
        found.should.have.length_of(7)
        set(found).should.equal(set(partitions[3:]))
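The `--limit-days=7` flag restricts creation to partitions from the last seven days. A minimal sketch of that date cutoff, assuming year/month/day are the first three partition values (the `within_limit_days` helper is hypothetical, not the library's actual code):

import pendulum

def within_limit_days(partition_values, limit_days):
    """True if a (year, month, day, ...) partition falls inside the window."""
    year, month, day = (int(v) for v in partition_values[:3])
    partition_date = pendulum.datetime(year, month, day)
    cutoff = pendulum.now().subtract(days=limit_days).start_of("day")
    return partition_date >= cutoff

# Usage sketch: keep only partitions from the last 7 days
# recent = [p for p in partitions if within_limit_days(p.values, 7)]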