    def _create_body(self) -> dict:
        body = {
            DESCRIPTION: self.description,
            STATUS: GcpTransferJobsStatus.ENABLED,
            TRANSFER_SPEC: {
                AWS_S3_DATA_SOURCE: {
                    BUCKET_NAME: self.s3_bucket,
                    PATH: normalize_directory_path(self.s3_path),
                },
                GCS_DATA_SINK: {
                    BUCKET_NAME: self.gcs_bucket,
                    PATH: normalize_directory_path(self.gcs_path),
                },
            },
        }

        if self.project_id is not None:
            body[PROJECT_ID] = self.project_id

        if self.schedule is not None:
            body[SCHEDULE] = self.schedule

        if self.object_conditions is not None:
            body[TRANSFER_SPEC][OBJECT_CONDITIONS] = self.object_conditions  # type: ignore[index]

        if self.transfer_options is not None:
            body[TRANSFER_SPEC][TRANSFER_OPTIONS] = self.transfer_options  # type: ignore[index]

        return body
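
For reference, a minimal sketch of the body this method renders, assuming the
module-level key constants (DESCRIPTION, STATUS, TRANSFER_SPEC, and so on) resolve
to the Storage Transfer Service field names; all values are illustrative:

# Illustrative only: the keys assume the constants map to the Storage Transfer
# Service API field names, and the bucket/project values are made up.
example_body = {
    "description": "daily-s3-to-gcs-sync",
    "status": "ENABLED",
    "transferSpec": {
        "awsS3DataSource": {"bucketName": "my-s3-bucket", "path": "data/"},
        "gcsDataSink": {"bucketName": "my-gcs-bucket", "path": "data/"},
    },
    # Present only when the corresponding attribute is set:
    "projectId": "my-gcp-project",
}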
Example #2
    def sync(
        self,
        source_bucket: str,
        destination_bucket: str,
        source_object: Optional[str] = None,
        destination_object: Optional[str] = None,
        recursive: bool = True,
        allow_overwrite: bool = False,
        delete_extra_files: bool = False,
    ) -> None:
        """
        Synchronizes the contents of the buckets.

        Parameters ``source_object`` and ``destination_object`` describe the root sync directories. If they
        are not passed, the entire bucket will be synchronized. If they are passed, they should point
        to directories.

        .. note::
            The synchronization of individual files is not supported. Only entire directories can be
            synchronized.

        :param source_bucket: The name of the bucket containing the source objects.
        :type source_bucket: str
        :param destination_bucket: The name of the bucket containing the destination objects.
        :type destination_bucket: str
        :param source_object: The root sync directory in the source bucket.
        :type source_object: Optional[str]
        :param destination_object: The root sync directory in the destination bucket.
        :type destination_object: Optional[str]
        :param recursive: If True, subdirectories will be considered.
        :type recursive: bool
        :param allow_overwrite: If True, files will be overwritten when a mismatched file is found.
            By default, overwriting files is not allowed.
        :type allow_overwrite: bool
        :param delete_extra_files: If True, deletes additional files from the destination that are not
            found in the source. By default, extra files are not deleted.

            .. note::
                This option can delete data quickly if you specify the wrong source/destination combination.

        :type delete_extra_files: bool
        :return: None
        """
        client = self.get_conn()
        # Create bucket objects
        source_bucket_obj = client.bucket(source_bucket)
        destination_bucket_obj = client.bucket(destination_bucket)
        # Normalize parameters when they are passed
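        # (Assumption: normalize_directory_path appends a trailing "/" when it is
        # missing and passes None through unchanged, so prefixes match whole
        # directory names.)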
        source_object = normalize_directory_path(source_object)
        destination_object = normalize_directory_path(destination_object)
        # Calculate the number of characters to strip from each blob name, because they
        # contain the parent directory's path
        source_object_prefix_len = len(source_object) if source_object else 0
        # Prepare synchronization plan
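        # (Assumption, based on the names: to_copy_blobs exist only in the source,
        # to_delete_blobs exist only in the destination, and to_rewrite_blobs exist
        # in both buckets but with differing content.)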
        to_copy_blobs, to_delete_blobs, to_rewrite_blobs = self._prepare_sync_plan(
            source_bucket=source_bucket_obj,
            destination_bucket=destination_bucket_obj,
            source_object=source_object,
            destination_object=destination_object,
            recursive=recursive,
        )
        self.log.info(
            "Planned synchronization. To delete blobs count: %s, to upload blobs count: %s, "
            "to rewrite blobs count: %s",
            len(to_delete_blobs),
            len(to_copy_blobs),
            len(to_rewrite_blobs),
        )

        # Copy missing objects to the destination bucket
        if not to_copy_blobs:
            self.log.info("Skipped blobs copying.")
        else:
            for blob in to_copy_blobs:
                dst_object = self._calculate_sync_destination_path(
                    blob, destination_object, source_object_prefix_len
                )
                self.copy(
                    source_bucket=source_bucket_obj.name,
                    source_object=blob.name,
                    destination_bucket=destination_bucket_obj.name,
                    destination_object=dst_object,
                )
            self.log.info("Blobs copied.")
        # Delete redundant files
        if not to_delete_blobs:
            self.log.info("Skipped blobs deleting.")
        elif delete_extra_files:
            # TODO: Add batch. I tried to do it, but the Google library is not stable at the moment.
            for blob in to_delete_blobs:
                self.delete(blob.bucket.name, blob.name)
            self.log.info("Blobs deleted.")

        # Overwrite files that are different
        if not to_rewrite_blobs:
            self.log.info("Skipped blobs overwriting.")
        elif allow_overwrite:
            for blob in to_rewrite_blobs:
                dst_object = self._calculate_sync_destination_path(
                    blob, destination_object, source_object_prefix_len
                )
                self.rewrite(
                    source_bucket=source_bucket_obj.name,
                    source_object=blob.name,
                    destination_bucket=destination_bucket_obj.name,
                    destination_object=dst_object,
                )
            self.log.info("Blobs rewritten.")

        self.log.info("Synchronization finished.")