def _create_body(self) -> dict:
    body = {
        DESCRIPTION: self.description,
        STATUS: GcpTransferJobsStatus.ENABLED,
        TRANSFER_SPEC: {
            AWS_S3_DATA_SOURCE: {
                BUCKET_NAME: self.s3_bucket,
                PATH: normalize_directory_path(self.s3_path),
            },
            GCS_DATA_SINK: {
                BUCKET_NAME: self.gcs_bucket,
                PATH: normalize_directory_path(self.gcs_path),
            },
        },
    }

    if self.project_id is not None:
        body[PROJECT_ID] = self.project_id

    if self.schedule is not None:
        body[SCHEDULE] = self.schedule

    if self.object_conditions is not None:
        body[TRANSFER_SPEC][OBJECT_CONDITIONS] = self.object_conditions  # type: ignore[index]

    if self.transfer_options is not None:
        body[TRANSFER_SPEC][TRANSFER_OPTIONS] = self.transfer_options  # type: ignore[index]

    return body
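# Illustrative sketch, not part of the operator: assuming the key constants carry their
# usual Storage Transfer Service REST field names (DESCRIPTION == "description",
# TRANSFER_SPEC == "transferSpec", AWS_S3_DATA_SOURCE == "awsS3DataSource", and so on),
# a body built from hypothetical operator arguments would look roughly like this:
#
#     {
#         "description": "daily-s3-to-gcs",
#         "status": "ENABLED",
#         "transferSpec": {
#             "awsS3DataSource": {"bucketName": "my-s3-bucket", "path": "data/"},
#             "gcsDataSink": {"bucketName": "my-gcs-bucket", "path": "data/"},
#         },
#         "projectId": "my-project",  # present only when project_id is set
#     }
#
# The optional schedule, object_conditions, and transfer_options fields are merged in
# the same way when they are not None.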
def sync(
    self,
    source_bucket: str,
    destination_bucket: str,
    source_object: Optional[str] = None,
    destination_object: Optional[str] = None,
    recursive: bool = True,
    allow_overwrite: bool = False,
    delete_extra_files: bool = False,
) -> None:
    """
    Synchronizes the contents of the buckets.

    Parameters ``source_object`` and ``destination_object`` describe the root sync directories. If
    they are not passed, the entire bucket will be synchronized. If they are passed, they should
    point to directories.

    .. note::
        The synchronization of individual files is not supported. Only entire directories can be
        synchronized.

    :param source_bucket: The name of the bucket containing the source objects.
    :type source_bucket: str
    :param destination_bucket: The name of the bucket containing the destination objects.
    :type destination_bucket: str
    :param source_object: The root sync directory in the source bucket.
    :type source_object: Optional[str]
    :param destination_object: The root sync directory in the destination bucket.
    :type destination_object: Optional[str]
    :param recursive: If True, subdirectories will be considered.
    :type recursive: bool
    :param allow_overwrite: If True, destination files will be overwritten when a mismatched file
        is found. By default, overwriting files is not allowed.
    :type allow_overwrite: bool
    :param delete_extra_files: If True, deletes files from the destination that are not found in
        the source. By default, extra files are not deleted.

        .. note::
            This option can delete data quickly if you specify the wrong source/destination
            combination.

    :type delete_extra_files: bool
    :return: None
    """
    client = self.get_conn()

    # Create bucket objects
    source_bucket_obj = client.bucket(source_bucket)
    destination_bucket_obj = client.bucket(destination_bucket)

    # Normalize parameters when they are passed
    source_object = normalize_directory_path(source_object)
    destination_object = normalize_directory_path(destination_object)

    # Calculate the number of characters to remove from each blob name, because the prefix
    # carries information about the parent's path
    source_object_prefix_len = len(source_object) if source_object else 0

    # Prepare the synchronization plan
    to_copy_blobs, to_delete_blobs, to_rewrite_blobs = self._prepare_sync_plan(
        source_bucket=source_bucket_obj,
        destination_bucket=destination_bucket_obj,
        source_object=source_object,
        destination_object=destination_object,
        recursive=recursive,
    )
    self.log.info(
        "Planned synchronization. To delete blobs count: %s, to upload blobs count: %s, "
        "to rewrite blobs count: %s",
        len(to_delete_blobs),
        len(to_copy_blobs),
        len(to_rewrite_blobs),
    )

    # Copy missing objects to the destination bucket
    if not to_copy_blobs:
        self.log.info("Skipped blobs copying.")
    else:
        for blob in to_copy_blobs:
            dst_object = self._calculate_sync_destination_path(
                blob, destination_object, source_object_prefix_len
            )
            self.copy(
                source_bucket=source_bucket_obj.name,
                source_object=blob.name,
                destination_bucket=destination_bucket_obj.name,
                destination_object=dst_object,
            )
        self.log.info("Blobs copied.")

    # Delete redundant files
    if not to_delete_blobs:
        self.log.info("Skipped blobs deleting.")
    elif delete_extra_files:
        # TODO: Add batch. I tried to do it, but the Google library is not stable at the moment.
        for blob in to_delete_blobs:
            self.delete(blob.bucket.name, blob.name)
        self.log.info("Blobs deleted.")

    # Overwrite files that are different
    if not to_rewrite_blobs:
        self.log.info("Skipped blobs overwriting.")
    elif allow_overwrite:
        for blob in to_rewrite_blobs:
            dst_object = self._calculate_sync_destination_path(
                blob, destination_object, source_object_prefix_len
            )
            self.rewrite(
                source_bucket=source_bucket_obj.name,
                source_object=blob.name,
                destination_bucket=destination_bucket_obj.name,
                destination_object=dst_object,
            )
        self.log.info("Blobs rewritten.")

    self.log.info("Synchronization finished.")
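# Usage sketch, assuming this method lives on Airflow's GCSHook; the connection id,
# bucket names, and prefixes below are hypothetical placeholders:
#
#     hook = GCSHook(gcp_conn_id="google_cloud_default")
#     hook.sync(
#         source_bucket="source-bucket",
#         destination_bucket="backup-bucket",
#         source_object="reports/",
#         destination_object="reports/",
#         allow_overwrite=True,  # rewrite destination blobs that differ from the source
#         delete_extra_files=True,  # delete destination blobs absent from the source
#     )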