Beispiel #1
0
 def test_get_file_to_stream(self, mock_service):
     """Check that the hook delegates ``get_file_to_stream`` to its service.

     The hook receives the stream as its first argument, while the service
     mock is expected to receive it after the share, directory and file
     names; ``max_connections`` is expected to be passed through unchanged.

     :param mock_service: mock whose ``return_value`` stands in for the
         service instance (presumably injected by a patch decorator that
         is outside this view -- confirm).
     """
     file_service = mock_service.return_value

     AzureFileShareHook(wasb_conn_id='wasb_test_sas_token').get_file_to_stream(
         'stream', 'share', 'directory', 'file', max_connections=1
     )

     # Positional order the service mock must have been called with.
     expected_args = ('share', 'directory', 'file', 'stream')
     file_service.get_file_to_stream.assert_called_once_with(
         *expected_args, max_connections=1
     )
    def execute(self, context):
        """Copy files from an Azure FileShare directory to Google Cloud Storage.

        Lists the files found in ``self.directory_name`` of
        ``self.share_name`` and uploads each one to the bucket and object
        prefix parsed from ``self.dest_gcs``. When ``self.replace`` is falsy,
        files whose names already exist under the destination prefix are
        left out of the upload.

        :param context: task context; not used by this method.
        :return: the file names that were uploaded.
        """
        share_hook = AzureFileShareHook(self.wasb_conn_id)
        files = share_hook.list_files(
            share_name=self.share_name,
            directory_name=self.directory_name,
        )

        storage_hook = GCSHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            impersonation_chain=self.google_impersonation_chain,
        )

        dest_gcs_bucket, dest_gcs_object_prefix = _parse_gcs_url(self.dest_gcs)

        # pylint: disable=too-many-nested-blocks
        if not self.replace:
            # Not replacing: look at what already sits under the destination
            # prefix and keep only the Azure FileShare files missing there.
            remote_objects = storage_hook.list(
                dest_gcs_bucket, prefix=dest_gcs_object_prefix
            )

            # The listing may contain the prefix itself (an empty
            # "directory" placeholder); drop it before comparing names.
            if dest_gcs_object_prefix in remote_objects:
                remote_objects.remove(dest_gcs_object_prefix)

            # Strip the destination prefix so the names are comparable with
            # the names returned by the FileShare listing.
            already_present = {
                name[len(dest_gcs_object_prefix):]
                if name.startswith(dest_gcs_object_prefix)
                else name
                for name in remote_objects
            }

            files = list(set(files) - already_present)

        if not files:
            self.log.info('There are no new files to sync. Have a nice day!')
        else:
            self.log.info('%s files are going to be synced.', len(files))

        for file_name in files:
            # Stage each file in a local temporary file, then upload it by path.
            with NamedTemporaryFile() as staged:
                share_hook.get_file_to_stream(
                    stream=staged,
                    share_name=self.share_name,
                    directory_name=self.directory_name,
                    file_name=file_name,
                )
                # Make sure buffered bytes are on disk before the upload
                # reads the file through its name.
                staged.flush()

                # NOTE(review): plain concatenation relies on the prefix
                # already ending with '/'; presumably enforced when the
                # operator is instantiated -- confirm.
                storage_hook.upload(
                    dest_gcs_bucket,
                    dest_gcs_object_prefix + file_name,
                    staged.name,
                    gzip=self.gzip,
                )

        if not files:
            self.log.info(
                'In sync, no files needed to be uploaded to Google Cloud Storage'
            )
        else:
            self.log.info(
                "All done, uploaded %d files to Google Cloud Storage.",
                len(files),
            )

        return files