def test_get_file_to_stream(self, mock_service):
    # Arrange: ``mock_service`` is presumably the patched file service class
    # (the patch decorator is not visible here); its return value is the
    # client instance the hook ends up talking to.
    service_client = mock_service.return_value
    file_share_hook = AzureFileShareHook(wasb_conn_id='wasb_test_sas_token')

    # Act: the stream is passed first on the hook side.
    file_share_hook.get_file_to_stream(
        'stream', 'share', 'directory', 'file', max_connections=1
    )

    # Assert: the client must receive the stream *after* the file coordinates,
    # with extra keyword arguments forwarded untouched.
    expected_positional = ('share', 'directory', 'file', 'stream')
    service_client.get_file_to_stream.assert_called_once_with(
        *expected_positional, max_connections=1
    )
def execute(self, context):
    """Copy files from an Azure FileShare directory to Google Cloud Storage.

    Lists the files found in ``self.directory_name`` of ``self.share_name``
    and uploads each of them below the bucket / object prefix parsed from
    ``self.dest_gcs``. When ``self.replace`` is falsy, files whose names
    already exist under the destination prefix are skipped.

    :param context: task execution context; not used by this method.
    :return: the names of the files that were uploaded. When filtering is
        applied (``replace`` is falsy) the result is a list that keeps the
        order in which the Azure FileShare hook listed the files.
    """
    azure_fileshare_hook = AzureFileShareHook(self.wasb_conn_id)
    files = azure_fileshare_hook.list_files(
        share_name=self.share_name, directory_name=self.directory_name
    )

    gcs_hook = GCSHook(
        gcp_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        impersonation_chain=self.google_impersonation_chain,
    )

    dest_gcs_bucket, dest_gcs_object_prefix = _parse_gcs_url(self.dest_gcs)

    if not self.replace:
        # If we are not replacing -> list all objects below the destination
        # prefix and only keep those files which are present in Azure
        # FileShare and not yet in Google Cloud Storage.
        existing_files_prefixed = gcs_hook.list(dest_gcs_bucket, prefix=dest_gcs_object_prefix)

        prefix_length = len(dest_gcs_object_prefix)
        # Build the set of already-present names relative to the prefix:
        #  * an entry equal to the prefix itself is an empty "directory"
        #    placeholder and is ignored;
        #  * the prefix is stripped from every other entry that carries it.
        # A new set is built rather than mutating the list returned by the hook.
        existing_files = {
            name[prefix_length:] if name.startswith(dest_gcs_object_prefix) else name
            for name in existing_files_prefixed
            if name != dest_gcs_object_prefix
        }

        # Keep the listing order (and drop duplicates) instead of going
        # through a set difference, whose iteration order changes between
        # interpreter runs and would make the upload order and the returned
        # value non-deterministic.
        files = [name for name in dict.fromkeys(files) if name not in existing_files]

    if files:
        self.log.info('%s files are going to be synced.', len(files))
    else:
        self.log.info('There are no new files to sync. Have a nice day!')

    for file in files:
        # Stage each file in a local temporary file, which is removed as soon
        # as the ``with`` block exits.
        with NamedTemporaryFile() as temp_file:
            azure_fileshare_hook.get_file_to_stream(
                stream=temp_file,
                share_name=self.share_name,
                directory_name=self.directory_name,
                file_name=file,
            )
            # Make sure everything is on disk before the upload reads the
            # file back by name.
            temp_file.flush()

            # There will always be a '/' before file because it is
            # enforced at instantiation time
            dest_gcs_object = dest_gcs_object_prefix + file
            gcs_hook.upload(dest_gcs_bucket, dest_gcs_object, temp_file.name, gzip=self.gzip)

    if files:
        self.log.info("All done, uploaded %d files to Google Cloud Storage.", len(files))
    else:
        self.log.info('In sync, no files needed to be uploaded to Google Cloud Storage')

    return files