def execute(self, context):
    # use the super to list all files in an Azure Data Lake path
    files = super(AdlsToGoogleCloudStorageOperator, self).execute(context)
    g_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)

    if not self.replace:
        # if we are not replacing -> list all files in the ADLS path
        # and only keep those files which are present in
        # ADLS and not in Google Cloud Storage
        bucket_name, prefix = _parse_gcs_url(self.dest_gcs)
        existing_files = g_hook.list(bucket=bucket_name, prefix=prefix)
        files = set(files) - set(existing_files)

    if files:
        hook = AzureDataLakeHook(
            azure_data_lake_conn_id=self.azure_data_lake_conn_id
        )

        for obj in files:
            with NamedTemporaryFile(mode='wb', delete=True) as f:
                hook.download_file(local_path=f.name, remote_path=obj)
                f.flush()
                dest_gcs_bucket, dest_gcs_prefix = _parse_gcs_url(self.dest_gcs)
                dest_path = os.path.join(dest_gcs_prefix, obj)
                self.log.info("Saving file to %s", dest_path)

                g_hook.upload(bucket=dest_gcs_bucket,
                              object=dest_path,
                              filename=f.name)

        self.log.info("All done, uploaded %d files to GCS", len(files))
    else:
        self.log.info("In sync, no files needed to be uploaded to GCS")

    return files
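# A minimal usage sketch showing how the operator above could be wired into a DAG.
# The keyword arguments mirror the attributes referenced in execute() (dest_gcs,
# replace, and the two connection ids); the source-path argument name (src_adls),
# the module path, and the bucket/DAG names are assumptions for illustration only
# and may differ in your Airflow version.
from airflow import DAG
from airflow.utils.dates import days_ago
from airflow.contrib.operators.adls_to_gcs import AdlsToGoogleCloudStorageOperator

with DAG(dag_id='adls_to_gcs_sync',
         start_date=days_ago(1),
         schedule_interval='@daily') as dag:
    copy_files = AdlsToGoogleCloudStorageOperator(
        task_id='copy_adls_to_gcs',
        src_adls='folder/*.json',                       # assumed parameter name
        dest_gcs='gs://my-bucket/landing/',             # hypothetical bucket
        replace=False,                                  # only copy files missing from GCS
        azure_data_lake_conn_id='azure_data_lake_default',
        google_cloud_storage_conn_id='google_cloud_default',
    )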
def test_download_file(self, mock_lib, mock_downloader):
    from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
    hook = AzureDataLakeHook(azure_data_lake_conn_id='adl_test_key')
    hook.download_file(local_path='test_adl_hook.py',
                       remote_path='/test_adl_hook.py',
                       nthreads=64, overwrite=True,
                       buffersize=4194304, blocksize=4194304)
    mock_downloader.assert_called_once_with(hook.connection,
                                            lpath='test_adl_hook.py',
                                            rpath='/test_adl_hook.py',
                                            nthreads=64, overwrite=True,
                                            buffersize=4194304,
                                            blocksize=4194304)
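# For context, a hedged sketch of the hook method this test exercises. mock_lib and
# mock_downloader are injected by mock.patch decorators omitted from the snippet
# above. The class and attribute names below are assumptions inferred from the
# assertion (hook.connection, lpath/rpath and the transfer options), not verbatim
# Airflow code: download_file() delegates to azure.datalake.store's multithreaded
# downloader, passing the hook's authenticated ADLS filesystem client.
from azure.datalake.store import multithread

class AzureDataLakeHookSketch(object):
    def __init__(self, connection):
        # an authenticated azure.datalake.store.core.AzureDLFileSystem instance
        self.connection = connection

    def download_file(self, local_path, remote_path, nthreads=64, overwrite=True,
                      buffersize=4194304, blocksize=4194304):
        # Stream the remote ADLS path down to local_path in parallel chunks.
        multithread.ADLDownloader(self.connection,
                                  lpath=local_path,
                                  rpath=remote_path,
                                  nthreads=nthreads,
                                  overwrite=overwrite,
                                  buffersize=buffersize,
                                  blocksize=blocksize)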