Ejemplo n.º 1
0
    def execute(self, context):
        """Copy the files listed by the parent operator from ADLS to GCS.

        The parent ``execute`` supplies the Azure Data Lake paths to consider.
        When ``self.replace`` is false, any path that also appears in the
        listing of the destination bucket/prefix is skipped. Each remaining
        file is downloaded to a temporary local file and then uploaded to the
        destination derived from ``self.dest_gcs``.

        :param context: task context, forwarded unchanged to the parent
            ``execute``.
        :return: the ADLS paths selected for upload. When ``self.replace`` is
            false this is a ``list`` (in no particular order); otherwise it is
            the parent's listing, returned unchanged.
        """
        # use the super to list all files in an Azure Data Lake path
        files = super(AdlsToGoogleCloudStorageOperator, self).execute(context)
        g_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)

        # The destination does not change per file, so parse it exactly once
        # and reuse it for both the "already present" listing and the uploads.
        dest_gcs_bucket, dest_gcs_prefix = _parse_gcs_url(self.dest_gcs)

        if not self.replace:
            # Not replacing: keep only the files that are present in ADLS
            # and not yet in Google Cloud Storage.
            existing_files = g_hook.list(bucket=dest_gcs_bucket,
                                         prefix=dest_gcs_prefix)
            # Convert back to a list so the return value has the same
            # container type whether or not ``replace`` is set.
            files = list(set(files) - set(existing_files))

        if files:
            hook = AzureDataLakeHook(
                azure_data_lake_conn_id=self.azure_data_lake_conn_id
            )

            for obj in files:
                # The temporary file only reserves a local path: the hook is
                # given ``f.name`` and nothing is written through ``f``
                # itself, so there is nothing to flush before uploading.
                with NamedTemporaryFile(mode='wb', delete=True) as f:
                    hook.download_file(local_path=f.name, remote_path=obj)
                    dest_path = os.path.join(dest_gcs_prefix, obj)
                    self.log.info("Saving file to %s", dest_path)

                    g_hook.upload(bucket=dest_gcs_bucket, object=dest_path,
                                  filename=f.name)

            self.log.info("All done, uploaded %d files to GCS", len(files))
        else:
            self.log.info("In sync, no files needed to be uploaded to GCS")

        return files
Ejemplo n.º 2
0
 def test_download_file(self, mock_lib, mock_downloader):
     """Check that ``download_file`` forwards its arguments to the downloader.

     The hook is called with ``local_path``/``remote_path`` and the mocked
     downloader is expected to receive them as ``lpath``/``rpath``, together
     with the hook's connection and the unchanged transfer options.
     """
     from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook

     local_file = 'test_adl_hook.py'
     remote_file = '/test_adl_hook.py'
     # Options passed to the hook and expected, unmodified, on the downloader.
     transfer_options = {
         'nthreads': 64,
         'overwrite': True,
         'buffersize': 4194304,
         'blocksize': 4194304,
     }

     adl_hook = AzureDataLakeHook(azure_data_lake_conn_id='adl_test_key')
     adl_hook.download_file(local_path=local_file,
                            remote_path=remote_file,
                            **transfer_options)

     mock_downloader.assert_called_once_with(adl_hook.connection,
                                             lpath=local_file,
                                             rpath=remote_file,
                                             **transfer_options)