def execute(self, context: 'Context'):
    """
    Copy files from an Azure FileShare directory to Google Cloud Storage.

    The files listed in ``self.directory_name`` of ``self.share_name`` are
    streamed one by one into a temporary file and uploaded to the bucket
    and object prefix parsed from ``self.dest_gcs``. When ``self.replace``
    is false, names already present under that prefix are skipped.

    :param context: task context (not used by this method)
    :return: the list of file names that were uploaded (empty if none)
    :raises RuntimeError: if there are files to copy but
        ``self.directory_name`` is ``None``
    """
    self._check_inputs()
    fileshare_hook = AzureFileShareHook(self.azure_fileshare_conn_id)
    files = fileshare_hook.list_files(
        share_name=self.share_name, directory_name=self.directory_name
    )

    gcs_hook = GCSHook(
        gcp_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        impersonation_chain=self.google_impersonation_chain,
    )

    bucket, prefix = _parse_gcs_url(self.dest_gcs)

    if not self.replace:
        # Not replacing: keep only the files that exist in the Azure
        # FileShare but are not yet in Google Cloud Storage.
        listed = gcs_hook.list(bucket, prefix=prefix)

        # The prefix itself shows up when an empty "directory" object exists
        if prefix in listed:
            listed.remove(prefix)

        # Compare on names relative to the destination prefix
        already_uploaded = [
            name[len(prefix):] if name.startswith(prefix) else name
            for name in listed
        ]
        files = list(set(files) - set(already_uploaded))

    if not files:
        self.log.info('There are no new files to sync. Have a nice day!')
        self.log.info('In sync, no files needed to be uploaded to Google Cloud Storage')
        return files

    self.log.info('%s files are going to be synced.', len(files))
    if self.directory_name is None:
        raise RuntimeError("The directory_name must be set!.")

    for file_name in files:
        with NamedTemporaryFile() as tmp:
            fileshare_hook.get_file_to_stream(
                stream=tmp,
                share_name=self.share_name,
                directory_name=self.directory_name,
                file_name=file_name,
            )
            # Make sure everything is on disk before uploading by path
            tmp.flush()

            # There will always be a '/' before file_name because it is
            # enforced at instantiation time
            gcs_hook.upload(bucket, prefix + file_name, tmp.name, gzip=self.gzip)

    self.log.info("All done, uploaded %d files to Google Cloud Storage.", len(files))
    return files
def test_parse_gcs_url(self):
    """Check how ``_parse_gcs_url`` splits a GCS URL into (bucket, blob)."""
    parse = gcs._parse_gcs_url

    # regular object path
    bucket_and_blob = parse('gs://bucket/path/to/blob')
    self.assertEqual(bucket_and_blob, ('bucket', 'path/to/blob'))

    # invalid URI
    with self.assertRaises(AirflowException):
        parse('gs:/bucket/path/to/blob')

    # trailing slash
    bucket_and_blob = parse('gs://bucket/path/to/blob/')
    self.assertEqual(bucket_and_blob, ('bucket', 'path/to/blob/'))

    # bucket only
    bucket_and_blob = parse('gs://bucket/')
    self.assertEqual(bucket_and_blob, ('bucket', ''))
def execute(self, context: 'Context'):
    """
    Copy files from Azure Data Lake Storage to Google Cloud Storage.

    The parent ``execute`` supplies the list of ADLS paths. Each path is
    downloaded to a temporary file and uploaded under the bucket and object
    prefix parsed from ``self.dest_gcs``. When ``self.replace`` is false,
    paths whose destination object is already listed in the bucket are
    skipped.

    :param context: task context, passed through to the parent ``execute``
    :return: the list of ADLS paths that were uploaded (empty if none)
    """
    # use the super to list all files in an Azure Data Lake path
    files = super().execute(context)
    g_hook = GCSHook(
        gcp_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        impersonation_chain=self.google_impersonation_chain,
    )

    # The destination is the same for every file, so parse it once up front
    # (an invalid URL therefore fails before any transfer starts).
    dest_gcs_bucket, dest_gcs_prefix = _parse_gcs_url(self.dest_gcs)

    if not self.replace:
        # Objects are listed by their full name, destination prefix included,
        # whereas ``files`` holds ADLS paths without it. Compare on the
        # destination object name, built exactly as for the upload below,
        # so that files already in the bucket are really skipped.
        existing_objects = set(
            g_hook.list(bucket_name=dest_gcs_bucket, prefix=dest_gcs_prefix)
        )
        files = [
            path
            for path in dict.fromkeys(files)  # de-duplicate, keep listing order
            if os.path.join(dest_gcs_prefix, path) not in existing_objects
        ]

    if files:
        hook = AzureDataLakeHook(azure_data_lake_conn_id=self.azure_data_lake_conn_id)

        for obj in files:
            with NamedTemporaryFile(mode='wb', delete=True) as f:
                hook.download_file(local_path=f.name, remote_path=obj)
                f.flush()
                # NOTE(review): os.path.join uses the platform separator;
                # GCS object names expect '/' - confirm this never runs
                # on Windows.
                dest_path = os.path.join(dest_gcs_prefix, obj)
                self.log.info("Saving file to %s", dest_path)

                g_hook.upload(
                    bucket_name=dest_gcs_bucket,
                    object_name=dest_path,
                    filename=f.name,
                    gzip=self.gzip,
                )

        self.log.info("All done, uploaded %d files to GCS", len(files))
    else:
        self.log.info("In sync, no files needed to be uploaded to GCS")

    return files
def _gcs_object_is_directory(bucket):
    """
    Tell whether a GCS URL designates a "directory" rather than an object.

    The URL counts as a directory when its blob part is empty or ends with
    a slash.

    :param bucket: GCS URL, parsed with ``_parse_gcs_url``
    :return: ``True`` for a directory-like URL, ``False`` otherwise
    """
    blob_name = _parse_gcs_url(bucket)[1]
    if not blob_name:
        return True
    return blob_name.endswith('/')
def execute(self, context):
    """
    Copy files from an S3 bucket to Google Cloud Storage.

    The parent ``execute`` supplies the list of S3 keys. Each key is
    downloaded to a temporary file and uploaded to the bucket and object
    prefix parsed from ``self.dest_gcs``. When ``self.replace`` is false,
    keys already present under that prefix are skipped.

    :param context: task context, passed through to the parent ``execute``
    :return: the list of S3 keys that were uploaded (empty if none)
    """
    # use the super method to list all the files in an S3 bucket/key
    files = super().execute(context)

    gcs_hook = GCSHook(
        google_cloud_storage_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
    )

    # The destination is the same for every file, so parse it once up front
    # (an invalid URL therefore fails before any transfer starts).
    dest_gcs_bucket, dest_gcs_object_prefix = _parse_gcs_url(self.dest_gcs)

    if not self.replace:
        # if we are not replacing -> list all files in the GCS bucket
        # and only keep those files which are present in
        # S3 and not in Google Cloud Storage
        existing_files_prefixed = gcs_hook.list(
            dest_gcs_bucket, prefix=dest_gcs_object_prefix
        )

        # Skip the prefix itself (an empty directory object) and strip the
        # prefix from every other name so they compare with the S3 keys.
        existing_files = {
            name[len(dest_gcs_object_prefix):]
            if name.startswith(dest_gcs_object_prefix)
            else name
            for name in existing_files_prefixed or ()
            if name != dest_gcs_object_prefix
        }

        files = list(set(files) - existing_files)
        if files:
            self.log.info('%s files are going to be synced: %s.', len(files), files)
        else:
            self.log.info('There are no new files to sync. Have a nice day!')

    if files:
        hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)

        for file in files:
            # GCS hook builds its own in-memory file so we have to create
            # and pass the path
            file_object = hook.get_key(file, self.bucket)
            with NamedTemporaryFile(mode='wb', delete=True) as f:
                file_object.download_fileobj(f)
                f.flush()

                # There will always be a '/' before file because it is
                # enforced at instantiation time
                dest_gcs_object = dest_gcs_object_prefix + file

                # Sync is sequential and the hook already logs a lot,
                # so no per-file log line is emitted here.
                gcs_hook.upload(dest_gcs_bucket, dest_gcs_object, f.name, gzip=self.gzip)

        self.log.info("All done, uploaded %d files to Google Cloud Storage", len(files))
    else:
        # The original implicit concatenation dropped the space between
        # "Cloud" and "Storage".
        self.log.info('In sync, no files needed to be uploaded to Google Cloud Storage')

    return files