def test_parse_gcs_url(self):
    """
    Check that ``gcs._parse_gcs_url`` splits a GCS URL into (bucket, blob).
    """
    parse = gcs._parse_gcs_url

    # Regular URL: bucket plus a nested blob path.
    expected = ('bucket', 'path/to/blob')
    self.assertEqual(parse('gs://bucket/path/to/blob'), expected)

    # A URL with a single slash after the scheme is rejected.
    with self.assertRaises(AirflowException):
        parse('gs:/bucket/path/to/blob')

    # A trailing slash is preserved in the blob name.
    expected = ('bucket', 'path/to/blob/')
    self.assertEqual(parse('gs://bucket/path/to/blob/'), expected)

    # Bucket only: the blob part comes back as an empty string.
    expected = ('bucket', '')
    self.assertEqual(parse('gs://bucket/'), expected)
def execute(self, context):
    """
    Copy the files listed by the parent operator from Azure Data Lake to GCS.

    The parent ``execute`` supplies the ADLS paths. Each path is downloaded
    into a temporary file and uploaded to the bucket/prefix parsed from
    ``self.dest_gcs``. When ``self.replace`` is false, a file is skipped if
    the object it would be uploaded as already exists in the bucket.

    :param context: task context, passed through to the parent ``execute``
    :return: list of the ADLS paths that were uploaded (empty when in sync)
    """
    # use the super to list all files in an Azure Data Lake path
    files = super().execute(context)
    g_hook = GCSHook(
        google_cloud_storage_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to)

    # The destination is the same for every file, so parse it only once.
    dest_gcs_bucket, dest_gcs_prefix = _parse_gcs_url(self.dest_gcs)

    if not self.replace:
        # if we are not replacing -> list the objects already under the
        # destination prefix and only keep the ADLS files whose destination
        # object is not there yet. The comparison uses the same name that
        # is later passed to upload(); comparing the bare ADLS path would
        # never match when the destination has a non-empty prefix.
        existing_objects = set(
            g_hook.list(bucket_name=dest_gcs_bucket, prefix=dest_gcs_prefix)
            or ()
        )
        # dict.fromkeys de-duplicates while keeping the listing order; the
        # result stays a list, like the return value when replace is true.
        files = [
            obj for obj in dict.fromkeys(files)
            if os.path.join(dest_gcs_prefix, obj) not in existing_objects
        ]

    if files:
        hook = AzureDataLakeHook(
            azure_data_lake_conn_id=self.azure_data_lake_conn_id
        )

        for obj in files:
            # The temp file is removed automatically when the block exits.
            with NamedTemporaryFile(mode='wb', delete=True) as f:
                hook.download_file(local_path=f.name, remote_path=obj)
                f.flush()
                dest_path = os.path.join(dest_gcs_prefix, obj)
                self.log.info("Saving file to %s", dest_path)

                g_hook.upload(
                    bucket_name=dest_gcs_bucket,
                    object_name=dest_path,
                    filename=f.name,
                    gzip=self.gzip
                )

        self.log.info("All done, uploaded %d files to GCS", len(files))
    else:
        self.log.info("In sync, no files needed to be uploaded to GCS")

    return files
def _gcs_object_is_directory(object):
    """
    Tell whether a GCS URL denotes a "directory".

    The URL is split with ``_parse_gcs_url``; the result is True when the
    blob part is empty or ends with ``/``.
    """
    blob_name = _parse_gcs_url(object)[1]
    if len(blob_name) == 0:
        # Bucket-only URL.
        return True
    return blob_name.endswith('/')
def execute(self, context):
    """
    Copy the keys listed by the parent operator from S3 to GCS.

    The parent ``execute`` supplies the S3 keys. Each key is downloaded into
    a temporary file and uploaded to the bucket/prefix parsed from
    ``self.dest_gcs``. When ``self.replace`` is false, keys that already
    exist under the destination prefix are skipped.

    :param context: task context, passed through to the parent ``execute``
    :return: list of the S3 keys that were uploaded (empty when in sync)
    """
    # use the super method to list all the files in an S3 bucket/key
    files = super().execute(context)

    gcs_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to)

    # The destination is the same for every file, so parse it only once.
    dest_gcs_bucket, dest_gcs_object_prefix = _parse_gcs_url(self.dest_gcs)

    if not self.replace:
        # if we are not replacing -> list all files in the GCS bucket
        # and only keep those files which are present in
        # S3 and not in Google Cloud Storage
        existing_files_prefixed = gcs_hook.list(
            dest_gcs_bucket, prefix=dest_gcs_object_prefix)

        existing_files = set()
        if existing_files_prefixed:
            prefix_len = len(dest_gcs_object_prefix)
            # Drop the entry equal to the prefix itself (an empty
            # "directory" object) and strip the prefix from the remaining
            # names so they can be compared with the S3 keys.
            existing_files = {
                name[prefix_len:]
                if name.startswith(dest_gcs_object_prefix) else name
                for name in existing_files_prefixed
                if name != dest_gcs_object_prefix
            }

        files = list(set(files) - existing_files)
        if files:
            self.log.info('%s files are going to be synced: %s.',
                          len(files), files)
        else:
            self.log.info(
                'There are no new files to sync. Have a nice day!')

    if files:
        hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)

        for file in files:
            # GCS hook builds its own in-memory file so we have to create
            # and pass the path
            file_object = hook.get_key(file, self.bucket)
            with NamedTemporaryFile(mode='wb', delete=True) as f:
                file_object.download_fileobj(f)
                f.flush()

                # Plain concatenation: per the original author's note, the
                # '/' separator before the file name is enforced at
                # instantiation time.
                dest_gcs_object = dest_gcs_object_prefix + file

                # No per-file log line here: the sync is sequential and
                # the hook already logs each upload.
                gcs_hook.upload(dest_gcs_bucket, dest_gcs_object, f.name,
                                gzip=self.gzip)

        self.log.info(
            "All done, uploaded %d files to Google Cloud Storage",
            len(files))
    else:
        # The original adjacent literals were missing the separating
        # space and rendered as "Google CloudStorage".
        self.log.info(
            'In sync, no files needed to be uploaded to Google Cloud '
            'Storage')

    return files