def execute(self, context: "Context") -> list: hook = GCSHook( gcp_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to, impersonation_chain=self.impersonation_chain, ) self.log.info( 'Getting list of the files. Bucket: %s; Delimiter: %s; Prefix: %s', self.bucket, self.delimiter, self.prefix, ) StorageLink.persist( context=context, task_instance=self, uri=self.bucket, project_id=hook.project_id, ) return hook.list(bucket_name=self.bucket, prefix=self.prefix, delimiter=self.delimiter)
def execute(self, context: "Context"): hook = DataprocMetastoreHook( gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain) self.log.info("Exporting metadata from Dataproc Metastore service: %s", self.service_id) hook.export_metadata( destination_gcs_folder=self.destination_gcs_folder, project_id=self.project_id, region=self.region, service_id=self.service_id, request_id=self.request_id, database_dump_type=self.database_dump_type, retry=self.retry, timeout=self.timeout, metadata=self.metadata, ) metadata_export = self._wait_for_export_metadata(hook) self.log.info("Metadata from service %s exported successfully", self.service_id) DataprocMetastoreLink.persist(context=context, task_instance=self, url=METASTORE_EXPORT_LINK) uri = self._get_uri_from_destination( MetadataExport.to_dict(metadata_export)["destination_gcs_uri"]) StorageLink.persist(context=context, task_instance=self, uri=uri) return MetadataExport.to_dict(metadata_export)
def execute(self, context: "Context") -> None: hook = GCSHook( gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain, ) StorageLink.persist( context=context, task_instance=self, uri=self.bucket, project_id=hook.project_id, ) hook.insert_bucket_acl( bucket_name=self.bucket, entity=self.entity, role=self.role, user_project=self.user_project )
def execute(self, context: "Context") -> None: hook = GCSHook( gcp_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to, impersonation_chain=self.impersonation_chain, ) StorageLink.persist( context=context, task_instance=self, uri=self._get_uri(self.destination_bucket, self.destination_object), project_id=hook.project_id, ) hook.sync( source_bucket=self.source_bucket, destination_bucket=self.destination_bucket, source_object=self.source_object, destination_object=self.destination_object, recursive=self.recursive, delete_extra_files=self.delete_extra_files, allow_overwrite=self.allow_overwrite, )
def execute(self, context: "Context") -> None: hook = GCSHook( gcp_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to, impersonation_chain=self.impersonation_chain, ) StorageLink.persist( context=context, task_instance=self, uri=self.bucket_name, project_id=self.project_id or hook.project_id, ) try: hook.create_bucket( bucket_name=self.bucket_name, resource=self.resource, storage_class=self.storage_class, location=self.location, project_id=self.project_id, labels=self.labels, ) except Conflict: # HTTP 409 self.log.warning("Bucket %s already exists", self.bucket_name)
def execute(self, context: 'Context') -> dict:
    self.log.info('Exporting data to Cloud Storage bucket %s', self.bucket)

    if self.overwrite_existing and self.namespace:
        gcs_hook = GCSHook(self.cloud_storage_conn_id, impersonation_chain=self.impersonation_chain)
        objects = gcs_hook.list(self.bucket, prefix=self.namespace)
        for obj in objects:
            gcs_hook.delete(self.bucket, obj)

    ds_hook = DatastoreHook(
        gcp_conn_id=self.datastore_conn_id,
        delegate_to=self.delegate_to,
        impersonation_chain=self.impersonation_chain,
    )
    result = ds_hook.export_to_storage_bucket(
        bucket=self.bucket,
        namespace=self.namespace,
        entity_filter=self.entity_filter,
        labels=self.labels,
        project_id=self.project_id,
    )
    operation_name = result['name']
    result = ds_hook.poll_operation_until_done(operation_name, self.polling_interval_in_seconds)

    state = result['metadata']['common']['state']
    if state != 'SUCCESSFUL':
        raise AirflowException(f'Operation failed: result={result}')

    StorageLink.persist(
        context=context,
        task_instance=self,
        # outputUrl has the form gs://<bucket>/<export-dir>/...; keep only the
        # top-level export directory for the persisted link.
        uri=f"{self.bucket}/{result['response']['outputUrl'].split('/')[3]}",
        project_id=self.project_id or ds_hook.project_id,
    )
    return result
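# --- Usage sketch (assumed): the execute() above appears to belong to
# CloudDatastoreExportEntitiesOperator. With overwrite_existing=True and a namespace
# set, objects under that prefix are deleted before the new export starts.
from airflow.providers.google.cloud.operators.datastore import (
    CloudDatastoreExportEntitiesOperator,
)

export_entities = CloudDatastoreExportEntitiesOperator(
    task_id="export_entities",
    bucket="example-backup-bucket",    # hypothetical export bucket
    namespace="example_namespace",     # hypothetical namespace / object prefix
    overwrite_existing=True,           # wipe earlier exports under the prefix first
)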
def execute(self, context: "Context") -> List[str]: # Define intervals and prefixes. try: timespan_start = context["data_interval_start"] timespan_end = context["data_interval_end"] except KeyError: timespan_start = pendulum.instance(context["execution_date"]) following_execution_date = context["dag"].following_schedule( context["execution_date"]) if following_execution_date is None: timespan_end = None else: timespan_end = pendulum.instance(following_execution_date) if timespan_end is None: # Only possible in Airflow before 2.2. self.log.warning( "No following schedule found, setting timespan end to max %s", timespan_end) timespan_end = DateTime.max elif timespan_start >= timespan_end: # Airflow 2.2 sets start == end for non-perodic schedules. self.log.warning( "DAG schedule not periodic, setting timespan end to max %s", timespan_end) timespan_end = DateTime.max timespan_start = timespan_start.in_timezone(timezone.utc) timespan_end = timespan_end.in_timezone(timezone.utc) source_prefix_interp = GCSTimeSpanFileTransformOperator.interpolate_prefix( self.source_prefix, timespan_start, ) destination_prefix_interp = GCSTimeSpanFileTransformOperator.interpolate_prefix( self.destination_prefix, timespan_start, ) source_hook = GCSHook( gcp_conn_id=self.source_gcp_conn_id, impersonation_chain=self.source_impersonation_chain, ) destination_hook = GCSHook( gcp_conn_id=self.destination_gcp_conn_id, impersonation_chain=self.destination_impersonation_chain, ) StorageLink.persist( context=context, task_instance=self, uri=self.destination_bucket, project_id=destination_hook.project_id, ) # Fetch list of files. blobs_to_transform = source_hook.list_by_timespan( bucket_name=self.source_bucket, prefix=source_prefix_interp, timespan_start=timespan_start, timespan_end=timespan_end, ) with TemporaryDirectory() as temp_input_dir, TemporaryDirectory( ) as temp_output_dir: temp_input_dir_path = Path(temp_input_dir) temp_output_dir_path = Path(temp_output_dir) # TODO: download in parallel. for blob_to_transform in blobs_to_transform: destination_file = temp_input_dir_path / blob_to_transform destination_file.parent.mkdir(parents=True, exist_ok=True) try: source_hook.download( bucket_name=self.source_bucket, object_name=blob_to_transform, filename=str(destination_file), chunk_size=self.chunk_size, num_max_attempts=self.download_num_attempts, ) except GoogleCloudError: if self.download_continue_on_fail: continue raise self.log.info("Starting the transformation") cmd = [self.transform_script] if isinstance( self.transform_script, str) else self.transform_script cmd += [ str(temp_input_dir_path), str(temp_output_dir_path), timespan_start.replace(microsecond=0).isoformat(), timespan_end.replace(microsecond=0).isoformat(), ] with subprocess.Popen(args=cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True) as process: self.log.info("Process output:") if process.stdout: for line in iter(process.stdout.readline, b''): self.log.info( line.decode(self.output_encoding).rstrip()) process.wait() if process.returncode: raise AirflowException( f"Transform script failed: {process.returncode}") self.log.info( "Transformation succeeded. Output temporarily located at %s", temp_output_dir_path) files_uploaded = [] # TODO: upload in parallel. 
for upload_file in temp_output_dir_path.glob("**/*"): if upload_file.is_dir(): continue upload_file_name = str( upload_file.relative_to(temp_output_dir_path)) if self.destination_prefix is not None: upload_file_name = f"{destination_prefix_interp}/{upload_file_name}" self.log.info("Uploading file %s to %s", upload_file, upload_file_name) try: destination_hook.upload( bucket_name=self.destination_bucket, object_name=upload_file_name, filename=str(upload_file), chunk_size=self.chunk_size, num_max_attempts=self.upload_num_attempts, ) files_uploaded.append(str(upload_file_name)) except GoogleCloudError: if self.upload_continue_on_fail: continue raise return files_uploaded
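# --- Usage sketch (assumed): the execute() above matches
# GCSTimeSpanFileTransformOperator. It downloads objects updated within the task's
# data interval, runs transform_script with four positional arguments (input dir,
# output dir, interval start, interval end), and uploads whatever the script leaves
# in the output dir. All literal values below are examples.
from airflow.providers.google.cloud.operators.gcs import GCSTimeSpanFileTransformOperator

transform_interval = GCSTimeSpanFileTransformOperator(
    task_id="transform_interval",
    source_bucket="example-source-bucket",        # hypothetical
    source_prefix="raw/%Y/%m/%d/",                # interpolated with the interval start
    source_gcp_conn_id="google_cloud_default",
    destination_bucket="example-dest-bucket",     # hypothetical
    destination_prefix="processed/%Y/%m/%d/",
    destination_gcp_conn_id="google_cloud_default",
    transform_script=["python", "/opt/scripts/transform.py"],  # hypothetical script
)

# --- A matching transform script sketch (entirely hypothetical). Per the cmd built in
# execute(), argv[1] is the input dir, argv[2] the output dir, and argv[3]/argv[4] the
# ISO-formatted interval bounds; the script must write its results into argv[2].
import sys
from pathlib import Path

input_dir, output_dir = Path(sys.argv[1]), Path(sys.argv[2])
for source in input_dir.glob("**/*"):
    if source.is_file():
        target = output_dir / source.relative_to(input_dir)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(source.read_text().upper())  # trivial placeholder transform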