Example 1
    def execute(self, context: "Context") -> list:

        hook = GCSHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            impersonation_chain=self.impersonation_chain,
        )

        self.log.info(
            'Getting list of the files. Bucket: %s; Delimiter: %s; Prefix: %s',
            self.bucket,
            self.delimiter,
            self.prefix,
        )

        StorageLink.persist(
            context=context,
            task_instance=self,
            uri=self.bucket,
            project_id=hook.project_id,
        )

        return hook.list(bucket_name=self.bucket,
                         prefix=self.prefix,
                         delimiter=self.delimiter)
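This appears to be the execute method of GCSListObjectsOperator from the Airflow Google provider: it lists matching object names and returns them, so they end up in XCom. A minimal usage sketch; the task_id, bucket, and prefix values below are placeholders, not part of the snippet:

from airflow.providers.google.cloud.operators.gcs import GCSListObjectsOperator

# Hypothetical task: list every .csv object under the "data/" prefix of a placeholder bucket.
list_csv_files = GCSListObjectsOperator(
    task_id="list_csv_files",
    bucket="example-bucket",
    prefix="data/",
    delimiter=".csv",
    gcp_conn_id="google_cloud_default",
)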
Example 2
    def execute(self, context: "Context"):
        hook = DataprocMetastoreHook(
            gcp_conn_id=self.gcp_conn_id,
            impersonation_chain=self.impersonation_chain)
        self.log.info("Exporting metadata from Dataproc Metastore service: %s",
                      self.service_id)
        hook.export_metadata(
            destination_gcs_folder=self.destination_gcs_folder,
            project_id=self.project_id,
            region=self.region,
            service_id=self.service_id,
            request_id=self.request_id,
            database_dump_type=self.database_dump_type,
            retry=self.retry,
            timeout=self.timeout,
            metadata=self.metadata,
        )
        metadata_export = self._wait_for_export_metadata(hook)
        self.log.info("Metadata from service %s exported successfully",
                      self.service_id)

        DataprocMetastoreLink.persist(context=context,
                                      task_instance=self,
                                      url=METASTORE_EXPORT_LINK)
        uri = self._get_uri_from_destination(
            MetadataExport.to_dict(metadata_export)["destination_gcs_uri"])
        StorageLink.persist(context=context, task_instance=self, uri=uri)
        return MetadataExport.to_dict(metadata_export)
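This snippet looks like the execute method of DataprocMetastoreExportMetadataOperator, which triggers a metadata export and waits for the operation to finish. A minimal usage sketch; the project, region, service, and destination values are placeholders:

from airflow.providers.google.cloud.operators.dataproc_metastore import DataprocMetastoreExportMetadataOperator

# Hypothetical export task; the destination must be a GCS folder the service account can write to.
export_metadata = DataprocMetastoreExportMetadataOperator(
    task_id="export_metadata",
    project_id="example-project",
    region="europe-west1",
    service_id="example-metastore-service",
    destination_gcs_folder="gs://example-bucket/metastore-exports",
)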
Example 3
    def execute(self, context: "Context") -> None:
        hook = GCSHook(
            gcp_conn_id=self.gcp_conn_id,
            impersonation_chain=self.impersonation_chain,
        )
        StorageLink.persist(
            context=context,
            task_instance=self,
            uri=self.bucket,
            project_id=hook.project_id,
        )
        hook.insert_bucket_acl(
            bucket_name=self.bucket, entity=self.entity, role=self.role, user_project=self.user_project
        )
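This matches the execute method of GCSBucketCreateAclEntryOperator, which adds a single ACL entry to a bucket. A minimal usage sketch with placeholder bucket, entity, and role values:

from airflow.providers.google.cloud.operators.gcs import GCSBucketCreateAclEntryOperator

# Hypothetical task: grant READER access on a placeholder bucket to a placeholder group entity.
add_bucket_acl = GCSBucketCreateAclEntryOperator(
    task_id="add_bucket_acl",
    bucket="example-bucket",
    entity="group-analysts@example.com",
    role="READER",
)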
Example 4
    def execute(self, context: "Context") -> None:
        hook = GCSHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            impersonation_chain=self.impersonation_chain,
        )
        StorageLink.persist(
            context=context,
            task_instance=self,
            uri=self._get_uri(self.destination_bucket, self.destination_object),
            project_id=hook.project_id,
        )
        hook.sync(
            source_bucket=self.source_bucket,
            destination_bucket=self.destination_bucket,
            source_object=self.source_object,
            destination_object=self.destination_object,
            recursive=self.recursive,
            delete_extra_files=self.delete_extra_files,
            allow_overwrite=self.allow_overwrite,
        )
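This is consistent with the execute method of GCSSynchronizeBucketsOperator, which delegates the whole copy to GCSHook.sync. A minimal usage sketch; the bucket names are placeholders:

from airflow.providers.google.cloud.operators.gcs import GCSSynchronizeBucketsOperator

# Hypothetical one-way sync of two placeholder buckets, including subdirectories.
sync_buckets = GCSSynchronizeBucketsOperator(
    task_id="sync_buckets",
    source_bucket="example-source-bucket",
    destination_bucket="example-destination-bucket",
    recursive=True,
    delete_extra_files=False,
    allow_overwrite=False,
)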
Example 5
    def execute(self, context: "Context") -> None:
        hook = GCSHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            impersonation_chain=self.impersonation_chain,
        )
        StorageLink.persist(
            context=context,
            task_instance=self,
            uri=self.bucket_name,
            project_id=self.project_id or hook.project_id,
        )
        try:
            hook.create_bucket(
                bucket_name=self.bucket_name,
                resource=self.resource,
                storage_class=self.storage_class,
                location=self.location,
                project_id=self.project_id,
                labels=self.labels,
            )
        except Conflict:  # HTTP 409
            self.log.warning("Bucket %s already exists", self.bucket_name)
Example 6
    def execute(self, context: 'Context') -> dict:
        self.log.info('Exporting data to Cloud Storage bucket %s', self.bucket)

        if self.overwrite_existing and self.namespace:
            gcs_hook = GCSHook(self.cloud_storage_conn_id,
                               impersonation_chain=self.impersonation_chain)
            objects = gcs_hook.list(self.bucket, prefix=self.namespace)
            for obj in objects:
                gcs_hook.delete(self.bucket, obj)

        ds_hook = DatastoreHook(
            gcp_conn_id=self.datastore_conn_id,
            delegate_to=self.delegate_to,
            impersonation_chain=self.impersonation_chain,
        )
        result = ds_hook.export_to_storage_bucket(
            bucket=self.bucket,
            namespace=self.namespace,
            entity_filter=self.entity_filter,
            labels=self.labels,
            project_id=self.project_id,
        )
        operation_name = result['name']
        result = ds_hook.poll_operation_until_done(
            operation_name, self.polling_interval_in_seconds)

        state = result['metadata']['common']['state']
        if state != 'SUCCESSFUL':
            raise AirflowException(f'Operation failed: result={result}')
        StorageLink.persist(
            context=context,
            task_instance=self,
            uri=f"{self.bucket}/{result['response']['outputUrl'].split('/')[3]}",
            project_id=self.project_id or ds_hook.project_id,
        )
        return result
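This appears to be the execute method of CloudDatastoreExportEntitiesOperator, which starts a Datastore export, polls the long-running operation, and fails the task unless the final state is SUCCESSFUL. A minimal usage sketch; bucket, namespace, and project values are placeholders:

from airflow.providers.google.cloud.operators.datastore import CloudDatastoreExportEntitiesOperator

# Hypothetical export; overwrite_existing first deletes previous objects under the namespace prefix.
export_entities = CloudDatastoreExportEntitiesOperator(
    task_id="export_entities",
    bucket="example-export-bucket",
    namespace="example-namespace",
    overwrite_existing=True,
    project_id="example-project",
)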
Example 7
    def execute(self, context: "Context") -> List[str]:
        # Define intervals and prefixes.
        try:
            timespan_start = context["data_interval_start"]
            timespan_end = context["data_interval_end"]
        except KeyError:
            timespan_start = pendulum.instance(context["execution_date"])
            following_execution_date = context["dag"].following_schedule(
                context["execution_date"])
            if following_execution_date is None:
                timespan_end = None
            else:
                timespan_end = pendulum.instance(following_execution_date)

        if timespan_end is None:  # Only possible in Airflow before 2.2.
            self.log.warning(
                "No following schedule found, setting timespan end to max %s",
                timespan_end)
            timespan_end = DateTime.max
        elif timespan_start >= timespan_end:  # Airflow 2.2 sets start == end for non-periodic schedules.
            self.log.warning(
                "DAG schedule not periodic, setting timespan end to max %s",
                timespan_end)
            timespan_end = DateTime.max

        timespan_start = timespan_start.in_timezone(timezone.utc)
        timespan_end = timespan_end.in_timezone(timezone.utc)

        source_prefix_interp = GCSTimeSpanFileTransformOperator.interpolate_prefix(
            self.source_prefix,
            timespan_start,
        )
        destination_prefix_interp = GCSTimeSpanFileTransformOperator.interpolate_prefix(
            self.destination_prefix,
            timespan_start,
        )

        source_hook = GCSHook(
            gcp_conn_id=self.source_gcp_conn_id,
            impersonation_chain=self.source_impersonation_chain,
        )
        destination_hook = GCSHook(
            gcp_conn_id=self.destination_gcp_conn_id,
            impersonation_chain=self.destination_impersonation_chain,
        )
        StorageLink.persist(
            context=context,
            task_instance=self,
            uri=self.destination_bucket,
            project_id=destination_hook.project_id,
        )

        # Fetch list of files.
        blobs_to_transform = source_hook.list_by_timespan(
            bucket_name=self.source_bucket,
            prefix=source_prefix_interp,
            timespan_start=timespan_start,
            timespan_end=timespan_end,
        )

        with TemporaryDirectory() as temp_input_dir, TemporaryDirectory() as temp_output_dir:
            temp_input_dir_path = Path(temp_input_dir)
            temp_output_dir_path = Path(temp_output_dir)

            # TODO: download in parallel.
            for blob_to_transform in blobs_to_transform:
                destination_file = temp_input_dir_path / blob_to_transform
                destination_file.parent.mkdir(parents=True, exist_ok=True)
                try:
                    source_hook.download(
                        bucket_name=self.source_bucket,
                        object_name=blob_to_transform,
                        filename=str(destination_file),
                        chunk_size=self.chunk_size,
                        num_max_attempts=self.download_num_attempts,
                    )
                except GoogleCloudError:
                    if self.download_continue_on_fail:
                        continue
                    raise

            self.log.info("Starting the transformation")
            cmd = [self.transform_script] if isinstance(
                self.transform_script, str) else self.transform_script
            cmd += [
                str(temp_input_dir_path),
                str(temp_output_dir_path),
                timespan_start.replace(microsecond=0).isoformat(),
                timespan_end.replace(microsecond=0).isoformat(),
            ]
            with subprocess.Popen(args=cmd,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT,
                                  close_fds=True) as process:
                self.log.info("Process output:")
                if process.stdout:
                    for line in iter(process.stdout.readline, b''):
                        self.log.info(
                            line.decode(self.output_encoding).rstrip())

                process.wait()
                if process.returncode:
                    raise AirflowException(
                        f"Transform script failed: {process.returncode}")

            self.log.info(
                "Transformation succeeded. Output temporarily located at %s",
                temp_output_dir_path)

            files_uploaded = []

            # TODO: upload in parallel.
            for upload_file in temp_output_dir_path.glob("**/*"):
                if upload_file.is_dir():
                    continue

                upload_file_name = str(
                    upload_file.relative_to(temp_output_dir_path))

                if self.destination_prefix is not None:
                    upload_file_name = f"{destination_prefix_interp}/{upload_file_name}"

                self.log.info("Uploading file %s to %s", upload_file,
                              upload_file_name)

                try:
                    destination_hook.upload(
                        bucket_name=self.destination_bucket,
                        object_name=upload_file_name,
                        filename=str(upload_file),
                        chunk_size=self.chunk_size,
                        num_max_attempts=self.upload_num_attempts,
                    )
                    files_uploaded.append(str(upload_file_name))
                except GoogleCloudError:
                    if self.upload_continue_on_fail:
                        continue
                    raise

            return files_uploaded
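This is the execute method of GCSTimeSpanFileTransformOperator (the class name appears in the snippet itself): it downloads the objects created in the task's data interval, runs a transform script over the local copies, and uploads the results. A minimal usage sketch; bucket names, prefixes, and the script path are placeholders:

from airflow.providers.google.cloud.operators.gcs import GCSTimeSpanFileTransformOperator

# Hypothetical transform task; the script is invoked with the input dir, output dir,
# and the ISO-formatted timespan start/end as positional arguments.
transform_files = GCSTimeSpanFileTransformOperator(
    task_id="transform_files",
    source_bucket="example-source-bucket",
    source_prefix="incoming/",
    source_gcp_conn_id="google_cloud_default",
    destination_bucket="example-destination-bucket",
    destination_prefix="processed/",
    destination_gcp_conn_id="google_cloud_default",
    transform_script=["python", "/opt/scripts/transform.py"],
)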