Ejemplo n.º 1
0
 def execute(self, context: 'Context') -> None:
     """
     Synthesize speech and upload the resulting audio to Google Cloud Storage.

     The audio content returned by ``CloudTextToSpeechHook.synthesize_speech`` is written
     to a local temporary file, uploaded to ``target_bucket_name`` as ``target_filename``,
     and a ``FileDetailsLink`` for the uploaded object is persisted afterwards.

     :param context: Airflow task context, forwarded to ``FileDetailsLink.persist``.
     """
     hook = CloudTextToSpeechHook(
         gcp_conn_id=self.gcp_conn_id,
         impersonation_chain=self.impersonation_chain,
     )
     result = hook.synthesize_speech(
         input_data=self.input_data,
         voice=self.voice,
         audio_config=self.audio_config,
         retry=self.retry,
         timeout=self.timeout,
     )
     with NamedTemporaryFile() as temp_file:
         temp_file.write(result.audio_content)
         # The upload is given the file's path, not this handle, so the buffered bytes
         # have to be pushed to disk first; otherwise a truncated object may be uploaded.
         temp_file.flush()
         cloud_storage_hook = GCSHook(
             gcp_conn_id=self.gcp_conn_id,
             impersonation_chain=self.impersonation_chain,
         )
         cloud_storage_hook.upload(
             bucket_name=self.target_bucket_name,
             object_name=self.target_filename,
             filename=temp_file.name,
         )
         FileDetailsLink.persist(
             context=context,
             task_instance=self,
             uri=f"{self.target_bucket_name}/{self.target_filename}",
             project_id=cloud_storage_hook.project_id,
         )
Ejemplo n.º 2
0
    def execute(self, context: 'Context') -> dict:
        """
        Recognize speech in the configured audio and translate the first transcript.

        Only the first alternative of the first recognition result is translated.

        :param context: Airflow task context, forwarded to ``FileDetailsLink.persist``.
        :return: the value returned by ``CloudTranslateHook.translate``, or an empty dict
            when the recognition response contains no results.
        :raises AirflowException: if the recognition response lacks an expected key, or if
            a ``ValueError`` is raised while translating or persisting the link.
        """
        speech_to_text_hook = CloudSpeechToTextHook(
            gcp_conn_id=self.gcp_conn_id,
            impersonation_chain=self.impersonation_chain,
        )
        translate_hook = CloudTranslateHook(
            gcp_conn_id=self.gcp_conn_id,
            impersonation_chain=self.impersonation_chain,
        )

        recognize_result = speech_to_text_hook.recognize_speech(
            config=self.config, audio=self.audio)
        recognize_dict = MessageToDict(recognize_result)

        self.log.info("Recognition operation finished")

        # MessageToDict leaves out empty fields by default, so an empty response may have
        # no 'results' key at all; treat a missing key the same as an empty list.
        if not recognize_dict.get('results'):
            self.log.info("No recognition results")
            return {}
        self.log.debug("Recognition result: %s", recognize_dict)

        try:
            transcript = recognize_dict['results'][0]['alternatives'][0][
                'transcript']
        except KeyError as key:
            # Chain the original error so the traceback shows which lookup failed.
            raise AirflowException(
                f"Wrong response '{recognize_dict}' returned - it should contain {key} field"
            ) from key

        try:
            translation = translate_hook.translate(
                values=transcript,
                target_language=self.target_language,
                format_=self.format_,
                source_language=self.source_language,
                model=self.model,
            )
            self.log.info('Translated output: %s', translation)
            FileDetailsLink.persist(
                context=context,
                task_instance=self,
                # Drops the first five characters of the URI (the "gs://" scheme prefix).
                uri=self.audio["uri"][5:],
                project_id=self.project_id or translate_hook.project_id,
            )
            return translation
        except ValueError as e:
            self.log.error(
                'An error has been thrown from translate speech method:')
            self.log.error(e)
            raise AirflowException(e) from e
Ejemplo n.º 3
0
    def execute(self, context: "Context") -> None:
        """
        Download the source object, run the transform script on it and upload the output.

        The script is started with two extra trailing arguments: the path of the downloaded
        source file and the path of the local file whose content is uploaded afterwards.
        The combined stdout/stderr of the script is forwarded line by line to the task log.

        :param context: Airflow task context, forwarded to ``FileDetailsLink.persist``.
        :raises AirflowException: if the transform script exits with a non-zero return code.
        """
        hook = GCSHook(gcp_conn_id=self.gcp_conn_id,
                       impersonation_chain=self.impersonation_chain)

        with NamedTemporaryFile() as source_file, NamedTemporaryFile(
        ) as destination_file:
            self.log.info("Downloading file from %s", self.source_bucket)
            hook.download(bucket_name=self.source_bucket,
                          object_name=self.source_object,
                          filename=source_file.name)

            self.log.info("Starting the transformation")
            if isinstance(self.transform_script, str):
                cmd = [self.transform_script]
            else:
                # Copy the list: extending the original in place would leave the temporary
                # file paths stuck on self.transform_script after this call.
                cmd = list(self.transform_script)
            cmd += [source_file.name, destination_file.name]
            with subprocess.Popen(args=cmd,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT,
                                  close_fds=True) as process:
                self.log.info("Process output:")
                if process.stdout:
                    # readline returns b'' at end of stream, which stops the iteration.
                    for line in iter(process.stdout.readline, b''):
                        self.log.info(
                            line.decode(self.output_encoding).rstrip())

                process.wait()
                if process.returncode:
                    raise AirflowException(
                        f"Transform script failed: {process.returncode}")

            self.log.info(
                "Transformation succeeded. Output temporarily located at %s",
                destination_file.name)

            self.log.info("Uploading file to %s as %s",
                          self.destination_bucket, self.destination_object)
            FileDetailsLink.persist(
                context=context,
                task_instance=self,
                uri=f"{self.destination_bucket}/{self.destination_object}",
                project_id=hook.project_id,
            )
            hook.upload(
                bucket_name=self.destination_bucket,
                object_name=self.destination_object,
                filename=destination_file.name,
            )
Ejemplo n.º 4
0
    def execute(self, context: 'Context'):
        """
        Persist a link to the audio file, run speech recognition and return the result.

        :param context: Airflow task context, forwarded to ``FileDetailsLink.persist``.
        :return: the recognition response converted with ``MessageToDict``.
        """
        speech_hook = CloudSpeechToTextHook(
            gcp_conn_id=self.gcp_conn_id,
            impersonation_chain=self.impersonation_chain,
        )

        # Turn "gs://{BUCKET_NAME}/{FILE_NAME}" into "{BUCKET_NAME}/{FILE_NAME}".
        audio_path = self.audio["uri"][5:]
        link_project = self.project_id or speech_hook.project_id
        FileDetailsLink.persist(
            context=context,
            task_instance=self,
            uri=audio_path,
            project_id=link_project,
        )

        recognition = speech_hook.recognize_speech(
            config=self.config,
            audio=self.audio,
            retry=self.retry,
            timeout=self.timeout,
        )
        return MessageToDict(recognition)
Ejemplo n.º 5
0
 def execute(self, context: "Context") -> None:
     """
     Persist a link to the object, then insert the ACL entry through ``GCSHook``.

     :param context: Airflow task context, forwarded to ``FileDetailsLink.persist``.
     """
     gcs_hook = GCSHook(gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain)

     object_path = f"{self.bucket}/{self.object_name}"
     FileDetailsLink.persist(
         context=context,
         task_instance=self,
         uri=object_path,
         project_id=gcs_hook.project_id,
     )

     acl_arguments = {
         "bucket_name": self.bucket,
         "object_name": self.object_name,
         "entity": self.entity,
         "role": self.role,
         "generation": self.generation,
         "user_project": self.user_project,
     }
     gcs_hook.insert_object_acl(**acl_arguments)
Ejemplo n.º 6
0
 def execute(self, context: 'Context') -> None:
     """
     Validate the body, persist the instance and file links, then start the import.

     :param context: Airflow task context, forwarded to both ``persist`` calls.
     :return: whatever ``CloudSQLHook.import_instance`` returns.
     """
     self._validate_body_fields()
     sql_hook = CloudSQLHook(
         gcp_conn_id=self.gcp_conn_id,
         api_version=self.api_version,
         impersonation_chain=self.impersonation_chain,
     )

     instance_link_project = self.project_id or sql_hook.project_id
     CloudSQLInstanceLink.persist(
         context=context,
         task_instance=self,
         cloud_sql_instance=self.instance,
         project_id=instance_link_project,
     )

     # Drops the first five characters of the import URI (the "gs://" scheme prefix).
     import_path = self.body["importContext"]["uri"][5:]
     file_link_project = self.project_id or sql_hook.project_id
     FileDetailsLink.persist(
         context=context,
         task_instance=self,
         uri=import_path,
         project_id=file_link_project,
     )

     return sql_hook.import_instance(
         project_id=self.project_id,
         instance=self.instance,
         body=self.body,
     )
Ejemplo n.º 7
0
class CloudTranslateSpeechOperator(BaseOperator):
    """
    Recognizes speech in audio input and translates it.

    Note that it uses the first result from the recognition api response - the one with the highest confidence
    In order to see other possible results please use
    :ref:`howto/operator:CloudSpeechToTextRecognizeSpeechOperator`
    and
    :ref:`howto/operator:CloudTranslateTextOperator`
    separately

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:CloudTranslateSpeechOperator`

    See https://cloud.google.com/translate/docs/translating-text

    The execute method returns the translation result, or an empty dictionary when
    the recognition response contains no results.

    The translation result is a dictionary that typically contains
    the following keys (though not all will be present in all cases).

    * ``detectedSourceLanguage``: The detected language (as an
      ISO 639-1 language code) of the text.
    * ``translatedText``: The translation of the text into the
      target language.
    * ``input``: The corresponding input value.
    * ``model``: The model used to translate the text.

    Dictionary is set as XCom return value.

    :param audio: audio data to be recognized. See more:
        https://googleapis.github.io/google-cloud-python/latest/speech/gapic/v1/types.html#google.cloud.speech_v1.types.RecognitionAudio

    :param config: information to the recognizer that specifies how to process the request. See more:
        https://googleapis.github.io/google-cloud-python/latest/speech/gapic/v1/types.html#google.cloud.speech_v1.types.RecognitionConfig

    :param target_language: The language to translate results into. This is required by the API and defaults
        to the target language of the current instance.
        Check the list of available languages here: https://cloud.google.com/translate/docs/languages

    :param format_: (Optional) One of ``text`` or ``html``, to specify
        if the input text is plain text or HTML.

    :param source_language: (Optional) The language of the text to
        be translated.

    :param model: (Optional) The model used to translate the text, such
        as ``'base'`` or ``'nmt'``.

    :param project_id: Optional, Google Cloud Project ID. If set to None or missing,
        the default project_id from the Google Cloud connection is used.

    :param gcp_conn_id: Optional, The connection ID used to connect to Google Cloud.
        Defaults to 'google_cloud_default'.

    :param impersonation_chain: Optional service account to impersonate using short-term
        credentials, or chained list of accounts required to get the access_token
        of the last account in the list, which will be impersonated in the request.
        If set as a string, the account must grant the originating account
        the Service Account Token Creator IAM role.
        If set as a sequence, the identities from the list must grant
        Service Account Token Creator IAM role to the directly preceding identity, with first
        account from the list granting this role to the originating account (templated).

    """

    # [START translate_speech_template_fields]
    template_fields: Sequence[str] = (
        'target_language',
        'format_',
        'source_language',
        'model',
        'project_id',
        'gcp_conn_id',
        'impersonation_chain',
    )
    operator_extra_links = (FileDetailsLink(), )

    # [END translate_speech_template_fields]

    def __init__(
        self,
        *,
        audio: RecognitionAudio,
        config: RecognitionConfig,
        target_language: str,
        format_: str,
        source_language: Optional[str],
        model: str,
        project_id: Optional[str] = None,
        gcp_conn_id: str = 'google_cloud_default',
        impersonation_chain: Optional[Union[str, Sequence[str]]] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.audio = audio
        self.config = config
        self.target_language = target_language
        self.format_ = format_
        self.source_language = source_language
        self.model = model
        self.project_id = project_id
        self.gcp_conn_id = gcp_conn_id
        self.impersonation_chain = impersonation_chain

    def execute(self, context: 'Context') -> dict:
        """
        Recognize speech in the configured audio and translate the first transcript.

        Only the first alternative of the first recognition result is translated.

        :param context: Airflow task context, forwarded to ``FileDetailsLink.persist``.
        :return: the value returned by ``CloudTranslateHook.translate``, or an empty dict
            when the recognition response contains no results.
        :raises AirflowException: if the recognition response lacks an expected key, or if
            a ``ValueError`` is raised while translating or persisting the link.
        """
        speech_to_text_hook = CloudSpeechToTextHook(
            gcp_conn_id=self.gcp_conn_id,
            impersonation_chain=self.impersonation_chain,
        )
        translate_hook = CloudTranslateHook(
            gcp_conn_id=self.gcp_conn_id,
            impersonation_chain=self.impersonation_chain,
        )

        recognize_result = speech_to_text_hook.recognize_speech(
            config=self.config, audio=self.audio)
        recognize_dict = MessageToDict(recognize_result)

        self.log.info("Recognition operation finished")

        # MessageToDict leaves out empty fields by default, so an empty response may have
        # no 'results' key at all; treat a missing key the same as an empty list.
        if not recognize_dict.get('results'):
            self.log.info("No recognition results")
            return {}
        self.log.debug("Recognition result: %s", recognize_dict)

        try:
            transcript = recognize_dict['results'][0]['alternatives'][0][
                'transcript']
        except KeyError as key:
            # Chain the original error so the traceback shows which lookup failed.
            raise AirflowException(
                f"Wrong response '{recognize_dict}' returned - it should contain {key} field"
            ) from key

        try:
            translation = translate_hook.translate(
                values=transcript,
                target_language=self.target_language,
                format_=self.format_,
                source_language=self.source_language,
                model=self.model,
            )
            self.log.info('Translated output: %s', translation)
            FileDetailsLink.persist(
                context=context,
                task_instance=self,
                # Drops the first five characters of the URI (the "gs://" scheme prefix).
                uri=self.audio["uri"][5:],
                project_id=self.project_id or translate_hook.project_id,
            )
            return translation
        except ValueError as e:
            self.log.error(
                'An error has been thrown from translate speech method:')
            self.log.error(e)
            raise AirflowException(e) from e
Ejemplo n.º 8
0
class CloudSQLImportInstanceOperator(CloudSQLBaseOperator):
    """
    Import a SQL dump or CSV file stored in Cloud Storage into a Cloud SQL instance.

    CSV IMPORT:

    A CSV import is NOT idempotent. Importing the same file more than once duplicates
    the imported rows in the database, and the repeated import may fail if the data
    violates a unique constraint.

    SQL IMPORT:

    A SQL import is idempotent when the file was itself exported by Cloud SQL, because
    such an export contains 'DROP TABLE IF EXISTS' statements for every table that is
    going to be imported.

    For import files produced any other way idempotence is not guaranteed and has to be
    ensured in the SQL file itself.

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:CloudSQLImportInstanceOperator`

    :param instance: Cloud SQL instance ID. This does not include the project ID.
    :param body: The request body, as described in
        https://cloud.google.com/sql/docs/mysql/admin-api/v1beta4/instances/import#request-body
    :param project_id: Optional, Google Cloud Project ID. If set to None or missing,
            the default project_id from the Google Cloud connection is used.
    :param gcp_conn_id: The connection ID used to connect to Google Cloud.
    :param api_version: API version used (e.g. v1beta4).
    :param validate_body: Whether the body should be validated. Defaults to True.
    :param impersonation_chain: Optional service account to impersonate using short-term
        credentials, or chained list of accounts required to get the access_token
        of the last account in the list, which will be impersonated in the request.
        If set as a string, the account must grant the originating account
        the Service Account Token Creator IAM role.
        If set as a sequence, the identities from the list must grant
        Service Account Token Creator IAM role to the directly preceding identity, with first
        account from the list granting this role to the originating account (templated).
    """

    # [START gcp_sql_import_template_fields]
    template_fields: Sequence[str] = (
        'project_id',
        'instance',
        'body',
        'gcp_conn_id',
        'api_version',
        'impersonation_chain',
    )
    # [END gcp_sql_import_template_fields]
    ui_color = '#D3EDFB'
    operator_extra_links = (CloudSQLInstanceLink(), FileDetailsLink())

    def __init__(
        self,
        *,
        instance: str,
        body: dict,
        project_id: Optional[str] = None,
        gcp_conn_id: str = 'google_cloud_default',
        api_version: str = 'v1beta4',
        validate_body: bool = True,
        impersonation_chain: Optional[Union[str, Sequence[str]]] = None,
        **kwargs,
    ) -> None:
        # NOTE(review): these are assigned before the parent constructor runs, presumably
        # because the parent calls _validate_inputs(), which reads self.body - confirm.
        self.body = body
        self.validate_body = validate_body
        super().__init__(
            project_id=project_id,
            instance=instance,
            gcp_conn_id=gcp_conn_id,
            api_version=api_version,
            impersonation_chain=impersonation_chain,
            **kwargs,
        )

    def _validate_inputs(self) -> None:
        """Run the parent validation, then reject an empty ``body``."""
        super()._validate_inputs()
        if self.body:
            return
        raise AirflowException("The required parameter 'body' is empty")

    def _validate_body_fields(self) -> None:
        """Validate ``body`` against the import specification unless validation is disabled."""
        if not self.validate_body:
            return
        validator = GcpBodyFieldValidator(CLOUD_SQL_IMPORT_VALIDATION, api_version=self.api_version)
        validator.validate(self.body)

    def execute(self, context: 'Context') -> None:
        """
        Validate the body, persist the instance and file links, then start the import.

        :param context: Airflow task context, forwarded to both ``persist`` calls.
        :return: whatever ``CloudSQLHook.import_instance`` returns.
        """
        self._validate_body_fields()
        sql_hook = CloudSQLHook(
            gcp_conn_id=self.gcp_conn_id,
            api_version=self.api_version,
            impersonation_chain=self.impersonation_chain,
        )

        instance_link_project = self.project_id or sql_hook.project_id
        CloudSQLInstanceLink.persist(
            context=context,
            task_instance=self,
            cloud_sql_instance=self.instance,
            project_id=instance_link_project,
        )

        # Drops the first five characters of the import URI (the "gs://" scheme prefix).
        import_path = self.body["importContext"]["uri"][5:]
        file_link_project = self.project_id or sql_hook.project_id
        FileDetailsLink.persist(
            context=context,
            task_instance=self,
            uri=import_path,
            project_id=file_link_project,
        )

        return sql_hook.import_instance(
            project_id=self.project_id,
            instance=self.instance,
            body=self.body,
        )
Ejemplo n.º 9
0
class CloudTextToSpeechSynthesizeOperator(BaseOperator):
    """
    Synthesizes text to speech and stores it in Google Cloud Storage

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:CloudTextToSpeechSynthesizeOperator`

    :param input_data: text input to be synthesized. See more:
        https://googleapis.github.io/google-cloud-python/latest/texttospeech/gapic/v1/types.html#google.cloud.texttospeech_v1.types.SynthesisInput
    :param voice: configuration of voice to be used in synthesis. See more:
        https://googleapis.github.io/google-cloud-python/latest/texttospeech/gapic/v1/types.html#google.cloud.texttospeech_v1.types.VoiceSelectionParams
    :param audio_config: configuration of the synthesized audio. See more:
        https://googleapis.github.io/google-cloud-python/latest/texttospeech/gapic/v1/types.html#google.cloud.texttospeech_v1.types.AudioConfig
    :param target_bucket_name: name of the GCS bucket in which output file should be stored
    :param target_filename: filename of the output file.
    :param project_id: Optional, Google Cloud Project ID where the Compute
        Engine Instance exists. If set to None or missing, the default project_id from the Google Cloud
        connection is used.
    :param gcp_conn_id: Optional, The connection ID used to connect to Google Cloud.
        Defaults to 'google_cloud_default'.
    :param retry: (Optional) A retry object used to retry requests. If None is specified,
            requests will not be retried.
    :param timeout: (Optional) The amount of time, in seconds, to wait for the request to complete.
        Note that if retry is specified, the timeout applies to each individual attempt.
    :param impersonation_chain: Optional service account to impersonate using short-term
        credentials, or chained list of accounts required to get the access_token
        of the last account in the list, which will be impersonated in the request.
        If set as a string, the account must grant the originating account
        the Service Account Token Creator IAM role.
        If set as a sequence, the identities from the list must grant
        Service Account Token Creator IAM role to the directly preceding identity, with first
        account from the list granting this role to the originating account (templated).
    """

    # [START gcp_text_to_speech_synthesize_template_fields]
    template_fields: Sequence[str] = (
        "input_data",
        "voice",
        "audio_config",
        "project_id",
        "gcp_conn_id",
        "target_bucket_name",
        "target_filename",
        "impersonation_chain",
    )
    # [END gcp_text_to_speech_synthesize_template_fields]
    operator_extra_links = (FileDetailsLink(), )

    def __init__(
        self,
        *,
        input_data: Union[Dict, SynthesisInput],
        voice: Union[Dict, VoiceSelectionParams],
        audio_config: Union[Dict, AudioConfig],
        target_bucket_name: str,
        target_filename: str,
        project_id: Optional[str] = None,
        gcp_conn_id: str = "google_cloud_default",
        retry: Union[Retry, _MethodDefault] = DEFAULT,
        timeout: Optional[float] = None,
        impersonation_chain: Optional[Union[str, Sequence[str]]] = None,
        **kwargs,
    ) -> None:
        self.input_data = input_data
        self.voice = voice
        self.audio_config = audio_config
        self.target_bucket_name = target_bucket_name
        self.target_filename = target_filename
        self.project_id = project_id
        self.gcp_conn_id = gcp_conn_id
        self.retry = retry
        self.timeout = timeout
        # Validation only reads the attributes assigned above.
        self._validate_inputs()
        self.impersonation_chain = impersonation_chain
        super().__init__(**kwargs)

    def _validate_inputs(self) -> None:
        """
        Reject required parameters that were passed as an empty string.

        :raises AirflowException: naming the first required parameter that equals ``""``.
        """
        for parameter in [
                "input_data",
                "voice",
                "audio_config",
                "target_bucket_name",
                "target_filename",
        ]:
            if getattr(self, parameter) == "":
                raise AirflowException(
                    f"The required parameter '{parameter}' is empty")

    def execute(self, context: 'Context') -> None:
        """
        Synthesize speech and upload the resulting audio to Google Cloud Storage.

        The audio content returned by ``CloudTextToSpeechHook.synthesize_speech`` is written
        to a local temporary file, uploaded to ``target_bucket_name`` as ``target_filename``,
        and a ``FileDetailsLink`` for the uploaded object is persisted afterwards.

        :param context: Airflow task context, forwarded to ``FileDetailsLink.persist``.
        """
        hook = CloudTextToSpeechHook(
            gcp_conn_id=self.gcp_conn_id,
            impersonation_chain=self.impersonation_chain,
        )
        result = hook.synthesize_speech(
            input_data=self.input_data,
            voice=self.voice,
            audio_config=self.audio_config,
            retry=self.retry,
            timeout=self.timeout,
        )
        with NamedTemporaryFile() as temp_file:
            temp_file.write(result.audio_content)
            # The upload is given the file's path, not this handle, so the buffered bytes
            # have to be pushed to disk first; otherwise a truncated object may be uploaded.
            temp_file.flush()
            cloud_storage_hook = GCSHook(
                gcp_conn_id=self.gcp_conn_id,
                impersonation_chain=self.impersonation_chain,
            )
            cloud_storage_hook.upload(
                bucket_name=self.target_bucket_name,
                object_name=self.target_filename,
                filename=temp_file.name,
            )
            FileDetailsLink.persist(
                context=context,
                task_instance=self,
                uri=f"{self.target_bucket_name}/{self.target_filename}",
                project_id=cloud_storage_hook.project_id,
            )
Ejemplo n.º 10
0
class CloudSpeechToTextRecognizeSpeechOperator(BaseOperator):
    """
    Run speech recognition on an audio file and return the recognized text.

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:CloudSpeechToTextRecognizeSpeechOperator`

    :param config: information to the recognizer that specifies how to process the request. See more:
        https://googleapis.github.io/google-cloud-python/latest/speech/gapic/v1/types.html#google.cloud.speech_v1.types.RecognitionConfig
    :param audio: audio data to be recognized. See more:
        https://googleapis.github.io/google-cloud-python/latest/speech/gapic/v1/types.html#google.cloud.speech_v1.types.RecognitionAudio
    :param project_id: Optional, Google Cloud Project ID where the Compute
        Engine Instance exists. If set to None or missing, the default project_id from the Google Cloud
        connection is used.
    :param gcp_conn_id: Optional, The connection ID used to connect to Google Cloud.
        Defaults to 'google_cloud_default'.
    :param retry: (Optional) A retry object used to retry requests. If None is specified,
            requests will not be retried.
    :param timeout: (Optional) The amount of time, in seconds, to wait for the request to complete.
        Note that if retry is specified, the timeout applies to each individual attempt.
    :param impersonation_chain: Optional service account to impersonate using short-term
        credentials, or chained list of accounts required to get the access_token
        of the last account in the list, which will be impersonated in the request.
        If set as a string, the account must grant the originating account
        the Service Account Token Creator IAM role.
        If set as a sequence, the identities from the list must grant
        Service Account Token Creator IAM role to the directly preceding identity, with first
        account from the list granting this role to the originating account (templated).
    """

    # [START gcp_speech_to_text_synthesize_template_fields]
    template_fields: Sequence[str] = (
        "audio",
        "config",
        "project_id",
        "gcp_conn_id",
        "timeout",
        "impersonation_chain",
    )
    # [END gcp_speech_to_text_synthesize_template_fields]
    operator_extra_links = (FileDetailsLink(), )

    def __init__(
        self,
        *,
        audio: RecognitionAudio,
        config: RecognitionConfig,
        project_id: Optional[str] = None,
        gcp_conn_id: str = "google_cloud_default",
        retry: Union[Retry, _MethodDefault] = DEFAULT,
        timeout: Optional[float] = None,
        impersonation_chain: Optional[Union[str, Sequence[str]]] = None,
        **kwargs,
    ) -> None:
        self.audio = audio
        self.config = config
        self.project_id = project_id
        self.gcp_conn_id = gcp_conn_id
        self.retry = retry
        self.timeout = timeout
        # Validation only reads ``audio`` and ``config``, both assigned above.
        self._validate_inputs()
        self.impersonation_chain = impersonation_chain
        super().__init__(**kwargs)

    def _validate_inputs(self) -> None:
        """Raise ``AirflowException`` if ``audio`` or ``config`` equals the empty string."""
        for required in ("audio", "config"):
            if getattr(self, required) == "":
                raise AirflowException(f"The required parameter '{required}' is empty")

    def execute(self, context: 'Context'):
        """
        Persist a link to the audio file, run speech recognition and return the result.

        :param context: Airflow task context, forwarded to ``FileDetailsLink.persist``.
        :return: the recognition response converted with ``MessageToDict``.
        """
        speech_hook = CloudSpeechToTextHook(
            gcp_conn_id=self.gcp_conn_id,
            impersonation_chain=self.impersonation_chain,
        )

        # Turn "gs://{BUCKET_NAME}/{FILE_NAME}" into "{BUCKET_NAME}/{FILE_NAME}".
        audio_path = self.audio["uri"][5:]
        link_project = self.project_id or speech_hook.project_id
        FileDetailsLink.persist(
            context=context,
            task_instance=self,
            uri=audio_path,
            project_id=link_project,
        )

        recognition = speech_hook.recognize_speech(
            config=self.config,
            audio=self.audio,
            retry=self.retry,
            timeout=self.timeout,
        )
        return MessageToDict(recognition)
Ejemplo n.º 11
0
class GCSFileTransformOperator(BaseOperator):
    """
    Copies data from a source GCS location to a temporary location on the
    local filesystem. Runs a transformation on this file as specified by
    the transformation script and uploads the output to a destination bucket.
    If the output bucket is not specified the original file will be
    overwritten.

    The locations of the source and the destination files in the local
    filesystem are provided as the first and second arguments to the
    transformation script. The transformation script is expected to read the
    data from source, transform it and write the output to the local
    destination file.

    :param source_bucket: The bucket to locate the source_object. (templated)
    :param source_object: The key to be retrieved from GCS. (templated)
    :param destination_bucket: The bucket to upload the key after transformation.
        If not provided, source_bucket will be used. (templated)
    :param destination_object: The key to be written in GCS.
        If not provided, source_object will be used. (templated)
    :param transform_script: location of the executable transformation script or list of arguments
        passed to subprocess ex. `['python', 'script.py', 10]`. (templated)
    :param gcp_conn_id: The connection ID to use connecting to Google Cloud.
    :param impersonation_chain: Optional service account to impersonate using short-term
        credentials, or chained list of accounts required to get the access_token
        of the last account in the list, which will be impersonated in the request.
        If set as a string, the account must grant the originating account
        the Service Account Token Creator IAM role.
        If set as a sequence, the identities from the list must grant
        Service Account Token Creator IAM role to the directly preceding identity, with first
        account from the list granting this role to the originating account (templated).
    """

    template_fields: Sequence[str] = (
        'source_bucket',
        'source_object',
        'destination_bucket',
        'destination_object',
        'transform_script',
        'impersonation_chain',
    )
    operator_extra_links = (FileDetailsLink(), )

    def __init__(
        self,
        *,
        source_bucket: str,
        source_object: str,
        transform_script: Union[str, List[str]],
        destination_bucket: Optional[str] = None,
        destination_object: Optional[str] = None,
        gcp_conn_id: str = "google_cloud_default",
        impersonation_chain: Optional[Union[str, Sequence[str]]] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.source_bucket = source_bucket
        self.source_object = source_object
        # A missing destination falls back to the source, i.e. the object is overwritten.
        self.destination_bucket = destination_bucket or self.source_bucket
        self.destination_object = destination_object or self.source_object

        self.gcp_conn_id = gcp_conn_id
        self.transform_script = transform_script
        # Encoding used to decode the script's output before it is logged.
        self.output_encoding = sys.getdefaultencoding()
        self.impersonation_chain = impersonation_chain

    def execute(self, context: "Context") -> None:
        """
        Download the source object, run the transform script on it and upload the output.

        The script is started with two extra trailing arguments: the path of the downloaded
        source file and the path of the local file whose content is uploaded afterwards.
        The combined stdout/stderr of the script is forwarded line by line to the task log.

        :param context: Airflow task context, forwarded to ``FileDetailsLink.persist``.
        :raises AirflowException: if the transform script exits with a non-zero return code.
        """
        hook = GCSHook(gcp_conn_id=self.gcp_conn_id,
                       impersonation_chain=self.impersonation_chain)

        with NamedTemporaryFile() as source_file, NamedTemporaryFile(
        ) as destination_file:
            self.log.info("Downloading file from %s", self.source_bucket)
            hook.download(bucket_name=self.source_bucket,
                          object_name=self.source_object,
                          filename=source_file.name)

            self.log.info("Starting the transformation")
            if isinstance(self.transform_script, str):
                cmd = [self.transform_script]
            else:
                # Copy the list: extending the original in place would leave the temporary
                # file paths stuck on self.transform_script after this call.
                cmd = list(self.transform_script)
            cmd += [source_file.name, destination_file.name]
            with subprocess.Popen(args=cmd,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT,
                                  close_fds=True) as process:
                self.log.info("Process output:")
                if process.stdout:
                    # readline returns b'' at end of stream, which stops the iteration.
                    for line in iter(process.stdout.readline, b''):
                        self.log.info(
                            line.decode(self.output_encoding).rstrip())

                process.wait()
                if process.returncode:
                    raise AirflowException(
                        f"Transform script failed: {process.returncode}")

            self.log.info(
                "Transformation succeeded. Output temporarily located at %s",
                destination_file.name)

            self.log.info("Uploading file to %s as %s",
                          self.destination_bucket, self.destination_object)
            FileDetailsLink.persist(
                context=context,
                task_instance=self,
                uri=f"{self.destination_bucket}/{self.destination_object}",
                project_id=hook.project_id,
            )
            hook.upload(
                bucket_name=self.destination_bucket,
                object_name=self.destination_object,
                filename=destination_file.name,
            )
Ejemplo n.º 12
0
class GCSObjectCreateAclEntryOperator(BaseOperator):
    """
    Add an access-control (ACL) entry to a single Google Cloud Storage object.

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:GCSObjectCreateAclEntryOperator`

    :param bucket: Name of the bucket that holds the object.
    :param object_name: Name of the object the ACL entry is created on. For
        information about how to URL encode object names to be path safe, see:
        https://cloud.google.com/storage/docs/json_api/#encoding
    :param entity: The entity that receives the permission, written in one of
        these forms: user-userId, user-email, group-groupId, group-email,
        domain-domain, project-team-projectId, allUsers, allAuthenticatedUsers
    :param role: The access permission granted to the entity.
        Acceptable values are: "OWNER", "READER".
    :param generation: Optional. When given, selects a specific revision of
        the object.
    :param user_project: (Optional) The project to be billed for this request.
        Required for Requester Pays buckets.
    :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud.
    :param impersonation_chain: Optional service account to impersonate using
        short-term credentials, or a chained list of accounts required to get
        the access_token of the last account in the list, which will be
        impersonated in the request.
        If set as a string, the account must grant the originating account
        the Service Account Token Creator IAM role.
        If set as a sequence, each identity in the list must grant the
        Service Account Token Creator IAM role to the directly preceding
        identity, with the first account in the list granting this role to
        the originating account (templated).
    """

    # [START gcs_object_create_acl_template_fields]
    template_fields: Sequence[str] = (
        'bucket', 'object_name', 'entity', 'generation', 'role',
        'user_project', 'impersonation_chain',
    )
    # [END gcs_object_create_acl_template_fields]
    operator_extra_links = (FileDetailsLink(),)

    def __init__(
        self,
        *,
        bucket: str,
        object_name: str,
        entity: str,
        role: str,
        generation: Optional[int] = None,
        user_project: Optional[str] = None,
        gcp_conn_id: str = 'google_cloud_default',
        impersonation_chain: Optional[Union[str, Sequence[str]]] = None,
        **kwargs,
    ) -> None:
        """Store the ACL request parameters; remaining kwargs go to the base class."""
        super().__init__(**kwargs)

        # Connection / credential settings used to build the hook.
        self.gcp_conn_id = gcp_conn_id
        self.impersonation_chain = impersonation_chain

        # Which object (and optionally which revision) the entry applies to.
        self.bucket = bucket
        self.object_name = object_name
        self.generation = generation
        self.user_project = user_project

        # The permission itself: who gets it and at what level.
        self.entity = entity
        self.role = role

    def execute(self, context: "Context") -> None:
        """Persist the object's details link, then insert the ACL entry via the hook."""
        gcs_hook = GCSHook(gcp_conn_id=self.gcp_conn_id,
                           impersonation_chain=self.impersonation_chain)

        # The link is persisted before the ACL call is issued.
        object_uri = f"{self.bucket}/{self.object_name}"
        FileDetailsLink.persist(context=context,
                                task_instance=self,
                                uri=object_uri,
                                project_id=gcs_hook.project_id)

        acl_request = {
            'bucket_name': self.bucket,
            'object_name': self.object_name,
            'entity': self.entity,
            'role': self.role,
            'generation': self.generation,
            'user_project': self.user_project,
        }
        gcs_hook.insert_object_acl(**acl_request)