Example #1
    def _generate_connection_uri(self):
        if self.use_proxy:
            if self.sql_proxy_use_tcp:
                if not self.sql_proxy_tcp_port:
                    self.reserve_free_tcp_port()
            if not self.sql_proxy_unique_path:
                self.sql_proxy_unique_path = self._generate_unique_path()

        database_uris = CONNECTION_URIS[self.database_type]
        ssl_spec = None
        socket_path = None
        if self.use_proxy:
            proxy_uris = database_uris['proxy']
            if self.sql_proxy_use_tcp:
                format_string = proxy_uris['tcp']
            else:
                format_string = proxy_uris['socket']
                socket_path = \
                    "{sql_proxy_socket_path}/{instance_socket_name}".format(
                        sql_proxy_socket_path=self.sql_proxy_unique_path,
                        instance_socket_name=self._get_instance_socket_name()
                    )
        else:
            public_uris = database_uris['public']
            if self.use_ssl:
                format_string = public_uris['ssl']
                ssl_spec = {
                    'cert': self.sslcert,
                    'key': self.sslkey,
                    'ca': self.sslrootcert
                }
            else:
                format_string = public_uris['non-ssl']
        if not self.user:
            raise AirflowException(
                "The login parameter needs to be set in connection")
        if not self.public_ip:
            raise AirflowException(
                "The location parameter needs to be set in connection")
        if not self.password:
            raise AirflowException(
                "The password parameter needs to be set in connection")
        if not self.database:
            raise AirflowException(
                "The database parameter needs to be set in connection")

        connection_uri = format_string.format(
            user=quote_plus(self.user) if self.user else '',
            password=quote_plus(self.password) if self.password else '',
            database=quote_plus(self.database) if self.database else '',
            public_ip=self.public_ip,
            public_port=self.public_port,
            proxy_port=self.sql_proxy_tcp_port,
            socket_path=self._quote(socket_path),
            ssl_spec=self._quote(json.dumps(ssl_spec)) if ssl_spec else '',
            client_cert_file=self._quote(self.sslcert) if self.sslcert else '',
            client_key_file=self._quote(self.sslkey) if self.sslkey else '',
            server_ca_file=self._quote(
                self.sslrootcert) if self.sslrootcert else '')
        self.log.info(
            "DB connection URI %s",
            connection_uri.replace(
                quote_plus(self.password) if self.password else 'PASSWORD',
                'XXXXXXXXXXXX'))
        return connection_uri
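
For context, each format_string fetched from CONNECTION_URIS is expected to contain the named placeholders that format() fills in above. A minimal sketch of what one such entry could look like (hypothetical values, not the hook's actual dictionary):

from urllib.parse import quote_plus

# Hypothetical 'public'/'non-ssl' template for a PostgreSQL instance; the real
# hook defines additional variants (proxy/tcp, proxy/socket, public/ssl, ...).
CONNECTION_URIS = {
    'postgres': {
        'public': {
            'non-ssl': "postgresql://{user}:{password}@{public_ip}:{public_port}/{database}"
        }
    }
}

format_string = CONNECTION_URIS['postgres']['public']['non-ssl']
print(format_string.format(
    user=quote_plus('airflow'),
    password=quote_plus('s3cr3t'),
    database=quote_plus('testdb'),
    public_ip='203.0.113.10',
    public_port=5432))
# postgresql://airflow:s3cr3t@203.0.113.10:5432/testdb
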
Example #2
    def _validate_inputs(self):
        super(CloudSqlInstanceImportOperator, self)._validate_inputs()
        if not self.body:
            raise AirflowException("The required parameter 'body' is empty")
Example #3
    def _validate_inputs(self):
        for attr_name in self.REQUIRED_ATTRIBUTES:
            if not getattr(self, attr_name):
                raise AirflowException('Empty parameter: {}'.format(attr_name))
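
A minimal sketch of how an operator class might plug into this generic validator; the class name and attribute list below are illustrative, not taken from Airflow:

from airflow.exceptions import AirflowException


class ExampleTransferOperator:
    # Illustrative list of attributes that must be non-empty before execution.
    REQUIRED_ATTRIBUTES = ('project_id', 'job_name')

    def __init__(self, project_id, job_name):
        self.project_id = project_id
        self.job_name = job_name
        self._validate_inputs()

    def _validate_inputs(self):
        for attr_name in self.REQUIRED_ATTRIBUTES:
            if not getattr(self, attr_name):
                raise AirflowException('Empty parameter: {}'.format(attr_name))


ExampleTransferOperator(project_id='my-project', job_name='nightly-sync')  # passes
ExampleTransferOperator(project_id='my-project', job_name='')              # raises AirflowException
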
Example #4
        ELASTICSEARCH_END_OF_LOG_MARK: str = conf.get('elasticsearch',
                                                       'END_OF_LOG_MARK')
        ELASTICSEARCH_WRITE_STDOUT: str = conf.get('elasticsearch',
                                                   'WRITE_STDOUT')
        ELASTICSEARCH_JSON_FORMAT: str = conf.get('elasticsearch',
                                                  'JSON_FORMAT')
        ELASTICSEARCH_JSON_FIELDS: str = conf.get('elasticsearch',
                                                  'JSON_FIELDS')

        ELASTIC_REMOTE_HANDLERS: Dict[str, Dict[str, str]] = {
            'task': {
                'class':
                'airflow.utils.log.es_task_handler.ElasticsearchTaskHandler',
                'formatter': 'airflow',
                'base_log_folder': str(os.path.expanduser(BASE_LOG_FOLDER)),
                'log_id_template': ELASTICSEARCH_LOG_ID_TEMPLATE,
                'filename_template': FILENAME_TEMPLATE,
                'end_of_log_mark': ELASTICSEARCH_END_OF_LOG_MARK,
                'host': ELASTICSEARCH_HOST,
                'write_stdout': ELASTICSEARCH_WRITE_STDOUT,
                'json_format': ELASTICSEARCH_JSON_FORMAT,
                'json_fields': ELASTICSEARCH_JSON_FIELDS
            },
        }

        DEFAULT_LOGGING_CONFIG['handlers'].update(ELASTIC_REMOTE_HANDLERS)
    else:
        raise AirflowException(
            "Incorrect remote log configuration. Please check the configuration of option 'host' in "
            "section 'elasticsearch' if you are using Elasticsearch. Otherwise, check the "
            "'remote_base_log_folder' option in the 'core' section.")
Example #5
    def _validate_inputs(self):
        super(CloudSqlInstanceDatabaseDeleteOperator, self)._validate_inputs()
        if not self.database:
            raise AirflowException(
                "The required parameter 'database' is empty")
Example #6
    def _validate_inputs(self):
        if not self.filter:
            raise AirflowException(
                "The required parameter 'filter' is empty or None")
Example #7
    def _validate_inputs(self):
        if not self.operation_name:
            raise AirflowException(
                "The required parameter 'operation_name' is empty or None")
Example #8
    def __init__(self, body):
        if not body:
            raise AirflowException(
                "The required parameter 'body' is empty or None")

        self.body = body
Example #9
    def _validate_inputs(self):
        TransferJobValidator(body=self.body).validate_body()
        if not self.job_name:
            raise AirflowException(
                "The required parameter 'job_name' is empty or None")
Example #10
    def _restrict_empty_body(self):
        if not self.body:
            raise AirflowException(
                "The required parameter 'body' is empty or None")
Example #11
    def terminate_cluster(self, clusterId):
        try:
            self.emr_connection.terminate_job_flows(JobFlowIds=[clusterId])
        except Exception:
            self.logger.error("Error deleting the cluster", exc_info=True)
            raise AirflowException("Failed to terminate the EMR cluster")
Example #12
    def _raise_ex_unable_to_determine_name(self):
        raise AirflowException(
            "Unable to determine the {label} name. Please either set the name directly in the {label} "
            "object or provide the `location` and `{id_label}` parameters.".format(
                label=self.label, id_label=self.id_label))
Example #13
def _check_for_error(response: Dict) -> None:
    if "error" in response:
        raise AirflowException(response)
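
A small usage sketch for the helper above; the response payloads are illustrative:

ok_response = {'result': 'success', 'items': []}
bad_response = {'error': 'invalid_auth'}

_check_for_error(ok_response)   # returns None, nothing raised
_check_for_error(bad_response)  # raises AirflowException({'error': 'invalid_auth'})
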
Example #14
    def run_ssh_command_and_return_output(self, command) -> (int, str):
        """
        Open an SSH connection and execute a command.

        Returns the exit status and output from stdout
        """
        # Copied from ssh_operator.py . It's not reusable from there.
        try:
            if self.ssh_conn_id:
                if self.ssh_hook and isinstance(self.ssh_hook, SSHHook):
                    self.log.info(
                        "ssh_conn_id is ignored when ssh_hook is provided.")
                else:
                    self.log.info("ssh_hook is not provided or invalid. " +
                                  "Trying ssh_conn_id to create SSHHook.")
                    self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id,
                                            timeout=self.timeout)

            if not self.ssh_hook:
                raise AirflowException(
                    "Cannot operate without ssh_hook or ssh_conn_id.")

            if not command:
                raise AirflowException("SSH command not specified. Aborting.")

            with self.ssh_hook.get_conn() as ssh_client:
                self.log.info("Running command: %s", command)

                # execute the command with the configured timeout
                stdin, stdout, stderr = ssh_client.exec_command(
                    command=command,
                    get_pty=False,
                    timeout=self.timeout,
                )
                # get channels
                channel = stdout.channel

                # closing stdin
                stdin.close()
                channel.shutdown_write()

                agg_stdout = b''
                agg_stderr = b''

                # capture any initial output in case channel is closed already
                stdout_buffer_length = len(stdout.channel.in_buffer)

                if stdout_buffer_length > 0:
                    agg_stdout += stdout.channel.recv(stdout_buffer_length)

                # read from both stdout and stderr
                while not channel.closed or \
                        channel.recv_ready() or \
                        channel.recv_stderr_ready():
                    readq, _, _ = select([channel], [], [], self.timeout)
                    for c in readq:
                        if c.recv_ready():
                            line = stdout.channel.recv(len(c.in_buffer))
                            agg_stdout += line
                            self.log.info(line.decode('utf-8').strip('\n'))
                        if c.recv_stderr_ready():
                            line = stderr.channel.recv_stderr(
                                len(c.in_stderr_buffer))
                            agg_stderr += line
                            self.log.warning(line.decode('utf-8').strip('\n'))
                    if stdout.channel.exit_status_ready() \
                            and not stderr.channel.recv_stderr_ready() \
                            and not stdout.channel.recv_ready():
                        stdout.channel.shutdown_read()
                        stdout.channel.close()
                        break

                stdout.close()
                stderr.close()

                exit_status = stdout.channel.recv_exit_status()

                return exit_status, agg_stdout.decode('utf-8')
        except EOFError:
            raise
        except Exception as e:
            raise AirflowException(
                "PBS Job Completion sensor error: {0}".format(str(e)))
Example #15
    def get_conn(self) -> paramiko.SSHClient:
        """Return SSH connection."""
        self._load_connection_config()
        if not self.project_id:
            self.project_id = self._compute_hook.project_id

        missing_fields = [
            k for k in ["instance_name", "zone", "project_id"]
            if not getattr(self, k)
        ]
        if missing_fields:
            raise AirflowException(
                f"Required parameters are missing: {missing_fields}. These parameters can be passed "
                "either as keyword parameters or as extra fields in the Airflow connection "
                "definition. Currently they are set in neither place!"
            )

        self.log.info(
            "Connecting to instance: instance_name=%s, user=%s, zone=%s, "
            "use_internal_ip=%s, use_iap_tunnel=%s, use_os_login=%s",
            self.instance_name,
            self.user,
            self.zone,
            self.use_internal_ip,
            self.use_iap_tunnel,
            self.use_oslogin,
        )
        if not self.hostname:
            hostname = self._compute_hook.get_instance_address(
                zone=self.zone,
                resource_id=self.instance_name,
                project_id=self.project_id,
                use_internal_ip=self.use_internal_ip or self.use_iap_tunnel,
            )
        else:
            hostname = self.hostname

        privkey, pubkey = self._generate_ssh_key(self.user)
        if self.use_oslogin:
            user = self._authorize_os_login(pubkey)
        else:
            user = self.user
            self._authorize_compute_engine_instance_metadata(pubkey)

        proxy_command = None
        if self.use_iap_tunnel:
            proxy_command_args = [
                'gcloud',
                'compute',
                'start-iap-tunnel',
                str(self.instance_name),
                '22',
                '--listen-on-stdin',
                f'--project={self.project_id}',
                f'--zone={self.zone}',
                '--verbosity=warning',
            ]
            proxy_command = " ".join(
                shlex.quote(arg) for arg in proxy_command_args)

        sshclient = self._connect_to_instance(user, hostname, privkey,
                                              proxy_command)
        return sshclient
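
A minimal usage sketch, assuming this get_conn belongs to the Google provider's ComputeEngineSSHHook; the constructor arguments mirror the attributes the method reads from self, and the values are illustrative:

from airflow.providers.google.cloud.hooks.compute_ssh import ComputeEngineSSHHook

hook = ComputeEngineSSHHook(
    instance_name='my-vm',            # illustrative
    zone='europe-west1-b',            # illustrative
    project_id='my-gcp-project',      # illustrative
    use_iap_tunnel=True,
)
client = hook.get_conn()              # paramiko.SSHClient
_, stdout, _ = client.exec_command('uname -a')
print(stdout.read().decode('utf-8'))
client.close()
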
Example #16
    def execute(self, context):
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to,
                               location=self.location)

        if not self.schema_fields:
            if self.schema_object and self.source_format != 'DATASTORE_BACKUP':
                gcs_hook = GCSHook(
                    google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                    delegate_to=self.delegate_to)
                schema_fields = json.loads(
                    gcs_hook.download(self.bucket,
                                      self.schema_object).decode("utf-8"))
            elif self.schema_object is None and self.autodetect is False:
                raise AirflowException(
                    'At least one of `schema_fields`, '
                    '`schema_object`, or `autodetect` must be passed.')
            else:
                schema_fields = None

        else:
            schema_fields = self.schema_fields

        source_uris = [
            'gs://{}/{}'.format(self.bucket, source_object)
            for source_object in self.source_objects
        ]
        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        if self.external_table:
            cursor.create_external_table(
                external_project_dataset_table=self.destination_project_dataset_table,
                schema_fields=schema_fields,
                source_uris=source_uris,
                source_format=self.source_format,
                compression=self.compression,
                skip_leading_rows=self.skip_leading_rows,
                field_delimiter=self.field_delimiter,
                max_bad_records=self.max_bad_records,
                quote_character=self.quote_character,
                ignore_unknown_values=self.ignore_unknown_values,
                allow_quoted_newlines=self.allow_quoted_newlines,
                allow_jagged_rows=self.allow_jagged_rows,
                encoding=self.encoding,
                src_fmt_configs=self.src_fmt_configs,
                encryption_configuration=self.encryption_configuration)
        else:
            cursor.run_load(
                destination_project_dataset_table=self.destination_project_dataset_table,
                schema_fields=schema_fields,
                source_uris=source_uris,
                source_format=self.source_format,
                autodetect=self.autodetect,
                create_disposition=self.create_disposition,
                skip_leading_rows=self.skip_leading_rows,
                write_disposition=self.write_disposition,
                field_delimiter=self.field_delimiter,
                max_bad_records=self.max_bad_records,
                quote_character=self.quote_character,
                ignore_unknown_values=self.ignore_unknown_values,
                allow_quoted_newlines=self.allow_quoted_newlines,
                allow_jagged_rows=self.allow_jagged_rows,
                encoding=self.encoding,
                schema_update_options=self.schema_update_options,
                src_fmt_configs=self.src_fmt_configs,
                time_partitioning=self.time_partitioning,
                cluster_fields=self.cluster_fields,
                encryption_configuration=self.encryption_configuration)

        if self.max_id_key:
            cursor.execute('SELECT MAX({}) FROM {}'.format(
                self.max_id_key, self.destination_project_dataset_table))
            row = cursor.fetchone()
            max_id = row[0] if row[0] else 0
            self.log.info('Loaded BQ data with max %s.%s=%s',
                          self.destination_project_dataset_table,
                          self.max_id_key, max_id)
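
The execute method above reads all of its configuration from operator attributes (bucket, source_objects, destination_project_dataset_table, schema_object, max_id_key, ...). A hedged DAG sketch, assuming the method belongs to the classic GoogleCloudStorageToBigQueryOperator; bucket, object and table names are illustrative:

from datetime import datetime

from airflow import DAG
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator

with DAG('gcs_to_bq_example', start_date=datetime(2021, 1, 1),
         schedule_interval=None) as dag:
    load_sales = GoogleCloudStorageToBigQueryOperator(
        task_id='load_sales',
        bucket='my-bucket',                                           # illustrative
        source_objects=['data/sales_*.csv'],                          # illustrative
        destination_project_dataset_table='my_project.sales.daily',   # illustrative
        schema_object='schemas/sales.json',                           # illustrative
        source_format='CSV',
        skip_leading_rows=1,
        write_disposition='WRITE_TRUNCATE',
        max_id_key='sale_id',  # triggers the SELECT MAX() query at the end of execute()
    )
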