def _generate_connection_uri(self):
    if self.use_proxy:
        if self.sql_proxy_use_tcp:
            if not self.sql_proxy_tcp_port:
                self.reserve_free_tcp_port()
        if not self.sql_proxy_unique_path:
            self.sql_proxy_unique_path = self._generate_unique_path()

    database_uris = CONNECTION_URIS[self.database_type]
    ssl_spec = None
    socket_path = None
    if self.use_proxy:
        proxy_uris = database_uris['proxy']
        if self.sql_proxy_use_tcp:
            format_string = proxy_uris['tcp']
        else:
            format_string = proxy_uris['socket']
            socket_path = \
                "{sql_proxy_socket_path}/{instance_socket_name}".format(
                    sql_proxy_socket_path=self.sql_proxy_unique_path,
                    instance_socket_name=self._get_instance_socket_name()
                )
    else:
        public_uris = database_uris['public']
        if self.use_ssl:
            format_string = public_uris['ssl']
            ssl_spec = {
                'cert': self.sslcert,
                'key': self.sslkey,
                'ca': self.sslrootcert
            }
        else:
            format_string = public_uris['non-ssl']

    if not self.user:
        raise AirflowException(
            "The login parameter needs to be set in connection")
    if not self.public_ip:
        raise AirflowException(
            "The location parameter needs to be set in connection")
    if not self.password:
        raise AirflowException(
            "The password parameter needs to be set in connection")
    if not self.database:
        raise AirflowException(
            "The database parameter needs to be set in connection")

    connection_uri = format_string.format(
        user=quote_plus(self.user) if self.user else '',
        password=quote_plus(self.password) if self.password else '',
        database=quote_plus(self.database) if self.database else '',
        public_ip=self.public_ip,
        public_port=self.public_port,
        proxy_port=self.sql_proxy_tcp_port,
        socket_path=self._quote(socket_path),
        ssl_spec=self._quote(json.dumps(ssl_spec)) if ssl_spec else '',
        client_cert_file=self._quote(self.sslcert) if self.sslcert else '',
        client_key_file=self._quote(self.sslkey) if self.sslcert else '',
        server_ca_file=self._quote(self.sslrootcert if self.sslcert else ''))
    self.log.info(
        "DB connection URI %s",
        connection_uri.replace(
            quote_plus(self.password) if self.password else 'PASSWORD',
            'XXXXXXXXXXXX'))
    return connection_uri
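# A minimal sketch of the shape CONNECTION_URIS is expected to have for the method above:
# a mapping of database type -> proxy/public -> URI format strings whose named placeholders
# match the .format() call in _generate_connection_uri(). The template strings below are
# illustrative assumptions, not the hook's actual values.
CONNECTION_URIS_SKETCH = {
    'postgres': {
        'proxy': {
            'tcp': "postgresql://{user}:{password}@127.0.0.1:{proxy_port}/{database}",
            'socket': "postgresql://{user}:{password}@{socket_path}/{database}",
        },
        'public': {
            'ssl': "postgresql://{user}:{password}@{public_ip}:{public_port}/{database}?"
                   "sslcert={client_cert_file}&sslkey={client_key_file}"
                   "&sslrootcert={server_ca_file}",
            'non-ssl': "postgresql://{user}:{password}@{public_ip}:{public_port}/{database}",
        },
    },
}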
def _validate_inputs(self):
    super(CloudSqlInstanceImportOperator, self)._validate_inputs()
    if not self.body:
        raise AirflowException("The required parameter 'body' is empty")
def _validate_inputs(self):
    for attr_name in self.REQUIRED_ATTRIBUTES:
        if not getattr(self, attr_name):
            raise AirflowException('Empty parameter: {}'.format(attr_name))
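# Hypothetical illustration of the REQUIRED_ATTRIBUTES pattern above: a class lists the
# attribute names it requires and _validate_inputs() rejects any that are empty. The class
# and attribute names below are made up for the example.
from airflow.exceptions import AirflowException


class ExampleValidatedOperator:
    REQUIRED_ATTRIBUTES = ('project_id', 'instance')

    def __init__(self, project_id, instance):
        self.project_id = project_id
        self.instance = instance

    def _validate_inputs(self):
        for attr_name in self.REQUIRED_ATTRIBUTES:
            if not getattr(self, attr_name):
                raise AirflowException('Empty parameter: {}'.format(attr_name))


ExampleValidatedOperator("my-project", "my-instance")._validate_inputs()  # passes
# ExampleValidatedOperator("my-project", "")._validate_inputs()  # raises: Empty parameter: instance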
    ELASTICSEARCH_END_OF_LOG_MARK: str = conf.get('elasticsearch', 'END_OF_LOG_MARK')
    ELASTICSEARCH_WRITE_STDOUT: str = conf.get('elasticsearch', 'WRITE_STDOUT')
    ELASTICSEARCH_JSON_FORMAT: str = conf.get('elasticsearch', 'JSON_FORMAT')
    ELASTICSEARCH_JSON_FIELDS: str = conf.get('elasticsearch', 'JSON_FIELDS')

    ELASTIC_REMOTE_HANDLERS: Dict[str, Dict[str, str]] = {
        'task': {
            'class': 'airflow.utils.log.es_task_handler.ElasticsearchTaskHandler',
            'formatter': 'airflow',
            'base_log_folder': str(os.path.expanduser(BASE_LOG_FOLDER)),
            'log_id_template': ELASTICSEARCH_LOG_ID_TEMPLATE,
            'filename_template': FILENAME_TEMPLATE,
            'end_of_log_mark': ELASTICSEARCH_END_OF_LOG_MARK,
            'host': ELASTICSEARCH_HOST,
            'write_stdout': ELASTICSEARCH_WRITE_STDOUT,
            'json_format': ELASTICSEARCH_JSON_FORMAT,
            'json_fields': ELASTICSEARCH_JSON_FIELDS
        },
    }

    DEFAULT_LOGGING_CONFIG['handlers'].update(ELASTIC_REMOTE_HANDLERS)
else:
    raise AirflowException(
        "Incorrect remote log configuration. Please check the configuration of option 'host' in "
        "section 'elasticsearch' if you are using Elasticsearch. In the other case, "
        "'remote_base_log_folder' option in 'core' section.")
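# The handler above is driven by the [elasticsearch] section of airflow.cfg. A hedged sketch
# of the corresponding configuration, with option names inferred from the conf.get() calls
# above and values that are purely illustrative:
#
#   [elasticsearch]
#   host = http://localhost:9200
#   log_id_template = {dag_id}-{task_id}-{execution_date}-{try_number}
#   end_of_log_mark = end_of_log
#   write_stdout = False
#   json_format = False
#   json_fields = asctime, filename, lineno, levelname, message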
def _validate_inputs(self):
    super(CloudSqlInstanceDatabaseDeleteOperator, self)._validate_inputs()
    if not self.database:
        raise AirflowException(
            "The required parameter 'database' is empty")
def _validate_inputs(self):
    if not self.filter:
        raise AirflowException(
            "The required parameter 'filter' is empty or None")
def _validate_inputs(self):
    if not self.operation_name:
        raise AirflowException(
            "The required parameter 'operation_name' is empty or None")
def __init__(self, body):
    if not body:
        raise AirflowException(
            "The required parameter 'body' is empty or None")
    self.body = body
def _validate_inputs(self):
    TransferJobValidator(body=self.body).validate_body()
    if not self.job_name:
        raise AirflowException(
            "The required parameter 'job_name' is empty or None")
def _restrict_empty_body(self):
    if not self.body:
        raise AirflowException(
            "The required parameter 'body' is empty or None")
def terminate_cluster(self, clusterId):
    try:
        self.emr_connection.terminate_job_flows(JobFlowIds=[clusterId])
    except Exception:
        self.logger.error("Error deleting the cluster", exc_info=True)
        raise AirflowException("Failed to terminate the EMR cluster")
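# Hedged usage sketch for terminate_cluster() above: it assumes self.emr_connection is a
# boto3 EMR client (terminate_job_flows is the real boto3 call) and self.logger is a
# standard logger. The wrapper class name and region are illustrative assumptions.
import logging

import boto3

from airflow.exceptions import AirflowException


class ExampleEmrManager:
    def __init__(self, region_name='us-east-1'):
        self.emr_connection = boto3.client('emr', region_name=region_name)
        self.logger = logging.getLogger(__name__)

    def terminate_cluster(self, clusterId):
        try:
            self.emr_connection.terminate_job_flows(JobFlowIds=[clusterId])
        except Exception:
            self.logger.error("Error deleting the cluster", exc_info=True)
            raise AirflowException("Failed to terminate the EMR cluster")


# ExampleEmrManager().terminate_cluster('j-EXAMPLECLUSTERID')  # would call the EMR API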
def _raise_ex_unable_to_determine_name(self):
    raise AirflowException(
        "Unable to determine the {label} name. Please either set the name directly in the {label} "
        "object or provide the `location` and `{id_label}` parameters.".format(
            label=self.label, id_label=self.id_label))
def _check_for_error(response: Dict) -> None:
    if "error" in response:
        raise AirflowException(response)
def run_ssh_command_and_return_output(self, command) -> (int, str):
    """
    Open an SSH connection and execute a command.

    Returns the exit status and the output from stdout.
    """
    # Copied from ssh_operator.py. It's not reusable from there.
    try:
        if self.ssh_conn_id:
            if self.ssh_hook and isinstance(self.ssh_hook, SSHHook):
                self.log.info(
                    "ssh_conn_id is ignored when ssh_hook is provided.")
            else:
                self.log.info("ssh_hook is not provided or invalid. "
                              "Trying ssh_conn_id to create SSHHook.")
                self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id,
                                        timeout=self.timeout)

        if not self.ssh_hook:
            raise AirflowException(
                "Cannot operate without ssh_hook or ssh_conn_id.")

        if not command:
            raise AirflowException("SSH command not specified. Aborting.")

        with self.ssh_hook.get_conn() as ssh_client:
            self.log.info("Running command: %s", command)

            # set timeout taken as params
            stdin, stdout, stderr = ssh_client.exec_command(
                command=command,
                get_pty=False,
                timeout=self.timeout,
            )
            # get channels
            channel = stdout.channel

            # closing stdin
            stdin.close()
            channel.shutdown_write()

            agg_stdout = b''
            agg_stderr = b''

            # capture any initial output in case channel is closed already
            stdout_buffer_length = len(stdout.channel.in_buffer)
            if stdout_buffer_length > 0:
                agg_stdout += stdout.channel.recv(stdout_buffer_length)

            # read from both stdout and stderr
            while not channel.closed or \
                    channel.recv_ready() or \
                    channel.recv_stderr_ready():
                readq, _, _ = select([channel], [], [], self.timeout)
                for c in readq:
                    if c.recv_ready():
                        line = stdout.channel.recv(len(c.in_buffer))
                        agg_stdout += line
                        self.log.info(line.decode('utf-8').strip('\n'))
                    if c.recv_stderr_ready():
                        line = stderr.channel.recv_stderr(
                            len(c.in_stderr_buffer))
                        agg_stderr += line
                        self.log.warning(line.decode('utf-8').strip('\n'))
                if stdout.channel.exit_status_ready() \
                        and not stderr.channel.recv_stderr_ready() \
                        and not stdout.channel.recv_ready():
                    stdout.channel.shutdown_read()
                    stdout.channel.close()
                    break

            stdout.close()
            stderr.close()

            exit_status = stdout.channel.recv_exit_status()

            return exit_status, agg_stdout.decode('utf-8')

    except EOFError:
        raise
    except Exception as e:
        raise AirflowException(
            "PBS Job Completion sensor error: {0}".format(str(e)))
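# Hedged usage sketch: the method above appears to belong to a PBS job-completion sensor,
# so its poke() might run a qstat-style command over SSH and inspect the exit status. The
# sensor's pbs_job_id attribute and the qstat command below are assumptions for illustration.
def poke(self, context):
    exit_status, output = self.run_ssh_command_and_return_output(
        "qstat {0}".format(self.pbs_job_id))
    # qstat typically exits non-zero once the job has left the queue, which this
    # sketch treats as "job finished".
    self.log.info("qstat exit status: %s", exit_status)
    return exit_status != 0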
def get_conn(self) -> paramiko.SSHClient:
    """Return SSH connection."""
    self._load_connection_config()
    if not self.project_id:
        self.project_id = self._compute_hook.project_id

    missing_fields = [
        k for k in ["instance_name", "zone", "project_id"] if not getattr(self, k)
    ]
    if missing_fields:
        raise AirflowException(
            f"Required parameters are missing: {missing_fields}. These parameters can be passed "
            "either as keyword parameters or as extra fields in the Airflow connection definition; "
            "neither is currently set."
        )

    self.log.info(
        "Connecting to instance: instance_name=%s, user=%s, zone=%s, "
        "use_internal_ip=%s, use_iap_tunnel=%s, use_os_login=%s",
        self.instance_name,
        self.user,
        self.zone,
        self.use_internal_ip,
        self.use_iap_tunnel,
        self.use_oslogin,
    )
    if not self.hostname:
        hostname = self._compute_hook.get_instance_address(
            zone=self.zone,
            resource_id=self.instance_name,
            project_id=self.project_id,
            use_internal_ip=self.use_internal_ip or self.use_iap_tunnel,
        )
    else:
        hostname = self.hostname

    privkey, pubkey = self._generate_ssh_key(self.user)
    if self.use_oslogin:
        user = self._authorize_os_login(pubkey)
    else:
        user = self.user
        self._authorize_compute_engine_instance_metadata(pubkey)

    proxy_command = None
    if self.use_iap_tunnel:
        proxy_command_args = [
            'gcloud',
            'compute',
            'start-iap-tunnel',
            str(self.instance_name),
            '22',
            '--listen-on-stdin',
            f'--project={self.project_id}',
            f'--zone={self.zone}',
            '--verbosity=warning',
        ]
        proxy_command = " ".join(shlex.quote(arg) for arg in proxy_command_args)

    sshclient = self._connect_to_instance(user, hostname, privkey, proxy_command)
    return sshclient
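# Hedged usage sketch for the hook method above, assuming it belongs to the Compute Engine
# SSH hook (ComputeEngineSSHHook) and that its constructor exposes the fields get_conn()
# reads: instance_name, zone, project_id, use_iap_tunnel, use_oslogin. The instance, zone,
# project, and command values are illustrative.
from airflow.providers.google.cloud.hooks.compute_ssh import ComputeEngineSSHHook
from airflow.providers.ssh.operators.ssh import SSHOperator

run_remote_command = SSHOperator(
    task_id="run_remote_command",
    ssh_hook=ComputeEngineSSHHook(
        instance_name="my-vm",            # assumed instance name
        zone="europe-west1-b",            # assumed zone
        project_id="my-gcp-project",      # assumed project
        use_iap_tunnel=True,
        use_oslogin=False,
    ),
    command="echo hello",
)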
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to,
                           location=self.location)

    if not self.schema_fields:
        if self.schema_object and self.source_format != 'DATASTORE_BACKUP':
            gcs_hook = GCSHook(
                google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to)
            schema_fields = json.loads(
                gcs_hook.download(self.bucket, self.schema_object).decode("utf-8"))
        elif self.schema_object is None and self.autodetect is False:
            raise AirflowException(
                'At least one of `schema_fields`, '
                '`schema_object`, or `autodetect` must be passed.')
        else:
            schema_fields = None
    else:
        schema_fields = self.schema_fields

    source_uris = [
        'gs://{}/{}'.format(self.bucket, source_object)
        for source_object in self.source_objects
    ]

    conn = bq_hook.get_conn()
    cursor = conn.cursor()

    if self.external_table:
        cursor.create_external_table(
            external_project_dataset_table=self.destination_project_dataset_table,
            schema_fields=schema_fields,
            source_uris=source_uris,
            source_format=self.source_format,
            compression=self.compression,
            skip_leading_rows=self.skip_leading_rows,
            field_delimiter=self.field_delimiter,
            max_bad_records=self.max_bad_records,
            quote_character=self.quote_character,
            ignore_unknown_values=self.ignore_unknown_values,
            allow_quoted_newlines=self.allow_quoted_newlines,
            allow_jagged_rows=self.allow_jagged_rows,
            encoding=self.encoding,
            src_fmt_configs=self.src_fmt_configs,
            encryption_configuration=self.encryption_configuration)
    else:
        cursor.run_load(
            destination_project_dataset_table=self.destination_project_dataset_table,
            schema_fields=schema_fields,
            source_uris=source_uris,
            source_format=self.source_format,
            autodetect=self.autodetect,
            create_disposition=self.create_disposition,
            skip_leading_rows=self.skip_leading_rows,
            write_disposition=self.write_disposition,
            field_delimiter=self.field_delimiter,
            max_bad_records=self.max_bad_records,
            quote_character=self.quote_character,
            ignore_unknown_values=self.ignore_unknown_values,
            allow_quoted_newlines=self.allow_quoted_newlines,
            allow_jagged_rows=self.allow_jagged_rows,
            encoding=self.encoding,
            schema_update_options=self.schema_update_options,
            src_fmt_configs=self.src_fmt_configs,
            time_partitioning=self.time_partitioning,
            cluster_fields=self.cluster_fields,
            encryption_configuration=self.encryption_configuration)

    if self.max_id_key:
        cursor.execute('SELECT MAX({}) FROM {}'.format(
            self.max_id_key, self.destination_project_dataset_table))
        row = cursor.fetchone()
        max_id = row[0] if row[0] else 0
        self.log.info(
            'Loaded BQ data with max %s.%s=%s',
            self.destination_project_dataset_table, self.max_id_key, max_id)
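# Hedged usage sketch for the execute() above, assuming it belongs to a GCS-to-BigQuery
# load operator whose constructor mirrors the attributes read here (bucket, source_objects,
# destination_project_dataset_table, schema_object, source_format, write_disposition,
# max_id_key, ...). The import path is the classic contrib one and may differ in newer
# Airflow versions; bucket, object, and table names are illustrative.
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator

load_events = GoogleCloudStorageToBigQueryOperator(
    task_id="load_events",
    bucket="my-bucket",                                   # assumed GCS bucket
    source_objects=["events/2020-01-01/*.csv"],           # assumed source objects
    destination_project_dataset_table="my-project.analytics.events",
    schema_object="schemas/events.json",                  # JSON schema stored in GCS
    source_format="CSV",
    write_disposition="WRITE_TRUNCATE",
    max_id_key="event_id",                                # logs MAX(event_id) after load
)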