def _poll(self):
    """Block until the running query reaches a terminal state.

    Calls GetQueryExecution in a retry loop. On SUCCEEDED, caches the
    timing/statistics/output-location fields on self and returns; on
    FAILED or CANCELLED raises OperationalError with the state-change
    reason; otherwise sleeps ``self._poll_interval`` and polls again.

    :raises ProgrammingError: if no query id has been set.
    :raises DataError: if the API response lacks expected keys.
    :raises OperationalError: on API failure or a failed/cancelled query.
    """
    if not self._query_id:
        raise ProgrammingError('QueryExecutionId is none or empty.')
    while True:
        try:
            request = {'QueryExecutionId': self._query_id}
            response = retry_api_call(self._connection.get_query_execution,
                                      exceptions=self.retry_exceptions,
                                      attempt=self.retry_attempt,
                                      multiplier=self.retry_multiplier,
                                      # NOTE(review): 'deply' looks misspelled, but L18/L24 use the
                                      # same spelling — confirm against the attribute's definition
                                      # before renaming.
                                      max_delay=self.retry_max_deply,
                                      exp_base=self.retry_exponential_base,
                                      logger=_logger,
                                      **request)
        except Exception as e:
            _logger.exception('Failed to poll query result.')
            raise_from(OperationalError(*e.args), e)
        else:
            query_execution = response.get('QueryExecution', None)
            if not query_execution:
                raise DataError('KeyError `QueryExecution`')
            status = query_execution.get('Status', None)
            if not status:
                raise DataError('KeyError `Status`')
            state = status.get('State', None)
            if state == 'SUCCEEDED':
                # Terminal success: cache execution metadata for later access.
                self._completion_date_time = status.get(
                    'CompletionDateTime', None)
                self._submission_date_time = status.get(
                    'SubmissionDateTime', None)
                statistics = query_execution.get('Statistics', {})
                self._data_scanned_in_bytes = statistics.get(
                    'DataScannedInBytes', None)
                self._execution_time_in_millis = statistics.get(
                    'EngineExecutionTimeInMillis', None)
                result_conf = query_execution.get('ResultConfiguration', {})
                self._output_location = result_conf.get(
                    'OutputLocation', None)
                break
            elif state == 'FAILED':
                raise OperationalError(
                    status.get('StateChangeReason', None))
            elif state == 'CANCELLED':
                raise OperationalError(
                    status.get('StateChangeReason', None))
            else:
                # Still QUEUED/RUNNING — wait before polling again.
                time.sleep(self._poll_interval)
def _list_table_metadata(
    self,
    max_results: Optional[int] = None,
    catalog_name: Optional[str] = None,
    schema_name: Optional[str] = None,
    expression: Optional[str] = None,
    next_token: Optional[str] = None,
) -> Tuple[Optional[str], List[AthenaTableMetadata]]:
    """Fetch one page of table metadata via ListTableMetadata.

    Returns a ``(next_token, tables)`` pair; ``next_token`` is None when
    there are no further pages.

    :raises OperationalError: if the API call fails after retries.
    """
    request = self._build_list_table_metadata_request(
        max_results=max_results,
        catalog_name=catalog_name,
        schema_name=schema_name,
        expression=expression,
        next_token=next_token,
    )
    try:
        response = retry_api_call(
            self.connection._client.list_table_metadata,
            config=self._retry_config,
            logger=_logger,
            **request,
        )
    except Exception as e:
        _logger.exception("Failed to list table metadata.")
        raise OperationalError(*e.args) from e
    else:
        token = response.get("NextToken", None)
        tables = [
            AthenaTableMetadata({"TableMetadata": row})
            for row in response.get("TableMetadataList", [])
        ]
        return token, tables
def execute(
    self,
    operation: str,
    parameters: Optional[Dict[str, Any]] = None,
    work_group: Optional[str] = None,
    s3_staging_dir: Optional[str] = None,
    cache_size: int = 0,
    cache_expiration_time: int = 0,
):
    """Run *operation* synchronously and attach a result set.

    :raises OperationalError: if the query ends in any state other than
        SUCCEEDED.
    :return: self, so calls can be chained.
    """
    self._reset_state()
    self.query_id = self._execute(
        operation,
        parameters=parameters,
        work_group=work_group,
        s3_staging_dir=s3_staging_dir,
        cache_size=cache_size,
        cache_expiration_time=cache_expiration_time,
    )
    # Block until Athena reports a terminal state.
    execution = self._poll(self.query_id)
    if execution.state != AthenaQueryExecution.STATE_SUCCEEDED:
        raise OperationalError(execution.state_change_reason)
    self.result_set = self._result_set_class(
        self._connection,
        self._converter,
        execution,
        self.arraysize,
        self._retry_config,
    )
    return self
def execute(
    self,
    operation,
    parameters=None,
    work_group=None,
    s3_staging_dir=None,
    cache_size=0,
):
    """Run *operation* synchronously and attach an AthenaResultSet.

    :raises OperationalError: if the query ends in any state other than
        SUCCEEDED.
    :return: self, so calls can be chained.
    """
    self._reset_state()
    self._query_id = self._execute(
        operation,
        parameters=parameters,
        work_group=work_group,
        s3_staging_dir=s3_staging_dir,
        cache_size=cache_size,
    )
    # Block until Athena reports a terminal state.
    execution = self._poll(self._query_id)
    if execution.state != AthenaQueryExecution.STATE_SUCCEEDED:
        raise OperationalError(execution.state_change_reason)
    self._result_set = AthenaResultSet(
        self._connection,
        self._converter,
        execution,
        self.arraysize,
        self._retry_config,
    )
    return self
def _list_query_executions(
    self,
    max_results: Optional[int] = None,
    work_group: Optional[str] = None,
    next_token: Optional[str] = None,
) -> Tuple[Optional[str], List[AthenaQueryExecution]]:
    """Fetch one page of query-execution ids and resolve them to objects.

    Returns a ``(next_token, executions)`` pair; the list is empty when
    the page contains no ids.

    :raises OperationalError: if the API call fails after retries.
    """
    request = self._build_list_query_executions_request(
        max_results=max_results, work_group=work_group, next_token=next_token
    )
    try:
        response = retry_api_call(
            self.connection._client.list_query_executions,
            config=self._retry_config,
            logger=_logger,
            **request,
        )
    except Exception as e:
        _logger.exception("Failed to list query executions.")
        raise OperationalError(*e.args) from e
    else:
        token = response.get("NextToken", None)
        query_ids = response.get("QueryExecutionIds", None)
        executions = self._batch_get_query_execution(query_ids) if query_ids else []
        return token, executions
def _as_pandas(self):
    """Download the query's CSV output from S3 and load it as a DataFrame.

    :raises ProgrammingError: if no output location is set.
    :raises OperationalError: if the S3 download fails after retries.
    :return: a pandas DataFrame; empty when the S3 object has zero length.
    """
    import pandas as pd
    if not self.output_location:
        raise ProgrammingError('OutputLocation is none or empty.')
    bucket, key = self._parse_output_location(self.output_location)
    try:
        response = retry_api_call(self._client.get_object,
                                  config=self._retry_config,
                                  logger=_logger,
                                  Bucket=bucket,
                                  Key=key)
    except Exception as e:
        _logger.exception('Failed to download csv.')
        raise_from(OperationalError(*e.args), e)
    else:
        length = response['ContentLength']
        if length:
            df = pd.read_csv(io.BytesIO(response['Body'].read()),
                             dtype=self.dtypes,
                             converters=self.converters,
                             parse_dates=self.parse_dates,
                             infer_datetime_format=True)
            # Post-process date columns after parsing.
            df = self._trunc_date(df)
        else:
            # Allow empty response so DDL can be used
            df = pd.DataFrame()
        return df
def __fetch(self, next_token=None):
    """Retrieve one page of rows via GetQueryResults.

    :param next_token: pagination token from the previous page, if any.
    :raises ProgrammingError: if no query id is set or the query did not
        succeed.
    :raises OperationalError: if the API call fails after retries.
    :return: the raw GetQueryResults response dict.
    """
    execution = self._query_execution
    if not execution.query_id:
        raise ProgrammingError('QueryExecutionId is none or empty.')
    if execution.state != 'SUCCEEDED':
        raise ProgrammingError('QueryExecutionState is not SUCCEEDED.')
    request = {
        'QueryExecutionId': execution.query_id,
        'MaxResults': self._arraysize,
    }
    if next_token:
        request['NextToken'] = next_token
    try:
        response = retry_api_call(
            self._connection.client.get_query_results,
            exceptions=self.retry_exceptions,
            attempt=self.retry_attempt,
            multiplier=self.retry_multiplier,
            max_delay=self.retry_max_delay,
            exp_base=self.retry_exponential_base,
            logger=_logger,
            **request)
    except Exception as e:
        _logger.exception('Failed to fetch result set.')
        raise_from(OperationalError(*e.args), e)
    else:
        return response
def execute(
    self,
    operation,
    parameters=None,
    work_group=None,
    s3_staging_dir=None,
    cache_size=0,
    keep_default_na=False,
    na_values=None,
    quoting=1,
):
    """Run *operation* synchronously and attach a pandas result set.

    The ``keep_default_na``/``na_values``/``quoting`` arguments are passed
    through to the result set's CSV parsing.

    :raises OperationalError: if the query ends in any state other than
        SUCCEEDED.
    :return: self, so calls can be chained.
    """
    self._reset_state()
    self._query_id = self._execute(
        operation,
        parameters=parameters,
        work_group=work_group,
        s3_staging_dir=s3_staging_dir,
        cache_size=cache_size,
    )
    # Block until Athena reports a terminal state.
    execution = self._poll(self._query_id)
    if execution.state != AthenaQueryExecution.STATE_SUCCEEDED:
        raise OperationalError(execution.state_change_reason)
    self._result_set = AthenaPandasResultSet(
        connection=self._connection,
        converter=self._converter,
        query_execution=execution,
        arraysize=self.arraysize,
        retry_config=self._retry_config,
        keep_default_na=keep_default_na,
        na_values=na_values,
        quoting=quoting,
    )
    return self
def _cancel(self, query_id: str) -> None:
    """Ask Athena to stop the execution identified by *query_id*.

    :raises OperationalError: if the API call fails after retries.
    """
    try:
        retry_api_call(
            self._connection.client.stop_query_execution,
            config=self._retry_config,
            logger=_logger,
            QueryExecutionId=query_id,
        )
    except Exception as e:
        _logger.exception("Failed to cancel query.")
        raise OperationalError(*e.args) from e
def _cancel(self, query_id):
    """Ask Athena to stop the execution identified by *query_id*.

    :raises OperationalError: if the API call fails after retries.
    """
    try:
        retry_api_call(
            self._connection.client.stop_query_execution,
            config=self._retry_config,
            logger=_logger,
            QueryExecutionId=query_id,
        )
    except Exception as e:
        _logger.exception('Failed to cancel query.')
        raise_from(OperationalError(*e.args), e)
def execute(self, operation, parameters=None):
    """Run *operation* synchronously and attach an AthenaResultSet.

    :raises OperationalError: if the query ends in any state other than
        SUCCEEDED.
    :return: self, so calls can be chained.
    """
    self._reset_state()
    self._query_id = self._execute(operation, parameters)
    # Block until Athena reports a terminal state.
    execution = self._poll(self._query_id)
    if execution.state != AthenaQueryExecution.STATE_SUCCEEDED:
        raise OperationalError(execution.state_change_reason)
    self._result_set = AthenaResultSet(
        self._connection,
        self._converter,
        execution,
        self.arraysize,
        self._retry_config,
    )
    return self
def execute(
    self: _T,
    operation: str,
    parameters: Optional[Dict[str, Any]] = None,
    work_group: Optional[str] = None,
    s3_staging_dir: Optional[str] = None,
    cache_size: int = 0,
    cache_expiration_time: int = 0,
    keep_default_na: bool = False,
    na_values: Optional[Iterable[str]] = ("", ),
    quoting: int = 1,
    **kwargs,
) -> _T:
    """Run *operation* synchronously and attach a pandas result set.

    When the cursor was created with the unload option, the statement is
    wrapped in an UNLOAD to Parquet/Snappy under ``s3_staging_dir`` and
    the resulting location is handed to the result set.

    :raises OperationalError: if the query ends in any state other than
        SUCCEEDED.
    :return: self, so calls can be chained.
    """
    self._reset_state()
    if self._unload:
        # Fall back to the cursor-level staging dir when none is given.
        s3_staging_dir = s3_staging_dir if s3_staging_dir else self._s3_staging_dir
        # NOTE(review): `assert` is stripped under `python -O`; callers may
        # rely on AssertionError here, so leaving as-is — confirm intent.
        assert (
            s3_staging_dir
        ), "If the unload option is used, s3_staging_dir is required."
        operation, unload_location = self._formatter.wrap_unload(
            operation,
            s3_staging_dir=s3_staging_dir,
            format_=AthenaFileFormat.FILE_FORMAT_PARQUET,
            compression=AthenaCompression.COMPRESSION_SNAPPY,
        )
    else:
        unload_location = None
    self.query_id = self._execute(
        operation,
        parameters=parameters,
        work_group=work_group,
        s3_staging_dir=s3_staging_dir,
        cache_size=cache_size,
        cache_expiration_time=cache_expiration_time,
    )
    # Block until Athena reports a terminal state.
    query_execution = self._poll(self.query_id)
    if query_execution.state == AthenaQueryExecution.STATE_SUCCEEDED:
        self.result_set = AthenaPandasResultSet(
            connection=self._connection,
            converter=self._converter,
            query_execution=query_execution,
            arraysize=self.arraysize,
            retry_config=self._retry_config,
            keep_default_na=keep_default_na,
            na_values=na_values,
            quoting=quoting,
            unload=self._unload,
            unload_location=unload_location,
            **kwargs,
        )
    else:
        raise OperationalError(query_execution.state_change_reason)
    return self
def execute(self, operation, parameters=None):
    """Run *operation* synchronously and attach an AthenaResultSet.

    :raises OperationalError: if the query ends in any state other than
        SUCCEEDED.
    :return: self, so calls can be chained (consistent with the other
        ``execute`` implementations in this file; previously returned
        None, which broke fluent usage such as
        ``cursor.execute(sql).fetchall()``).
    """
    self._reset_state()
    self._query_id = self._execute(operation, parameters)
    # Block until Athena reports a terminal state.
    query_execution = self._poll(self._query_id)
    if query_execution.state == AthenaQueryExecution.STATE_SUCCEEDED:
        self._result_set = AthenaResultSet(
            self._connection, self._converter, query_execution,
            self.arraysize, self.retry_exceptions, self.retry_attempt,
            self.retry_multiplier, self.retry_max_delay,
            self.retry_exponential_base)
    else:
        raise OperationalError(query_execution.state_change_reason)
    return self
def _get_query_execution(self, query_id: str) -> AthenaQueryExecution:
    """Fetch execution metadata for *query_id* via GetQueryExecution.

    :raises OperationalError: if the API call fails after retries.
    """
    try:
        response = retry_api_call(
            self._connection.client.get_query_execution,
            config=self._retry_config,
            logger=_logger,
            QueryExecutionId=query_id,
        )
    except Exception as e:
        _logger.exception("Failed to get query execution.")
        raise OperationalError(*e.args) from e
    else:
        return AthenaQueryExecution(response)
def _get_query_execution(self, query_id):
    """Fetch execution metadata for *query_id* via GetQueryExecution.

    :raises OperationalError: if the API call fails after retries.
    """
    try:
        response = retry_api_call(
            self._connection.client.get_query_execution,
            config=self._retry_config,
            logger=_logger,
            QueryExecutionId=query_id,
        )
    except Exception as e:
        _logger.exception('Failed to get query execution.')
        raise_from(OperationalError(*e.args), e)
    else:
        return AthenaQueryExecution(response)
def _cancel(self, query_id):
    """Ask Athena to stop the execution identified by *query_id*.

    :raises OperationalError: if the API call fails after retries.
    """
    try:
        retry_api_call(self._connection.stop_query_execution,
                       exceptions=self.retry_exceptions,
                       attempt=self.retry_attempt,
                       multiplier=self.retry_multiplier,
                       max_delay=self.retry_max_delay,
                       exp_base=self.retry_exponential_base,
                       logger=_logger,
                       QueryExecutionId=query_id)
    except Exception as e:
        _logger.exception('Failed to cancel query.')
        raise_from(OperationalError(*e.args), e)
def _as_pandas(self) -> "DataFrame":
    """Download the query output from S3 and load it as a DataFrame.

    ``.txt`` output (CTAS/utility statements) is parsed as headerless
    TSV with column names taken from the cursor description; anything
    else is parsed as headered CSV.

    :raises ProgrammingError: if no output location is set.
    :raises OperationalError: if the S3 download fails after retries.
    :return: a pandas DataFrame; empty when the S3 object has zero length.
    """
    import pandas as pd
    if not self.output_location:
        raise ProgrammingError("OutputLocation is none or empty.")
    bucket, key = parse_output_location(self.output_location)
    try:
        response = retry_api_call(
            self._client.get_object,
            config=self._retry_config,
            logger=_logger,
            Bucket=bucket,
            Key=key,
        )
    except Exception as e:
        _logger.exception("Failed to download csv.")
        raise OperationalError(*e.args) from e
    else:
        length = response["ContentLength"]
        if length:
            if self.output_location.endswith(".txt"):
                # Tab-separated, no header row; names come from metadata.
                sep = "\t"
                header = None
                # Guard: description may be empty/None for some statements.
                description = self.description if self.description else []
                names: Optional[Any] = [d[0] for d in description]
            else:  # csv format
                sep = ","
                header = 0
                names = None
            df = pd.read_csv(
                response["Body"],
                sep=sep,
                header=header,
                names=names,
                dtype=self.dtypes,
                converters=self.converters,
                parse_dates=self.parse_dates,
                infer_datetime_format=True,
                skip_blank_lines=False,
                keep_default_na=self._keep_default_na,
                na_values=self._na_values,
                quoting=self._quoting,
                **self._kwargs,
            )
            # Post-process date columns after parsing.
            df = self._trunc_date(df)
        else:  # Allow empty response
            df = pd.DataFrame()
        return df
def cancel(self):
    """Ask Athena to stop the currently running query.

    :raises ProgrammingError: if no query id has been set.
    :raises OperationalError: if the API call fails after retries.
    """
    if not self._query_id:
        raise ProgrammingError('QueryExecutionId is none or empty.')
    try:
        request = {'QueryExecutionId': self._query_id}
        retry_api_call(self._connection.stop_query_execution,
                       exceptions=self.retry_exceptions,
                       attempt=self.retry_attempt,
                       multiplier=self.retry_multiplier,
                       # NOTE(review): 'deply' looks misspelled, but L1/L24 use the
                       # same spelling — confirm against the attribute's definition
                       # before renaming.
                       max_delay=self.retry_max_deply,
                       exp_base=self.retry_exponential_base,
                       logger=_logger,
                       **request)
    except Exception as e:
        _logger.exception('Failed to cancel query.')
        raise_from(OperationalError(*e.args), e)
def _query_execution(self, query_id):
    """Fetch execution metadata for *query_id* via GetQueryExecution.

    :raises OperationalError: if the API call fails after retries.
    """
    try:
        response = retry_api_call(self._connection.get_query_execution,
                                  exceptions=self.retry_exceptions,
                                  attempt=self.retry_attempt,
                                  multiplier=self.retry_multiplier,
                                  max_delay=self.retry_max_delay,
                                  exp_base=self.retry_exponential_base,
                                  logger=_logger,
                                  QueryExecutionId=query_id)
    except Exception as e:
        _logger.exception('Failed to get query execution.')
        raise_from(OperationalError(*e.args), e)
    else:
        return AthenaQueryExecution(response)
def execute(self, sql, bindings=None):
    """DB-API style execute that runs *sql* through the wrapped cursor.

    :param bindings: optional positional parameters; interpolated here
        because Presto does not forward bindings itself.
    :raises OperationalError: if the query ends in any state other than
        SUCCEEDED.
    :return: self, so calls can be chained.
    """
    if bindings is not None:
        # Presto doesn't pass bindings along, so escape and interpolate
        # the values into the statement ourselves.
        escaped = tuple(self._escape_value(b) for b in bindings)
        sql = sql % escaped
    query_id, future = self._cursor.execute(sql)
    result_set = future.result()
    if result_set.state != AthenaQueryExecution.STATE_SUCCEEDED:
        raise OperationalError(result_set.state_change_reason)
    self._query_id = query_id
    self._state = result_set.state
    self._fetch_result = result_set.fetchall()
    return self
def _batch_get_query_execution(
    self, query_ids: List[str]
) -> List[AthenaQueryExecution]:
    """Resolve up to a batch of query ids via BatchGetQueryExecution.

    :raises OperationalError: if the API call fails after retries.
    """
    try:
        response = retry_api_call(
            self.connection._client.batch_get_query_execution,
            config=self._retry_config,
            logger=_logger,
            QueryExecutionIds=query_ids,
        )
    except Exception as e:
        _logger.exception("Failed to batch get query execution.")
        raise OperationalError(*e.args) from e
    else:
        raw_executions = response.get("QueryExecutions", [])
        return [
            AthenaQueryExecution({"QueryExecution": raw})
            for raw in raw_executions
        ]
def _as_pandas(self):
    """Download the query's CSV output from S3 and load it as a DataFrame.

    :raises ProgrammingError: if no output location is set.
    :raises OperationalError: if the S3 download fails after retries.
    :return: a pandas DataFrame; empty when the S3 object has zero length.
    """
    import pandas as pd
    if not self.output_location:
        raise ProgrammingError('OutputLocation is none or empty.')
    bucket, key = self._parse_output_location(self.output_location)
    try:
        response = retry_api_call(self._client.get_object,
                                  Bucket=bucket,
                                  Key=key)
    except Exception as e:
        _logger.exception('Failed to download csv.')
        raise_from(OperationalError(*e.args), e)
    else:
        length = response['ContentLength']
        if length:
            df = pd.read_csv(io.BytesIO(response['Body'].read()),
                             dtype=self._dtypes(),
                             converters=self._converters(),
                             parse_dates=self._parse_dates(),
                             infer_datetime_format=True)
            # Post-process date columns after parsing.
            df = self._trunc_date(df)
        else:
            # Allow empty response so DDL can be used: DDL statements produce
            # a zero-length object, and pandas would raise EmptyDataError on
            # it (consistent with the other _as_pandas implementations here).
            df = pd.DataFrame()
        return df
def __fetch(self, next_token=None):
    """Retrieve one page of rows via GetQueryResults.

    :param next_token: pagination token from the previous page, if any.
    :raises ProgrammingError: if no query id is set or the query did not
        succeed.
    :raises OperationalError: if the API call fails after retries.
    :return: the raw GetQueryResults response dict.
    """
    execution = self._query_execution
    if not execution.query_id:
        raise ProgrammingError('QueryExecutionId is none or empty.')
    if execution.state != AthenaQueryExecution.STATE_SUCCEEDED:
        raise ProgrammingError('QueryExecutionState is not SUCCEEDED.')
    request = {
        'QueryExecutionId': execution.query_id,
        'MaxResults': self._arraysize,
    }
    if next_token:
        request['NextToken'] = next_token
    try:
        response = retry_api_call(
            self._connection.client.get_query_results,
            config=self._retry_config,
            logger=_logger,
            **request)
    except Exception as e:
        _logger.exception('Failed to fetch result set.')
        raise_from(OperationalError(*e.args), e)
    else:
        return response
def _pre_fetch(self):
    """Fetch the first page of results and populate metadata and rows.

    :raises ProgrammingError: if no query id has been set.
    :raises OperationalError: if the API call fails after retries.
    """
    if not self._query_id:
        raise ProgrammingError('QueryExecutionId is none or empty.')
    try:
        request = {
            'QueryExecutionId': self._query_id,
            'MaxResults': self._arraysize,
        }
        response = retry_api_call(self._connection.get_query_results,
                                  exceptions=self.retry_exceptions,
                                  attempt=self.retry_attempt,
                                  multiplier=self.retry_multiplier,
                                  # NOTE(review): 'deply' looks misspelled, but L1/L18 use the
                                  # same spelling — confirm against the attribute's definition
                                  # before renaming.
                                  max_delay=self.retry_max_deply,
                                  exp_base=self.retry_exponential_base,
                                  logger=_logger,
                                  **request)
    except Exception as e:
        _logger.exception('Failed to fetch result set.')
        raise_from(OperationalError(*e.args), e)
    else:
        # First page carries the column metadata as well as the rows.
        self._process_meta_data(response)
        self._process_result_set(response)
def execute(
    self,
    operation: str,
    parameters: Optional[Dict[str, Any]] = None,
    work_group: Optional[str] = None,
    s3_staging_dir: Optional[str] = None,
    cache_size: int = 0,
    cache_expiration_time: int = 0,
    keep_default_na: bool = False,
    na_values: Optional[Iterable[str]] = ("", ),
    quoting: int = 1,
    **kwargs,
):
    """Run *operation* synchronously and attach a pandas result set.

    The ``keep_default_na``/``na_values``/``quoting`` arguments and any
    extra keyword arguments are passed through to the result set's CSV
    parsing.

    :raises OperationalError: if the query ends in any state other than
        SUCCEEDED.
    :return: self, so calls can be chained.
    """
    self._reset_state()
    self.query_id = self._execute(
        operation,
        parameters=parameters,
        work_group=work_group,
        s3_staging_dir=s3_staging_dir,
        cache_size=cache_size,
        cache_expiration_time=cache_expiration_time,
    )
    # Block until Athena reports a terminal state.
    execution = self._poll(self.query_id)
    if execution.state != AthenaQueryExecution.STATE_SUCCEEDED:
        raise OperationalError(execution.state_change_reason)
    self.result_set = AthenaPandasResultSet(
        connection=self._connection,
        converter=self._converter,
        query_execution=execution,
        arraysize=self.arraysize,
        retry_config=self._retry_config,
        keep_default_na=keep_default_na,
        na_values=na_values,
        quoting=quoting,
        **kwargs,
    )
    return self
def _as_pandas(self):
    """Download the query output from S3 and load it as a DataFrame.

    ``.txt`` output is parsed as headerless TSV with column names taken
    from the cursor description; anything else is parsed as headered CSV.

    :raises ProgrammingError: if no output location is set.
    :raises OperationalError: if the S3 download fails after retries.
    :return: a pandas DataFrame; empty when the S3 object has zero length.
    """
    import pandas as pd
    if not self.output_location:
        raise ProgrammingError('OutputLocation is none or empty.')
    bucket, key = parse_output_location(self.output_location)
    try:
        response = retry_api_call(self._client.get_object,
                                  config=self._retry_config,
                                  logger=_logger,
                                  Bucket=bucket,
                                  Key=key)
    except Exception as e:
        _logger.exception('Failed to download csv.')
        raise_from(OperationalError(*e.args), e)
    else:
        length = response['ContentLength']
        if length:
            if self.output_location.endswith('.txt'):
                sep = '\t'
                header = None
                # Guard against a None description instead of raising
                # TypeError on iteration (matches the sibling
                # implementation that guards the same way).
                description = self.description if self.description else []
                names = [d[0] for d in description]
            else:  # csv format
                sep = ','
                header = 0
                names = None
            df = pd.read_csv(io.BytesIO(response['Body'].read()),
                             sep=sep,
                             header=header,
                             names=names,
                             dtype=self.dtypes,
                             converters=self.converters,
                             parse_dates=self.parse_dates,
                             infer_datetime_format=True,
                             skip_blank_lines=False)
            # Post-process date columns after parsing.
            df = self._trunc_date(df)
        else:  # Allow empty response
            df = pd.DataFrame()
        return df
def _get_table_metadata(
    self,
    table_name: str,
    catalog_name: Optional[str] = None,
    schema_name: Optional[str] = None,
) -> AthenaTableMetadata:
    """Fetch metadata for a single table via GetTableMetadata.

    Falls back to the connection-level catalog/schema names when none
    are supplied.

    :raises OperationalError: if the API call fails after retries.
    """
    catalog = catalog_name if catalog_name else self._catalog_name
    schema = schema_name if schema_name else self._schema_name
    request = {
        "CatalogName": catalog,
        "DatabaseName": schema,
        "TableName": table_name,
    }
    try:
        response = retry_api_call(
            self._connection.client.get_table_metadata,
            config=self._retry_config,
            logger=_logger,
            **request,
        )
    except Exception as e:
        _logger.exception("Failed to get table metadata.")
        raise OperationalError(*e.args) from e
    else:
        return AthenaTableMetadata(response)
def __fetch(self, next_token: Optional[str] = None):
    """Retrieve one page of rows via GetQueryResults.

    :param next_token: pagination token from the previous page, if any.
    :raises ProgrammingError: if no query id is set, the query did not
        succeed, or this result set is closed.
    :raises OperationalError: if the API call fails after retries.
    :return: the raw GetQueryResults response dict.
    """
    if not self.query_id:
        raise ProgrammingError("QueryExecutionId is none or empty.")
    if self.state != AthenaQueryExecution.STATE_SUCCEEDED:
        raise ProgrammingError("QueryExecutionState is not SUCCEEDED.")
    if self.is_closed:
        raise ProgrammingError("AthenaResultSet is closed.")
    request = {
        "QueryExecutionId": self.query_id,
        "MaxResults": self._arraysize,
    }
    if next_token:
        request["NextToken"] = next_token
    try:
        connection = cast("Connection", self._connection)
        api = connection.client.get_query_results
        response = retry_api_call(
            api,
            config=self._retry_config,
            logger=_logger,
            **request,
        )
    except Exception as e:
        _logger.exception("Failed to fetch result set.")
        raise OperationalError(*e.args) from e
    else:
        return response