def _list_query_executions(
    self,
    max_results: int,
    work_group: Optional[str],
    next_token: Optional[str] = None,
) -> Tuple[str, List[Dict[str, Any]]]:
    """List recent query executions and hydrate them in one batch call.

    Returns a tuple of ``(next_token, query_execution_dicts)``; the list is
    empty when the listing returned no execution ids.
    """
    list_request = self._build_list_query_executions_request(
        max_results, work_group, next_token
    )
    list_response = retry_api_call(
        self.connection._client.list_query_executions,
        config=self._retry_config,
        logger=_logger,
        **list_request,
    )
    token = list_response.get("NextToken", None)
    execution_ids = list_response.get("QueryExecutionIds", None)
    if not execution_ids:
        return token, []
    batch_response = retry_api_call(
        self.connection._client.batch_get_query_execution,
        config=self._retry_config,
        logger=_logger,
        QueryExecutionIds=execution_ids,
    )
    return token, batch_response.get("QueryExecutions", [])
def _cancel(self, query_id: str) -> None:
    """Stop the Athena query execution identified by ``query_id``.

    Raises:
        OperationalError: if the stop_query_execution call fails.
    """
    try:
        retry_api_call(
            self._connection.client.stop_query_execution,
            config=self._retry_config,
            logger=_logger,
            QueryExecutionId=query_id,
        )
    except Exception as e:
        _logger.exception("Failed to cancel query.")
        raise OperationalError(*e.args) from e
def _cancel(self, query_id):
    """Stop the Athena query execution identified by `query_id`."""
    try:
        retry_api_call(
            self._connection.client.stop_query_execution,
            config=self._retry_config,
            logger=_logger,
            QueryExecutionId=query_id,
        )
    except Exception as e:
        _logger.exception('Failed to cancel query.')
        raise_from(OperationalError(*e.args), e)
def _cancel(self, query_id):
    """Stop the query execution, retrying with this cursor's retry settings."""
    try:
        retry_api_call(
            self._connection.stop_query_execution,
            exceptions=self.retry_exceptions,
            attempt=self.retry_attempt,
            multiplier=self.retry_multiplier,
            max_delay=self.retry_max_delay,
            exp_base=self.retry_exponential_base,
            logger=_logger,
            QueryExecutionId=query_id,
        )
    except Exception as e:
        _logger.exception('Failed to cancel query.')
        raise_from(OperationalError(*e.args), e)
def _list_query_executions(
    self,
    max_results: Optional[int] = None,
    work_group: Optional[str] = None,
    next_token: Optional[str] = None,
) -> Tuple[Optional[str], List[AthenaQueryExecution]]:
    """List query executions and return ``(next_token, executions)``.

    Raises:
        OperationalError: if the list_query_executions call fails.
    """
    request = self._build_list_query_executions_request(
        max_results=max_results, work_group=work_group, next_token=next_token
    )
    try:
        response = retry_api_call(
            self.connection._client.list_query_executions,
            config=self._retry_config,
            logger=_logger,
            **request,
        )
    except Exception as e:
        _logger.exception("Failed to list query executions.")
        raise OperationalError(*e.args) from e
    token = response.get("NextToken", None)
    execution_ids = response.get("QueryExecutionIds", None)
    if not execution_ids:
        return token, []
    return token, self._batch_get_query_execution(execution_ids)
def _execute(
    self,
    operation: str,
    parameters: Optional[Dict[str, Any]] = None,
    work_group: Optional[str] = None,
    s3_staging_dir: Optional[str] = None,
    cache_size: int = 0,
) -> str:
    """Format and start a query execution, reusing a cached result if found.

    Returns the query execution id (a cached one when an identical,
    successful DML run is found within ``cache_size`` recent executions).

    Raises:
        DatabaseError: if start_query_execution fails.
    """
    query = self._formatter.format(operation, parameters)
    _logger.debug(query)
    request = self._build_start_query_execution_request(
        query, work_group, s3_staging_dir)
    cached_id = self._find_previous_query_id(query, work_group, cache_size)
    if cached_id is not None:
        return cached_id
    try:
        response = retry_api_call(
            self._connection.client.start_query_execution,
            config=self._retry_config,
            logger=_logger,
            **request)
    except Exception as e:
        _logger.exception("Failed to execute query.")
        raise DatabaseError(*e.args) from e
    return response.get("QueryExecutionId", None)
def to_parquet(
    df: "DataFrame",
    bucket_name: str,
    prefix: str,
    retry_config: RetryConfig,
    session_kwargs: Dict[str, Any],
    client_kwargs: Dict[str, Any],
    compression: Optional[str] = None,
    flavor: str = "spark",
) -> str:
    """Serialize *df* to Parquet and upload it to S3 under ``prefix``.

    Args:
        df: The pandas DataFrame to serialize.
        bucket_name: Target S3 bucket name.
        prefix: Key prefix; a random UUID is appended to form the object key.
        retry_config: Retry configuration passed to ``retry_api_call``.
        session_kwargs: Keyword arguments for constructing the boto3 Session.
        client_kwargs: Keyword arguments for the S3 resource client.
        compression: Parquet compression codec (annotation fixed to
            ``Optional[str]``: the default is ``None``, i.e. no compression).
        flavor: Parquet compatibility flavor passed to ``pyarrow``.

    Returns:
        The ``s3://bucket/key`` URI of the uploaded object.
    """
    import pyarrow as pa
    from pyarrow import parquet as pq

    session = Session(**session_kwargs)
    client = session.resource("s3", **client_kwargs)
    bucket = client.Bucket(bucket_name)
    table = pa.Table.from_pandas(df)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf, compression=compression, flavor=flavor)
    # Random suffix avoids key collisions between concurrent uploads.
    response = retry_api_call(
        bucket.put_object,
        config=retry_config,
        Body=buf.getvalue().to_pybytes(),
        Key=prefix + str(uuid.uuid4()),
    )
    return "s3://{0}/{1}".format(response.bucket_name, response.key)
def __fetch(self, next_token=None):
    """Fetch one page of query results and return the raw API response.

    Raises:
        ProgrammingError: if there is no query id or the query did not succeed.
        OperationalError: if the get_query_results call fails.
    """
    if not self._query_execution.query_id:
        raise ProgrammingError('QueryExecutionId is none or empty.')
    if self._query_execution.state != 'SUCCEEDED':
        raise ProgrammingError('QueryExecutionState is not SUCCEEDED.')
    request = {
        'QueryExecutionId': self._query_execution.query_id,
        'MaxResults': self._arraysize,
    }
    if next_token:
        request['NextToken'] = next_token
    try:
        page = retry_api_call(
            self._connection.client.get_query_results,
            exceptions=self.retry_exceptions,
            attempt=self.retry_attempt,
            multiplier=self.retry_multiplier,
            max_delay=self.retry_max_delay,
            exp_base=self.retry_exponential_base,
            logger=_logger,
            **request)
    except Exception as e:
        _logger.exception('Failed to fetch result set.')
        raise_from(OperationalError(*e.args), e)
    else:
        return page
def _as_pandas(self):
    """Download the query result CSV from S3 and load it into a DataFrame.

    An empty object (ContentLength of 0) yields an empty DataFrame so that
    DDL statements still work.
    """
    import pandas as pd
    if not self.output_location:
        raise ProgrammingError('OutputLocation is none or empty.')
    bucket, key = self._parse_output_location(self.output_location)
    try:
        response = retry_api_call(
            self._client.get_object,
            config=self._retry_config,
            logger=_logger,
            Bucket=bucket,
            Key=key)
    except Exception as e:
        _logger.exception('Failed to download csv.')
        raise_from(OperationalError(*e.args), e)
    else:
        if not response['ContentLength']:
            # Allow empty response so DDL can be used
            return pd.DataFrame()
        frame = pd.read_csv(
            io.BytesIO(response['Body'].read()),
            dtype=self.dtypes,
            converters=self.converters,
            parse_dates=self.parse_dates,
            infer_datetime_format=True)
        return self._trunc_date(frame)
def _execute(self, operation, parameters=None, work_group=None,
             s3_staging_dir=None, cache_size=0, concurrent_cache=True):
    """Format and start a query execution, reusing a cached query id if found.

    Returns the query execution id; raises DatabaseError when the
    start_query_execution call fails.
    """
    query = self._formatter.format(operation, parameters)
    _logger.debug(query)
    request = self._build_start_query_execution_request(
        query, work_group, s3_staging_dir)
    cached_id = self._find_previous_query_id(
        query, work_group, cache_size, concurrent_cache)
    if cached_id is not None:
        return cached_id
    try:
        response = retry_api_call(
            self._connection.client.start_query_execution,
            config=self._retry_config,
            logger=_logger,
            **request)
    except Exception as e:
        _logger.exception('Failed to execute query.')
        raise_from(DatabaseError(*e.args), e)
    return response.get('QueryExecutionId', None)
def cancel(self):
    """Cancel the running query execution.

    Raises:
        ProgrammingError: if no query execution id has been set.
        OperationalError: if the stop_query_execution call fails.
    """
    if not self._query_id:
        raise ProgrammingError('QueryExecutionId is none or empty.')
    try:
        request = {'QueryExecutionId': self._query_id}
        retry_api_call(self._connection.stop_query_execution,
                       exceptions=self.retry_exceptions,
                       attempt=self.retry_attempt,
                       multiplier=self.retry_multiplier,
                       # NOTE(review): `retry_max_deply` looks like a typo for
                       # `retry_max_delay` -- confirm the attribute name on the
                       # enclosing class before renaming.
                       max_delay=self.retry_max_deply,
                       exp_base=self.retry_exponential_base,
                       logger=_logger,
                       **request)
    except Exception as e:
        _logger.exception('Failed to cancel query.')
        raise_from(OperationalError(*e.args), e)
def _list_table_metadata(
    self,
    max_results: Optional[int] = None,
    catalog_name: Optional[str] = None,
    schema_name: Optional[str] = None,
    expression: Optional[str] = None,
    next_token: Optional[str] = None,
) -> Tuple[Optional[str], List[AthenaTableMetadata]]:
    """List Athena table metadata, returning ``(next_token, metadata_list)``.

    Raises:
        OperationalError: if the list_table_metadata call fails.
    """
    request = self._build_list_table_metadata_request(
        max_results=max_results,
        catalog_name=catalog_name,
        schema_name=schema_name,
        expression=expression,
        next_token=next_token,
    )
    try:
        response = retry_api_call(
            self.connection._client.list_table_metadata,
            config=self._retry_config,
            logger=_logger,
            **request,
        )
    except Exception as e:
        _logger.exception("Failed to list table metadata.")
        raise OperationalError(*e.args) from e
    metadata = [
        AthenaTableMetadata({"TableMetadata": row})
        for row in response.get("TableMetadataList", [])
    ]
    return response.get("NextToken", None), metadata
def _find_previous_query_id(self, query, work_group, cache_size):
    """Search recent query executions for a successful run of the same query.

    Pages through up to ``cache_size`` recent executions (50 per API call,
    the AWS maximum) and returns the id of the first successful DML
    execution whose query text matches ``query`` exactly, or None when no
    match is found. Any failure while checking the cache is swallowed on
    purpose: caching is best-effort and must never break query execution.
    """
    query_id = None
    try:
        next_token = None
        while cache_size > 0:
            n = min(cache_size, 50)  # 50 is max allowed by AWS API
            cache_size -= n
            request = self._build_list_query_executions_request(
                n, work_group, next_token
            )
            response = retry_api_call(
                self.connection._client.list_query_executions,
                config=self._retry_config,
                logger=_logger,
                **request
            )
            query_ids = response.get("QueryExecutionIds", None)
            if not query_ids:
                break  # no queries left to check
            next_token = response.get("NextToken", None)
            # Hydrate the listed ids in one batch call.
            query_executions = retry_api_call(
                self.connection._client.batch_get_query_execution,
                config=self._retry_config,
                logger=_logger,
                QueryExecutionIds=query_ids,
            ).get("QueryExecutions", [])
            # Only an exact text match on a successful DML statement counts
            # as a cache hit.
            for execution in query_executions:
                if (
                    execution["Query"] == query
                    and execution["Status"]["State"]
                    == AthenaQueryExecution.STATE_SUCCEEDED
                    and execution["StatementType"]
                    == AthenaQueryExecution.STATEMENT_TYPE_DML
                ):
                    query_id = execution["QueryExecutionId"]
                    break
            # Stop when a hit was found or there are no further pages.
            if query_id or next_token is None:
                break
    except Exception:
        _logger.warning("Failed to check the cache. Moving on without cache.")
    return query_id
def _get_query_execution(self, query_id):
    """Fetch the execution state for `query_id` as an AthenaQueryExecution."""
    try:
        response = retry_api_call(
            self._connection.client.get_query_execution,
            config=self._retry_config,
            logger=_logger,
            QueryExecutionId=query_id)
    except Exception as e:
        _logger.exception('Failed to get query execution.')
        raise_from(OperationalError(*e.args), e)
    else:
        return AthenaQueryExecution(response)
def _get_query_execution(self, query_id: str) -> AthenaQueryExecution:
    """Fetch the execution state for ``query_id``.

    Raises:
        OperationalError: if the get_query_execution call fails.
    """
    try:
        response = retry_api_call(
            self._connection.client.get_query_execution,
            config=self._retry_config,
            logger=_logger,
            QueryExecutionId=query_id,
        )
    except Exception as e:
        _logger.exception("Failed to get query execution.")
        raise OperationalError(*e.args) from e
    return AthenaQueryExecution(response)
def _poll(self):
    """Poll get_query_execution until the query reaches a terminal state.

    On SUCCEEDED, caches timing, statistics, and output-location attributes
    on this cursor and returns. On FAILED or CANCELLED, raises
    OperationalError with the state-change reason. Otherwise sleeps
    ``_poll_interval`` seconds and polls again.

    Raises:
        ProgrammingError: if no query execution id has been set.
        OperationalError: on API failure or a FAILED/CANCELLED query.
        DataError: if the response is missing expected keys.
    """
    if not self._query_id:
        raise ProgrammingError('QueryExecutionId is none or empty.')
    while True:
        try:
            request = {'QueryExecutionId': self._query_id}
            response = retry_api_call(self._connection.get_query_execution,
                                      exceptions=self.retry_exceptions,
                                      attempt=self.retry_attempt,
                                      multiplier=self.retry_multiplier,
                                      # NOTE(review): `retry_max_deply` looks
                                      # like a typo for `retry_max_delay` --
                                      # confirm the attribute name on the class.
                                      max_delay=self.retry_max_deply,
                                      exp_base=self.retry_exponential_base,
                                      logger=_logger,
                                      **request)
        except Exception as e:
            _logger.exception('Failed to poll query result.')
            raise_from(OperationalError(*e.args), e)
        else:
            query_execution = response.get('QueryExecution', None)
            if not query_execution:
                raise DataError('KeyError `QueryExecution`')
            status = query_execution.get('Status', None)
            if not status:
                raise DataError('KeyError `Status`')
            state = status.get('State', None)
            if state == 'SUCCEEDED':
                # Cache timing/statistics for later property access.
                self._completion_date_time = status.get(
                    'CompletionDateTime', None)
                self._submission_date_time = status.get(
                    'SubmissionDateTime', None)
                statistics = query_execution.get('Statistics', {})
                self._data_scanned_in_bytes = statistics.get(
                    'DataScannedInBytes', None)
                self._execution_time_in_millis = statistics.get(
                    'EngineExecutionTimeInMillis', None)
                result_conf = query_execution.get('ResultConfiguration', {})
                self._output_location = result_conf.get(
                    'OutputLocation', None)
                break
            elif state == 'FAILED':
                raise OperationalError(
                    status.get('StateChangeReason', None))
            elif state == 'CANCELLED':
                raise OperationalError(
                    status.get('StateChangeReason', None))
            else:
                # Query still running; wait before polling again.
                time.sleep(self._poll_interval)
def _as_pandas(self) -> "DataFrame":
    """Download the query result file from S3 and load it into a DataFrame.

    ``.txt`` output is parsed as tab-separated with no header row (column
    names come from the cursor description); anything else is parsed as a
    headered CSV. An empty object yields an empty DataFrame so that DDL
    statements still work.

    Raises:
        ProgrammingError: if no output location is set.
        OperationalError: if the S3 download fails.
    """
    import pandas as pd

    if not self.output_location:
        raise ProgrammingError("OutputLocation is none or empty.")
    bucket, key = parse_output_location(self.output_location)
    try:
        response = retry_api_call(
            self._client.get_object,
            config=self._retry_config,
            logger=_logger,
            Bucket=bucket,
            Key=key,
        )
    except Exception as e:
        _logger.exception("Failed to download csv.")
        raise OperationalError(*e.args) from e
    else:
        length = response["ContentLength"]
        if length:
            if self.output_location.endswith(".txt"):
                # Tab-separated output has no header row; take column names
                # from the cursor description.
                sep = "\t"
                header = None
                description = self.description if self.description else []
                names: Optional[Any] = [d[0] for d in description]
            else:  # csv format
                sep = ","
                header = 0
                names = None
            df = pd.read_csv(
                response["Body"],
                sep=sep,
                header=header,
                names=names,
                dtype=self.dtypes,
                converters=self.converters,
                parse_dates=self.parse_dates,
                infer_datetime_format=True,
                skip_blank_lines=False,
                keep_default_na=self._keep_default_na,
                na_values=self._na_values,
                quoting=self._quoting,
                **self._kwargs,
            )
            df = self._trunc_date(df)
        else:  # Allow empty response
            df = pd.DataFrame()
        return df
def _execute(self, operation, parameters=None):
    """Format the operation and start a query execution, returning its id."""
    sql = self._formatter.format(operation, parameters)
    _logger.debug(sql)
    request = self._build_start_query_execution_request(sql)
    try:
        start_response = retry_api_call(
            self._connection.client.start_query_execution,
            config=self._retry_config,
            logger=_logger,
            **request)
    except Exception as e:
        _logger.exception('Failed to execute query.')
        raise_from(DatabaseError(*e.args), e)
    return start_response.get('QueryExecutionId', None)
def _query_execution(self, query_id):
    """Retrieve and wrap the query execution for `query_id`."""
    try:
        response = retry_api_call(
            self._connection.get_query_execution,
            exceptions=self.retry_exceptions,
            attempt=self.retry_attempt,
            multiplier=self.retry_multiplier,
            max_delay=self.retry_max_delay,
            exp_base=self.retry_exponential_base,
            logger=_logger,
            QueryExecutionId=query_id)
    except Exception as e:
        _logger.exception('Failed to get query execution.')
        raise_from(OperationalError(*e.args), e)
    else:
        return AthenaQueryExecution(response)
def _batch_get_query_execution(
    self, query_ids: List[str]
) -> List[AthenaQueryExecution]:
    """Hydrate a batch of query ids into AthenaQueryExecution objects.

    Raises:
        OperationalError: if the batch_get_query_execution call fails.
    """
    try:
        response = retry_api_call(
            self.connection._client.batch_get_query_execution,
            config=self._retry_config,
            logger=_logger,
            QueryExecutionIds=query_ids,
        )
    except Exception as e:
        _logger.exception("Failed to batch get query execution.")
        raise OperationalError(*e.args) from e
    executions = response.get("QueryExecutions", [])
    return [AthenaQueryExecution({"QueryExecution": row}) for row in executions]
def _execute(self, operation, parameters=None):
    """Format the operation and start a query execution, returning its id."""
    sql = self._formatter.format(operation, parameters)
    _logger.debug(sql)
    request = self._build_start_query_execution_request(sql)
    try:
        start_response = retry_api_call(
            self._connection.start_query_execution,
            exceptions=self.retry_exceptions,
            attempt=self.retry_attempt,
            multiplier=self.retry_multiplier,
            max_delay=self.retry_max_delay,
            exp_base=self.retry_exponential_base,
            logger=_logger,
            **request)
    except Exception as e:
        _logger.exception('Failed to execute query.')
        raise_from(DatabaseError(*e.args), e)
    return start_response.get('QueryExecutionId', None)
def _as_pandas(self):
    """Download the result CSV from S3 and parse it into a DataFrame."""
    import pandas as pd
    if not self.output_location:
        raise ProgrammingError('OutputLocation is none or empty.')
    bucket, key = self._parse_output_location(self.output_location)
    try:
        response = retry_api_call(
            self._client.get_object, Bucket=bucket, Key=key)
    except Exception as e:
        _logger.exception('Failed to download csv.')
        raise_from(OperationalError(*e.args), e)
    else:
        frame = pd.read_csv(
            io.BytesIO(response['Body'].read()),
            dtype=self._dtypes(),
            converters=self._converters(),
            parse_dates=self._parse_dates(),
            infer_datetime_format=True)
        return self._trunc_date(frame)
def __fetch(self, next_token=None):
    """Fetch one page of query results and return the raw API response.

    Raises:
        ProgrammingError: if there is no query id or the query did not succeed.
        OperationalError: if the get_query_results call fails.
    """
    if not self._query_execution.query_id:
        raise ProgrammingError('QueryExecutionId is none or empty.')
    if self._query_execution.state != AthenaQueryExecution.STATE_SUCCEEDED:
        raise ProgrammingError('QueryExecutionState is not SUCCEEDED.')
    request = {
        'QueryExecutionId': self._query_execution.query_id,
        'MaxResults': self._arraysize,
    }
    if next_token:
        request['NextToken'] = next_token
    try:
        page = retry_api_call(
            self._connection.client.get_query_results,
            config=self._retry_config,
            logger=_logger,
            **request)
    except Exception as e:
        _logger.exception('Failed to fetch result set.')
        raise_from(OperationalError(*e.args), e)
    else:
        return page
def _pre_fetch(self):
    """Fetch the first page of results and initialize metadata/result state.

    Raises:
        ProgrammingError: if no query execution id has been set.
        OperationalError: if the get_query_results call fails.
    """
    if not self._query_id:
        raise ProgrammingError('QueryExecutionId is none or empty.')
    try:
        request = {
            'QueryExecutionId': self._query_id,
            'MaxResults': self._arraysize,
        }
        response = retry_api_call(self._connection.get_query_results,
                                  exceptions=self.retry_exceptions,
                                  attempt=self.retry_attempt,
                                  multiplier=self.retry_multiplier,
                                  # NOTE(review): `retry_max_deply` looks like
                                  # a typo for `retry_max_delay` -- confirm the
                                  # attribute name on the class.
                                  max_delay=self.retry_max_deply,
                                  exp_base=self.retry_exponential_base,
                                  logger=_logger,
                                  **request)
    except Exception as e:
        _logger.exception('Failed to fetch result set.')
        raise_from(OperationalError(*e.args), e)
    else:
        # Populate column metadata first, then buffer the first result page.
        self._process_meta_data(response)
        self._process_result_set(response)
def _as_pandas(self):
    """Download the query result file from S3 and parse it into a DataFrame.

    ``.txt`` output is tab-separated with no header row (column names come
    from the cursor description); anything else is parsed as a headered CSV.
    An empty object yields an empty DataFrame so DDL statements still work.

    Raises:
        ProgrammingError: if no output location is set.
        OperationalError: if the S3 download fails.
    """
    import pandas as pd
    if not self.output_location:
        raise ProgrammingError('OutputLocation is none or empty.')
    bucket, key = parse_output_location(self.output_location)
    try:
        response = retry_api_call(self._client.get_object,
                                  config=self._retry_config,
                                  logger=_logger,
                                  Bucket=bucket,
                                  Key=key)
    except Exception as e:
        _logger.exception('Failed to download csv.')
        raise_from(OperationalError(*e.args), e)
    else:
        length = response['ContentLength']
        if length:
            if self.output_location.endswith('.txt'):
                # Tab-separated output has no header row; take column names
                # from the cursor description.
                sep = '\t'
                header = None
                names = [d[0] for d in self.description]
            else:  # csv format
                sep = ','
                header = 0
                names = None
            df = pd.read_csv(io.BytesIO(response['Body'].read()),
                             sep=sep,
                             header=header,
                             names=names,
                             dtype=self.dtypes,
                             converters=self.converters,
                             parse_dates=self.parse_dates,
                             infer_datetime_format=True,
                             skip_blank_lines=False)
            df = self._trunc_date(df)
        else:  # Allow empty response
            df = pd.DataFrame()
        return df
def _get_table_metadata(
    self,
    table_name: str,
    catalog_name: Optional[str] = None,
    schema_name: Optional[str] = None,
) -> AthenaTableMetadata:
    """Fetch table metadata, defaulting catalog/schema to this cursor's own.

    Raises:
        OperationalError: if the get_table_metadata call fails.
    """
    request = {
        "CatalogName": catalog_name or self._catalog_name,
        "DatabaseName": schema_name or self._schema_name,
        "TableName": table_name,
    }
    try:
        response = retry_api_call(
            self._connection.client.get_table_metadata,
            config=self._retry_config,
            logger=_logger,
            **request,
        )
    except Exception as e:
        _logger.exception("Failed to get table metadata.")
        raise OperationalError(*e.args) from e
    return AthenaTableMetadata(response)
def __fetch(self, next_token: Optional[str] = None):
    """Fetch one page of results for the current (successful) execution.

    Raises:
        ProgrammingError: if there is no query id, the query did not
            succeed, or the result set is already closed.
        OperationalError: if the get_query_results call fails.
    """
    if not self.query_id:
        raise ProgrammingError("QueryExecutionId is none or empty.")
    if self.state != AthenaQueryExecution.STATE_SUCCEEDED:
        raise ProgrammingError("QueryExecutionState is not SUCCEEDED.")
    if self.is_closed:
        raise ProgrammingError("AthenaResultSet is closed.")
    request = {
        "QueryExecutionId": self.query_id,
        "MaxResults": self._arraysize,
    }
    if next_token:
        request["NextToken"] = next_token
    # cast() is a typing no-op at runtime; safe outside the try block.
    connection = cast("Connection", self._connection)
    try:
        response = retry_api_call(
            connection.client.get_query_results,
            config=self._retry_config,
            logger=_logger,
            **request,
        )
    except Exception as e:
        _logger.exception("Failed to fetch result set.")
        raise OperationalError(*e.args) from e
    return response