Example #1
 def _list_query_executions(
     self,
     max_results: int,
     work_group: Optional[str],
     next_token: Optional[str] = None,
 ) -> Tuple[str, List[Dict[str, Any]]]:
     request = self._build_list_query_executions_request(
         max_results, work_group, next_token
     )
     response = retry_api_call(
         self.connection._client.list_query_executions,
         config=self._retry_config,
         logger=_logger,
         **request
     )
     next_token = response.get("NextToken", None)
     query_ids = response.get("QueryExecutionIds", None)
     if not query_ids:
         return next_token, []
     response = retry_api_call(
         self.connection._client.batch_get_query_execution,
         config=self._retry_config,
         logger=_logger,
         QueryExecutionIds=query_ids,
     )
     return next_token, response.get("QueryExecutions", [])
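
Each of these snippets passes a pre-built retry configuration through config=self._retry_config. The following is a minimal, self-contained sketch of that pattern, assuming pyathena-style RetryConfig and retry_api_call helpers importable from pyathena.util with the parameters shown (names and defaults may differ across versions); the region and MaxResults value are placeholders.

import logging

import boto3
from pyathena.util import RetryConfig, retry_api_call

_logger = logging.getLogger(__name__)

client = boto3.client("athena", region_name="us-east-1")
retry_config = RetryConfig(
    exceptions=("ThrottlingException", "TooManyRequestsException"),
    attempt=5,           # give up after 5 tries
    multiplier=1,        # base backoff in seconds
    max_delay=100,       # cap on the exponential backoff
    exponential_base=2,  # backoff growth factor
)

# Retries list_query_executions on throttling errors before giving up.
response = retry_api_call(
    client.list_query_executions,
    config=retry_config,
    logger=_logger,
    MaxResults=50,
)
print(response.get("QueryExecutionIds", []))
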
Example #2
 def _cancel(self, query_id: str) -> None:
     request = {"QueryExecutionId": query_id}
     try:
         retry_api_call(self._connection.client.stop_query_execution,
                        config=self._retry_config,
                        logger=_logger,
                        **request)
     except Exception as e:
         _logger.exception("Failed to cancel query.")
         raise OperationalError(*e.args) from e
Example #3
 def _cancel(self, query_id):
     request = {'QueryExecutionId': query_id}
     try:
         retry_api_call(self._connection.client.stop_query_execution,
                        config=self._retry_config,
                        logger=_logger,
                        **request)
     except Exception as e:
         _logger.exception('Failed to cancel query.')
         raise_from(OperationalError(*e.args), e)
Example #4
 def _cancel(self, query_id):
     request = {'QueryExecutionId': query_id}
     try:
         retry_api_call(self._connection.stop_query_execution,
                        exceptions=self.retry_exceptions,
                        attempt=self.retry_attempt,
                        multiplier=self.retry_multiplier,
                        max_delay=self.retry_max_delay,
                        exp_base=self.retry_exponential_base,
                        logger=_logger,
                        **request)
     except Exception as e:
         _logger.exception('Failed to cancel query.')
         raise_from(OperationalError(*e.args), e)
Example #5
 def _list_query_executions(
     self,
     max_results: Optional[int] = None,
     work_group: Optional[str] = None,
     next_token: Optional[str] = None,
 ) -> Tuple[Optional[str], List[AthenaQueryExecution]]:
     request = self._build_list_query_executions_request(
         max_results=max_results, work_group=work_group, next_token=next_token
     )
     try:
         response = retry_api_call(
             self.connection._client.list_query_executions,
             config=self._retry_config,
             logger=_logger,
             **request
         )
     except Exception as e:
         _logger.exception("Failed to list query executions.")
         raise OperationalError(*e.args) from e
     else:
         next_token = response.get("NextToken", None)
         query_ids = response.get("QueryExecutionIds", None)
         if not query_ids:
             return next_token, []
         return next_token, self._batch_get_query_execution(query_ids)
Example #6
    def _execute(
        self,
        operation: str,
        parameters: Optional[Dict[str, Any]] = None,
        work_group: Optional[str] = None,
        s3_staging_dir: Optional[str] = None,
        cache_size: int = 0,
    ) -> str:
        query = self._formatter.format(operation, parameters)
        _logger.debug(query)

        request = self._build_start_query_execution_request(
            query, work_group, s3_staging_dir)
        query_id = self._find_previous_query_id(query, work_group, cache_size)
        if query_id is None:
            try:
                query_id = retry_api_call(
                    self._connection.client.start_query_execution,
                    config=self._retry_config,
                    logger=_logger,
                    **request).get("QueryExecutionId", None)
            except Exception as e:
                _logger.exception("Failed to execute query.")
                raise DatabaseError(*e.args) from e
        return query_id
Example #7
def to_parquet(
    df: "DataFrame",
    bucket_name: str,
    prefix: str,
    retry_config: RetryConfig,
    session_kwargs: Dict[str, Any],
    client_kwargs: Dict[str, Any],
    compression: Optional[str] = None,
    flavor: str = "spark",
) -> str:
    import pyarrow as pa
    from pyarrow import parquet as pq

    session = Session(**session_kwargs)
    client = session.resource("s3", **client_kwargs)
    bucket = client.Bucket(bucket_name)
    table = pa.Table.from_pandas(df)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf, compression=compression, flavor=flavor)
    response = retry_api_call(
        bucket.put_object,
        config=retry_config,
        Body=buf.getvalue().to_pybytes(),
        Key=prefix + str(uuid.uuid4()),
    )
    return "s3://{0}/{1}".format(response.bucket_name, response.key)
Example #8
 def __fetch(self, next_token=None):
     if not self._query_execution.query_id:
         raise ProgrammingError('QueryExecutionId is none or empty.')
     if self._query_execution.state != 'SUCCEEDED':
         raise ProgrammingError('QueryExecutionState is not SUCCEEDED.')
     request = {
         'QueryExecutionId': self._query_execution.query_id,
         'MaxResults': self._arraysize,
     }
     if next_token:
         request.update({'NextToken': next_token})
     try:
         response = retry_api_call(
             self._connection.client.get_query_results,
             exceptions=self.retry_exceptions,
             attempt=self.retry_attempt,
             multiplier=self.retry_multiplier,
             max_delay=self.retry_max_delay,
             exp_base=self.retry_exponential_base,
             logger=_logger,
             **request)
     except Exception as e:
         _logger.exception('Failed to fetch result set.')
         raise_from(OperationalError(*e.args), e)
     else:
         return response
Example #9
 def _as_pandas(self):
     import pandas as pd
     if not self.output_location:
         raise ProgrammingError('OutputLocation is none or empty.')
     bucket, key = self._parse_output_location(self.output_location)
     try:
         response = retry_api_call(self._client.get_object,
                                   config=self._retry_config,
                                   logger=_logger,
                                   Bucket=bucket,
                                   Key=key)
     except Exception as e:
         _logger.exception('Failed to download csv.')
         raise_from(OperationalError(*e.args), e)
     else:
         length = response['ContentLength']
         if length:
             df = pd.read_csv(io.BytesIO(response['Body'].read()),
                              dtype=self.dtypes,
                              converters=self.converters,
                              parse_dates=self.parse_dates,
                              infer_datetime_format=True)
             df = self._trunc_date(df)
         else:  # Allow empty response so DDL can be used
             df = pd.DataFrame()
         return df
Example #10
    def _execute(self,
                 operation,
                 parameters=None,
                 work_group=None,
                 s3_staging_dir=None,
                 cache_size=0,
                 concurrent_cache=True):
        query = self._formatter.format(operation, parameters)
        _logger.debug(query)

        request = self._build_start_query_execution_request(
            query, work_group, s3_staging_dir)
        query_id = self._find_previous_query_id(query, work_group, cache_size,
                                                concurrent_cache)
        if query_id is None:
            try:
                query_id = retry_api_call(
                    self._connection.client.start_query_execution,
                    config=self._retry_config,
                    logger=_logger,
                    **request).get('QueryExecutionId', None)
            except Exception as e:
                _logger.exception('Failed to execute query.')
                raise_from(DatabaseError(*e.args), e)
        return query_id
Example #11
 def cancel(self):
     if not self._query_id:
         raise ProgrammingError('QueryExecutionId is none or empty.')
     try:
         request = {'QueryExecutionId': self._query_id}
         retry_api_call(self._connection.stop_query_execution,
                        exceptions=self.retry_exceptions,
                        attempt=self.retry_attempt,
                        multiplier=self.retry_multiplier,
                        max_delay=self.retry_max_delay,
                        exp_base=self.retry_exponential_base,
                        logger=_logger,
                        **request)
     except Exception as e:
         _logger.exception('Failed to cancel query.')
         raise_from(OperationalError(*e.args), e)
Example #12
 def _list_table_metadata(
     self,
     max_results: Optional[int] = None,
     catalog_name: Optional[str] = None,
     schema_name: Optional[str] = None,
     expression: Optional[str] = None,
     next_token: Optional[str] = None,
 ) -> Tuple[Optional[str], List[AthenaTableMetadata]]:
     request = self._build_list_table_metadata_request(
         max_results=max_results,
         catalog_name=catalog_name,
         schema_name=schema_name,
         expression=expression,
         next_token=next_token,
     )
     try:
         response = retry_api_call(
             self.connection._client.list_table_metadata,
             config=self._retry_config,
             logger=_logger,
             **request
         )
     except Exception as e:
         _logger.exception("Failed to list table metadata.")
         raise OperationalError(*e.args) from e
     else:
         return response.get("NextToken", None), [
             AthenaTableMetadata({"TableMetadata": r})
             for r in response.get("TableMetadataList", [])
         ]
Example #13
 def _find_previous_query_id(self, query, work_group, cache_size):
     query_id = None
     try:
         next_token = None
         while cache_size > 0:
             n = min(cache_size, 50)  # 50 is max allowed by AWS API
             cache_size -= n
             request = self._build_list_query_executions_request(
                 n, work_group, next_token
             )
             response = retry_api_call(
                 self.connection._client.list_query_executions,
                 config=self._retry_config,
                 logger=_logger,
                 **request
             )
             query_ids = response.get("QueryExecutionIds", None)
             if not query_ids:
                 break  # no queries left to check
             next_token = response.get("NextToken", None)
             query_executions = retry_api_call(
                 self.connection._client.batch_get_query_execution,
                 config=self._retry_config,
                 logger=_logger,
                 QueryExecutionIds=query_ids,
             ).get("QueryExecutions", [])
             for execution in query_executions:
                 if (
                     execution["Query"] == query
                     and execution["Status"]["State"]
                     == AthenaQueryExecution.STATE_SUCCEEDED
                     and execution["StatementType"]
                     == AthenaQueryExecution.STATEMENT_TYPE_DML
                 ):
                     query_id = execution["QueryExecutionId"]
                     break
             if query_id or next_token is None:
                 break
     except Exception:
         _logger.warning("Failed to check the cache. Moving on without cache.")
     return query_id
Example #14
 def _get_query_execution(self, query_id):
     request = {'QueryExecutionId': query_id}
     try:
         response = retry_api_call(
             self._connection.client.get_query_execution,
             config=self._retry_config,
             logger=_logger,
             **request)
     except Exception as e:
         _logger.exception('Failed to get query execution.')
         raise_from(OperationalError(*e.args), e)
     else:
         return AthenaQueryExecution(response)
Example #15
 def _get_query_execution(self, query_id: str) -> AthenaQueryExecution:
     request = {"QueryExecutionId": query_id}
     try:
         response = retry_api_call(
             self._connection.client.get_query_execution,
             config=self._retry_config,
             logger=_logger,
             **request)
     except Exception as e:
         _logger.exception("Failed to get query execution.")
         raise OperationalError(*e.args) from e
     else:
         return AthenaQueryExecution(response)
Example #16
    def _poll(self):
        if not self._query_id:
            raise ProgrammingError('QueryExecutionId is none or empty.')
        while True:
            try:
                request = {'QueryExecutionId': self._query_id}
                response = retry_api_call(self._connection.get_query_execution,
                                          exceptions=self.retry_exceptions,
                                          attempt=self.retry_attempt,
                                          multiplier=self.retry_multiplier,
                                          max_delay=self.retry_max_delay,
                                          exp_base=self.retry_exponential_base,
                                          logger=_logger,
                                          **request)
            except Exception as e:
                _logger.exception('Failed to poll query result.')
                raise_from(OperationalError(*e.args), e)
            else:
                query_execution = response.get('QueryExecution', None)
                if not query_execution:
                    raise DataError('KeyError `QueryExecution`')
                status = query_execution.get('Status', None)
                if not status:
                    raise DataError('KeyError `Status`')

                state = status.get('State', None)
                if state == 'SUCCEEDED':
                    self._completion_date_time = status.get(
                        'CompletionDateTime', None)
                    self._submission_date_time = status.get(
                        'SubmissionDateTime', None)

                    statistics = query_execution.get('Statistics', {})
                    self._data_scanned_in_bytes = statistics.get(
                        'DataScannedInBytes', None)
                    self._execution_time_in_millis = statistics.get(
                        'EngineExecutionTimeInMillis', None)

                    result_conf = query_execution.get('ResultConfiguration',
                                                      {})
                    self._output_location = result_conf.get(
                        'OutputLocation', None)
                    break
                elif state in ('FAILED', 'CANCELLED'):
                    raise OperationalError(
                        status.get('StateChangeReason', None))
                else:
                    time.sleep(self._poll_interval)
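
For comparison, here is a stripped-down sketch of the same polling loop written directly against a boto3 Athena client, without the retry wrapper, to make the control flow easier to follow; the poll interval and error handling are simplified placeholders.

import time

import boto3

client = boto3.client("athena")

def wait_for_query(query_id: str, poll_interval: float = 1.0) -> dict:
    """Block until the query leaves QUEUED/RUNNING and return its final status."""
    while True:
        response = client.get_query_execution(QueryExecutionId=query_id)
        status = response["QueryExecution"]["Status"]
        state = status["State"]
        if state == "SUCCEEDED":
            return status
        if state in ("FAILED", "CANCELLED"):
            raise RuntimeError(status.get("StateChangeReason", state))
        time.sleep(poll_interval)
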
Example #17
    def _as_pandas(self) -> "DataFrame":
        import pandas as pd

        if not self.output_location:
            raise ProgrammingError("OutputLocation is none or empty.")
        bucket, key = parse_output_location(self.output_location)
        try:
            response = retry_api_call(
                self._client.get_object,
                config=self._retry_config,
                logger=_logger,
                Bucket=bucket,
                Key=key,
            )
        except Exception as e:
            _logger.exception("Failed to download csv.")
            raise OperationalError(*e.args) from e
        else:
            length = response["ContentLength"]
            if length:
                if self.output_location.endswith(".txt"):
                    sep = "\t"
                    header = None
                    description = self.description if self.description else []
                    names: Optional[Any] = [d[0] for d in description]
                else:  # csv format
                    sep = ","
                    header = 0
                    names = None
                df = pd.read_csv(
                    response["Body"],
                    sep=sep,
                    header=header,
                    names=names,
                    dtype=self.dtypes,
                    converters=self.converters,
                    parse_dates=self.parse_dates,
                    infer_datetime_format=True,
                    skip_blank_lines=False,
                    keep_default_na=self._keep_default_na,
                    na_values=self._na_values,
                    quoting=self._quoting,
                    **self._kwargs,
                )
                df = self._trunc_date(df)
            else:  # Allow empty response
                df = pd.DataFrame()
            return df
Example #18
    def _execute(self, operation, parameters=None):
        query = self._formatter.format(operation, parameters)
        _logger.debug(query)

        request = self._build_start_query_execution_request(query)
        try:
            response = retry_api_call(
                self._connection.client.start_query_execution,
                config=self._retry_config,
                logger=_logger,
                **request)
        except Exception as e:
            _logger.exception('Failed to execute query.')
            raise_from(DatabaseError(*e.args), e)
        else:
            return response.get('QueryExecutionId', None)
Example #19
 def _query_execution(self, query_id):
     request = {'QueryExecutionId': query_id}
     try:
         response = retry_api_call(self._connection.get_query_execution,
                                   exceptions=self.retry_exceptions,
                                   attempt=self.retry_attempt,
                                   multiplier=self.retry_multiplier,
                                   max_delay=self.retry_max_delay,
                                   exp_base=self.retry_exponential_base,
                                   logger=_logger,
                                   **request)
     except Exception as e:
         _logger.exception('Failed to get query execution.')
         raise_from(OperationalError(*e.args), e)
     else:
         return AthenaQueryExecution(response)
Example #20
 def _batch_get_query_execution(
     self, query_ids: List[str]
 ) -> List[AthenaQueryExecution]:
     try:
         response = retry_api_call(
             self.connection._client.batch_get_query_execution,
             config=self._retry_config,
             logger=_logger,
             QueryExecutionIds=query_ids,
         )
     except Exception as e:
         _logger.exception("Failed to batch get query execution.")
         raise OperationalError(*e.args) from e
     else:
         return [
             AthenaQueryExecution({"QueryExecution": r})
             for r in response.get("QueryExecutions", [])
         ]
Example #21
    def _execute(self, operation, parameters=None):
        query = self._formatter.format(operation, parameters)
        _logger.debug(query)

        request = self._build_start_query_execution_request(query)
        try:
            response = retry_api_call(self._connection.start_query_execution,
                                      exceptions=self.retry_exceptions,
                                      attempt=self.retry_attempt,
                                      multiplier=self.retry_multiplier,
                                      max_delay=self.retry_max_delay,
                                      exp_base=self.retry_exponential_base,
                                      logger=_logger,
                                      **request)
        except Exception as e:
            _logger.exception('Failed to execute query.')
            raise_from(DatabaseError(*e.args), e)
        else:
            return response.get('QueryExecutionId', None)
Example #22
 def _as_pandas(self):
     import pandas as pd
     if not self.output_location:
         raise ProgrammingError('OutputLocation is none or empty.')
     bucket, key = self._parse_output_location(self.output_location)
     try:
         response = retry_api_call(self._client.get_object,
                                   Bucket=bucket,
                                   Key=key)
     except Exception as e:
         _logger.exception('Failed to download csv.')
         raise_from(OperationalError(*e.args), e)
     else:
         df = pd.read_csv(io.BytesIO(response['Body'].read()),
                          dtype=self._dtypes(),
                          converters=self._converters(),
                          parse_dates=self._parse_dates(),
                          infer_datetime_format=True)
         df = self._trunc_date(df)
         return df
Example #23
 def __fetch(self, next_token=None):
     if not self._query_execution.query_id:
         raise ProgrammingError('QueryExecutionId is none or empty.')
     if self._query_execution.state != AthenaQueryExecution.STATE_SUCCEEDED:
         raise ProgrammingError('QueryExecutionState is not SUCCEEDED.')
     request = {
         'QueryExecutionId': self._query_execution.query_id,
         'MaxResults': self._arraysize,
     }
     if next_token:
         request.update({'NextToken': next_token})
     try:
         response = retry_api_call(self._connection.client.get_query_results,
                                   config=self._retry_config,
                                   logger=_logger,
                                   **request)
     except Exception as e:
         _logger.exception('Failed to fetch result set.')
         raise_from(OperationalError(*e.args), e)
     else:
         return response
Example #24
 def _pre_fetch(self):
     if not self._query_id:
         raise ProgrammingError('QueryExecutionId is none or empty.')
     try:
         request = {
             'QueryExecutionId': self._query_id,
             'MaxResults': self._arraysize,
         }
         response = retry_api_call(self._connection.get_query_results,
                                   exceptions=self.retry_exceptions,
                                   attempt=self.retry_attempt,
                                   multiplier=self.retry_multiplier,
                                   max_delay=self.retry_max_delay,
                                   exp_base=self.retry_exponential_base,
                                   logger=_logger,
                                   **request)
     except Exception as e:
         _logger.exception('Failed to fetch result set.')
         raise_from(OperationalError(*e.args), e)
     else:
         self._process_meta_data(response)
         self._process_result_set(response)
Example #25
 def _as_pandas(self):
     import pandas as pd
     if not self.output_location:
         raise ProgrammingError('OutputLocation is none or empty.')
     bucket, key = parse_output_location(self.output_location)
     try:
         response = retry_api_call(self._client.get_object,
                                   config=self._retry_config,
                                   logger=_logger,
                                   Bucket=bucket,
                                   Key=key)
     except Exception as e:
         _logger.exception('Failed to download csv.')
         raise_from(OperationalError(*e.args), e)
     else:
         length = response['ContentLength']
         if length:
             if self.output_location.endswith('.txt'):
                 sep = '\t'
                 header = None
                 names = [d[0] for d in self.description]
             else:  # csv format
                 sep = ','
                 header = 0
                 names = None
             df = pd.read_csv(io.BytesIO(response['Body'].read()),
                              sep=sep,
                              header=header,
                              names=names,
                              dtype=self.dtypes,
                              converters=self.converters,
                              parse_dates=self.parse_dates,
                              infer_datetime_format=True,
                              skip_blank_lines=False)
             df = self._trunc_date(df)
         else:  # Allow empty response
             df = pd.DataFrame()
         return df
Example #26
 def _get_table_metadata(
     self,
     table_name: str,
     catalog_name: Optional[str] = None,
     schema_name: Optional[str] = None,
 ) -> AthenaTableMetadata:
     request = {
         "CatalogName": catalog_name if catalog_name else self._catalog_name,
         "DatabaseName": schema_name if schema_name else self._schema_name,
         "TableName": table_name,
     }
     try:
         response = retry_api_call(
             self._connection.client.get_table_metadata,
             config=self._retry_config,
             logger=_logger,
             **request
         )
     except Exception as e:
         _logger.exception("Failed to get table metadata.")
         raise OperationalError(*e.args) from e
     else:
         return AthenaTableMetadata(response)
Example #27
 def __fetch(self, next_token: Optional[str] = None):
     if not self.query_id:
         raise ProgrammingError("QueryExecutionId is none or empty.")
     if self.state != AthenaQueryExecution.STATE_SUCCEEDED:
         raise ProgrammingError("QueryExecutionState is not SUCCEEDED.")
     if self.is_closed:
         raise ProgrammingError("AthenaResultSet is closed.")
     request = {
         "QueryExecutionId": self.query_id,
         "MaxResults": self._arraysize,
     }
     if next_token:
         request.update({"NextToken": next_token})
     try:
         connection = cast("Connection", self._connection)
         response = retry_api_call(connection.client.get_query_results,
                                   config=self._retry_config,
                                   logger=_logger,
                                   **request)
     except Exception as e:
         _logger.exception("Failed to fetch result set.")
         raise OperationalError(*e.args) from e
     else:
         return response