def read_sql_athena(self, sql, database, s3_output=None, max_result_size=None):
    """
    Execute any SQL query on AWS Athena and return the result as a Pandas DataFrame.

    P.S. If max_result_size is passed, an iterator of DataFrames is returned.

    :param sql: SQL query
    :param database: Glue/Athena database name
    :param s3_output: AWS S3 path for Athena query results (a default results
        bucket is created via the session if omitted)
    :param max_result_size: Max number of bytes on each request to S3
    :return: Pandas DataFrame, or an iterator of DataFrames if max_result_size is not None
    :raises AthenaQueryError: if the query finishes in FAILED or CANCELLED state
    """
    if not s3_output:
        s3_output = self._session.athena.create_athena_bucket()
    query_execution_id = self._session.athena.run_query(query=sql,
                                                        database=database,
                                                        s3_output=s3_output)
    query_response = self._session.athena.wait_query(
        query_execution_id=query_execution_id)
    if query_response["QueryExecution"]["Status"]["State"] in ["FAILED", "CANCELLED"]:
        # "StateChangeReason" is optional in the Athena GetQueryExecution
        # response; fall back to a generic message instead of raising KeyError
        # and masking the real query failure.
        reason = query_response["QueryExecution"]["Status"].get(
            "StateChangeReason", "Unknown reason")
        raise AthenaQueryError(f"Query error: {reason}")
    dtype, parse_timestamps, parse_dates, converters = self._session.athena.get_query_dtype(
        query_execution_id=query_execution_id)
    path = f"{s3_output}{query_execution_id}.csv"
    ret = self.read_csv(path=path,
                        dtype=dtype,
                        parse_dates=parse_timestamps,
                        converters=converters,
                        quoting=csv.QUOTE_ALL,
                        max_result_size=max_result_size)
    if max_result_size is not None:
        # Streaming mode: date conversion is applied lazily to each chunk.
        return Pandas._apply_dates_to_generator(generator=ret,
                                                parse_dates=parse_dates)
    if len(ret.index) > 0:
        # DATE columns are parsed as datetime64; downcast to datetime.date.
        for col in parse_dates:
            ret[col] = ret[col].dt.date
    return ret
def read_sql_athena(self, sql, database, s3_output=None):
    """
    Execute any SQL query on AWS Athena and return the result as a Pandas DataFrame.

    :param sql: SQL query
    :param database: Glue/Athena database name
    :param s3_output: AWS S3 path for Athena query results; defaults to the
        account/region default Athena results location
    :return: Pandas DataFrame with the query result
    :raises AthenaQueryError: if the query finishes in FAILED or CANCELLED state
    """
    if not s3_output:
        # Build the default Athena results location for this account/region.
        account_id = (self._session.boto3_session.client(
            service_name="sts", config=self._session.botocore_config).
                      get_caller_identity().get("Account"))
        session_region = self._session.boto3_session.region_name
        s3_output = f"s3://aws-athena-query-results-{account_id}-{session_region}/"
        # NOTE(review): the original code then did
        #   s3_resource = self._session.boto3_session.resource("s3")
        #   s3_resource.Bucket(s3_output)
        # Bucket() is a lazy resource factory that makes no API call (and it
        # was passed an s3:// URI where a bucket name is expected), so no
        # bucket was ever created — the two lines were a no-op and have been
        # removed. The default results bucket is assumed to already exist.
    query_execution_id = self._session.athena.run_query(query=sql,
                                                        database=database,
                                                        s3_output=s3_output)
    query_response = self._session.athena.wait_query(
        query_execution_id=query_execution_id)
    if query_response.get("QueryExecution").get("Status").get("State") in [
            "FAILED", "CANCELLED"
    ]:
        reason = (query_response.get("QueryExecution").get("Status").get(
            "StateChangeReason"))
        raise AthenaQueryError(f"Query error: {reason}")
    path = f"{s3_output}{query_execution_id}.csv"
    return self.read_csv(path=path)