class read_athena():
    conn = connect(s3_staging_dir='s3://willjeventdata/seatgeek/main data',
                   region_name='us-west-2')
    df = pd.read_sql("select * from tickets_db.seatgeek_events limit 100", conn)
def run_query(self, query, user):
    cursor = pyathena.connect(
        s3_staging_dir=self.configuration['s3_staging_dir'],
        region_name=self.configuration['region'],
        aws_access_key_id=self.configuration.get('aws_access_key', None),
        aws_secret_access_key=self.configuration.get('aws_secret_key', None),
        schema_name=self.configuration.get('schema', 'default'),
        encryption_option=self.configuration.get('encryption_option', None),
        kms_key=self.configuration.get('kms_key', None),
        formatter=SimpleFormatter()).cursor()

    try:
        cursor.execute(query)
        column_tuples = [(i[0], _TYPE_MAPPINGS.get(i[1], None))
                         for i in cursor.description]
        columns = self.fetch_columns(column_tuples)
        rows = [dict(zip(([c['name'] for c in columns]), r))
                for i, r in enumerate(cursor.fetchall())]
        data = {'columns': columns, 'rows': rows}
        json_data = json.dumps(data, cls=JSONEncoder)
        error = None
    except KeyboardInterrupt:
        if cursor.query_id:
            cursor.cancel()
        error = "Query cancelled by user."
        json_data = None
    except Exception as ex:
        if cursor.query_id:
            cursor.cancel()
        error = str(ex)
        json_data = None

    return json_data, error
def _connect(cfg):
    global conn, cursor, databases
    try:
        info("Connect.")
        conn = connect(aws_access_key_id=cfg['aws']['access_key'],
                       aws_secret_access_key=cfg['aws']['secret_key'],
                       s3_staging_dir=cfg['aws']['s3_stage_dir'],
                       region_name='ap-northeast-2')
        cursor = conn.cursor()
        databases = [rv[0] for rv in cursor.execute('show databases').fetchall()]
    except Exception as e:
        error("=== DB Error Message ===")
        error(str(e))
        messagebox.showerror("Error", "DB connection error. Please check your AWS settings.")
        aws_btn['state'] = 'normal'
    else:
        warning("Connect success.")
        # Apply the database list to every profile
        if databases is not None:
            for k, pro in profiles.items():
                pro.set_databases(databases)
            win.after(10, _first_db_set)
        # enable_controls()
    finally:
        wait_dlg.top.destroy()
        unset_wait_cursor()
def __post_init__(self):
    # set up the datasets
    (
        self.target_time_series,
        self.related_time_series,
        self.item_metadata,
    ) = self._get_datasets()

    # get the dataset group (from the provided forecast)
    self.dataset_group = self.forecast._dataset_group

    # set up column mappings
    column_mappings = {}
    for n in range(1, MAX_DIMENSIONS + 1):
        column_mappings[f"dimension_{n}"] = None
    for n in range(1, MAX_METADATA_FIELDS + 1):
        column_mappings[f"metadata_{n}"] = None
    self.column_mappings = column_mappings

    # set up the connection to AWS Athena
    self.cursor = connect(
        region_name=get_aws_region(),
        work_group=self.workgroup,
        schema_name=self.schema,
        cursor_class=AsyncCursor,
    ).cursor()
def _open_connection(self):
    return pyathena.connect(
        aws_access_key_id=self.config.access_key,
        aws_secret_access_key=self.config.secret_key,
        s3_staging_dir=self.config.staging_dir,
        region_name=self.config.region,
    )
def athena_python(sql='', s3_staging_dir=''):
    cnx = connect(aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
                  aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
                  region_name=os.environ['AWS_DEFAULT_REGION'],
                  s3_staging_dir=s3_staging_dir)
    athena_df = pd.read_sql(sql, cnx)
    return athena_df
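# Minimal usage sketch for athena_python above, assuming the AWS_* environment
# variables are already set; the database, table, and staging bucket below are
# hypothetical placeholders rather than values from the original code.
df = athena_python(
    sql="SELECT * FROM example_db.example_table LIMIT 10",
    s3_staging_dir="s3://example-athena-results/staging/",
)
print(df.head())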
def query(self, q):
    _conn_args = {
        's3_staging_dir': self.staging_dir,
        'database': self.database
    }
    if self.access_key is not None and self.secret_key is not None:
        _conn_args['aws_access_key_id'] = self.access_key
        _conn_args['aws_secret_access_key'] = self.secret_key
    if self.region_name is not None:
        _conn_args['region_name'] = self.region_name

    conn = connect(**_conn_args)
    cursor = conn.cursor()
    cursor.execute(q)

    # Load query results into Pandas DataFrame and show results
    df = as_pandas(cursor)

    col_map = {}
    for col in df.columns:
        col_map[col] = col

    return df, col_map
def _new_connection(self):
    config = self.config
    if self.mode == 'jdbc':
        conn_params = {
            'Workgroup': config.workgroup,
            'AwsRegion': config.region,
            'S3OutputLocation': config.output_location
        }
        if config.default_database is not None:
            conn_params['Schema'] = config.default_database
        conn = pyathenajdbc.connect(**conn_params)
    else:  # rest, csv
        assert self.mode in ['rest', 'csv']
        conn_params = {
            'work_group': config.workgroup,
            'region_name': config.region,
            'output_location': config.output_location
        }
        if config.default_database is not None:
            conn_params['schema_name'] = config.default_database
        if self.mode == 'csv':
            conn_params['cursor_class'] = PandasCursor
        conn = pyathena.connect(**conn_params)
    return conn
def get_device_data(data={}, devices=[]):
    added = 0
    start = int(get_start().strftime('%s')) * 1000
    stop = int(get_stop().strftime('%s')) * 1000
    c = connect(s3_staging_dir='s3://parsyl-athena-output-production-useast1',
                region_name='us-east-1')
    for device_id in tqdm_notebook(devices, desc='device data loaded'):
        if device_id in data:
            continue
        stmt = """
        SELECT time, temperature, humidity
        FROM parsyl_device_data_database.parsyl_data_lake_production_useast1_v3
        WHERE device=%(device_id)s
        AND temperature IS NOT NULL
        AND time >= %(start)d AND time <= %(stop)d
        ORDER BY time
        """
        try:
            with c.cursor() as cursor:
                cursor.execute(stmt, {
                    'device_id': device_id,
                    'start': start,
                    'stop': stop
                })
                data[device_id] = as_pandas(cursor)
                added += 1
        except Exception as e:
            print('ERROR querying device data - {}'.format(e))
    c.close()
    return data, added
def create_connection(pyathena_rc_path=None):
    """Creates an AWS Athena connection using the credentials in the PyAthena RC file

    Args:
        pyathena_rc_path (str): Path to the PyAthena RC file with the AWS Athena
            variables (default: None)

    Returns:
        PandasCursor: AWS Athena connection cursor
    """
    if pyathena_rc_path is None:
        pyathena_rc_path = Path(__file__).parent / "../../../pyathena/pyathena.rc"

    # Open Pyathena RC file and get list of all connection variables in a processable format
    with open(pyathena_rc_path) as f:
        lines = f.readlines()
        lines = [x.strip() for x in lines]
        lines = [x.split('export ')[1] for x in lines]
        lines = [line.replace('=', '="') + '"' if '="' not in line else line
                 for line in lines]
        variables = [line.split('=') for line in lines]

    # Create variables using the processed variable names from the RC file
    AWS_CREDS = {}
    for key, var in variables:
        exec("{} = {}".format(key, var), AWS_CREDS)

    # Create connection
    cursor = connect(aws_access_key_id=AWS_CREDS['AWS_ACCESS_KEY_ID'],
                     aws_secret_access_key=AWS_CREDS['AWS_SECRET_ACCESS_KEY'],
                     s3_staging_dir=AWS_CREDS['AWS_ATHENA_S3_STAGING_DIR'],
                     region_name=AWS_CREDS['AWS_DEFAULT_REGION'],
                     work_group=AWS_CREDS['AWS_ATHENA_WORK_GROUP'],
                     schema_name=SCHEMA_NAME).cursor(PandasCursor)
    return cursor
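# Usage sketch for create_connection above, assuming a pyathena.rc file of
# `export KEY=value` lines and a module-level SCHEMA_NAME; the query and table
# name are hypothetical placeholders.
cursor = create_connection()
df = cursor.execute("SELECT * FROM example_table LIMIT 10").as_pandas()
print(df.shape)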
def run_query(self, query, user):
    cursor = pyathena.connect(
        s3_staging_dir=self.configuration['s3_staging_dir'],
        region_name=self.configuration['region'],
        aws_access_key_id=self.configuration.get('aws_access_key', None),
        aws_secret_access_key=self.configuration.get('aws_secret_key', None),
        schema_name=self.configuration.get('schema', 'default'),
        encryption_option=self.configuration.get('encryption_option', None),
        kms_key=self.configuration.get('kms_key', None),
        formatter=SimpleFormatter()).cursor()

    try:
        cursor.execute(query)
        column_tuples = [(i[0], _TYPE_MAPPINGS.get(i[1], None))
                         for i in cursor.description]
        columns = self.fetch_columns(column_tuples)
        rows = [dict(zip(([c['name'] for c in columns]), r))
                for i, r in enumerate(cursor.fetchall())]
        data = {'columns': columns, 'rows': rows}
        json_data = json.dumps(data, cls=JSONEncoder)
        error = None
    except KeyboardInterrupt:
        if cursor.query_id:
            cursor.cancel()
        error = "Query cancelled by user."
        json_data = None
    except Exception as ex:
        if cursor.query_id:
            cursor.cancel()
        error = str(ex)
        json_data = None

    return json_data, error
def _setup_session(request):
    request.addfinalizer(_teardown_session)
    _upload_rows()
    with contextlib.closing(connect()) as conn:
        with conn.cursor() as cursor:
            _create_database(cursor)
            _create_table(cursor)
def _cursor(cursor_class, request):
    if not hasattr(request, "param"):
        setattr(request, "param", {})
    with contextlib.closing(
        connect(schema_name=ENV.schema, cursor_class=cursor_class, **request.param)
    ) as conn:
        with conn.cursor() as cursor:
            yield cursor
def lambda_handler(event, context):
    ATHENA_OUTPUT_BUCKET = os.environ['Athena_bucket']
    DB_SCHEMA = os.environ['Db_schema']
    USE_WEATHER_DATA = int(os.environ['With_weather_data'])  # environment values are strings

    pathParameter = event["pathParameters"]
    queryParameter = event["queryStringParameters"]
    if ("meter_id" not in pathParameter) \
            or ("data_start" not in queryParameter) \
            or ("data_end" not in queryParameter):
        return {
            'statusCode': 500,
            'body': "error: meter_id, data_start, and data_end need to be provided."
        }

    METER_ID = pathParameter['meter_id']
    ML_ENDPOINT_NAME = load_json_from_file(
        WORKING_BUCKET, "meteranalytics/initial_pass")["ML_endpoint_name"]
    DATA_START = queryParameter['data_start'].replace("-", "")
    DATA_END = queryParameter['data_end'].replace("-", "")

    connection = connect(
        s3_staging_dir='s3://{}/'.format(ATHENA_OUTPUT_BUCKET),
        region_name=REGION)

    query = '''select date_trunc('HOUR', reading_date_time) as datetime,
                      sum(reading_value) as consumption
               from "{}".daily
               where meter_id = '{}'
                 and date_str >= '{}'
                 and date_str < '{}'
               group by 1;'''.format(DB_SCHEMA, METER_ID, DATA_START, DATA_END)

    result = pd.read_sql(query, connection)
    result = result.set_index('datetime')
    data_kw = result.resample('1H').sum()
    timeseries = data_kw.iloc[:, 0]  # np.trim_zeros(data_kw.iloc[:,0], trim='f')

    freq = 'H'
    df_weather = None
    if USE_WEATHER_DATA == 1:
        df_weather = get_weather(connection, DATA_START, DB_SCHEMA)

    runtime = boto3.client('runtime.sagemaker')
    response = runtime.invoke_endpoint(EndpointName=ML_ENDPOINT_NAME,
                                       ContentType='application/json',
                                       Body=encode_request(timeseries[:], df_weather))
    prediction_time = timeseries.index[-1] + pd.Timedelta(1, unit='H')
    df_prediction = decode_response(response['Body'].read(), freq, prediction_time)
    df_prediction.columns = ['consumption']
    prediction_result = df_prediction.to_json()
    return {"statusCode": 200, "body": prediction_result}
def create_client(config, logger: Logger):
    """Generates an Athena client object

    Args:
        config (dict): connection configuration values
        logger (Logger): logger instance

    Returns:
        cursor: Athena client object
    """
    logger.info("Attempting to create Athena session")

    # Get the required parameters from config file and/or environment variables
    aws_access_key_id = config.get("aws_access_key_id") or os.environ.get(
        "AWS_ACCESS_KEY_ID"
    )
    aws_secret_access_key = config.get("aws_secret_access_key") or os.environ.get(
        "AWS_SECRET_ACCESS_KEY"
    )
    aws_session_token = config.get("aws_session_token") or os.environ.get(
        "AWS_SESSION_TOKEN"
    )
    aws_profile = config.get("aws_profile") or os.environ.get("AWS_PROFILE")
    aws_region = config.get("aws_region") or os.environ.get("AWS_REGION")
    s3_staging_dir = config.get("s3_staging_dir") or os.environ.get("S3_STAGING_DIR")

    logger.info(f"Using Athena region {aws_region}")

    # AWS credentials based authentication
    if aws_access_key_id and aws_secret_access_key:
        cursor = connect(
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            region_name=aws_region,
            s3_staging_dir=s3_staging_dir,
        ).cursor()
    # AWS Profile based authentication
    else:
        cursor = connect(
            profile_name=aws_profile,
            region_name=aws_region,
            s3_staging_dir=s3_staging_dir,
        ).cursor()
    return cursor
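# Usage sketch for create_client above; the config values are hypothetical and
# the standard-library logger stands in for whatever Logger the module expects.
import logging

cursor = create_client(
    config={"aws_region": "us-east-1",
            "s3_staging_dir": "s3://example-athena-results/staging/"},
    logger=logging.getLogger("athena"),
)
cursor.execute("SELECT 1")
print(cursor.fetchall())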
def get_conn() -> AthenaConn:
    """Allow passing of pyathena connection."""
    # load the environment
    env_name = ".env"
    load_dotenv(dotenv_path=os.path.join(PROJ_DIR, env_name))
    return pyathena.connect(aws_access_key_id=os.getenv("ACCESS_KEY"),
                            aws_secret_access_key=os.getenv("SECRET_KEY"),
                            s3_staging_dir=os.getenv("S3_DIR"),
                            region_name=os.getenv("REGION_NAME"))
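# Usage sketch for get_conn above, assuming a .env file in PROJ_DIR that defines
# ACCESS_KEY, SECRET_KEY, S3_DIR, and REGION_NAME; the query is a placeholder.
conn = get_conn()
cursor = conn.cursor()
cursor.execute("SELECT 1")
print(cursor.fetchall())
conn.close()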
def lambda_handler(event, context):
    ATHENA_OUTPUT_BUCKET = os.environ['Athena_bucket']
    S3_BUCKET = os.environ['Working_bucket']
    DB_SCHEMA = os.environ['Db_schema']

    BATCH_START = event['Batch_start']
    BATCH_END = event['Batch_end']
    FORECAST_START = event['Data_end']
    FORECAST_PERIOD = event['Forecast_period']
    prediction_length = FORECAST_PERIOD * 24

    connection = connect(
        s3_staging_dir='s3://{}/'.format(ATHENA_OUTPUT_BUCKET),
        region_name=REGION)

    output = 'meteranalytics/inference/batch_%s_%s/batch.json.out' % (
        BATCH_START, BATCH_END)
    boto3.Session().resource('s3').Bucket(S3_BUCKET).Object(
        output).download_file('/tmp/batch.out.json')

    print('get inference result')
    freq = 'H'
    prediction_time = pd.Timestamp(FORECAST_START, freq=freq)
    prediction_index = pd.date_range(
        start=prediction_time,
        end=prediction_time + pd.Timedelta(prediction_length - 1, unit='H'),
        freq=freq)
    dict_of_samples = {}

    meterids = get_meters(connection, BATCH_START, BATCH_END, DB_SCHEMA)

    results = pd.DataFrame(columns=['meterid', 'datetime', 'kwh'])
    i = 0
    with open('/tmp/batch.out.json') as fp:
        for line in fp:
            df = pd.DataFrame(data={**json.loads(line)['quantiles'],
                                    **dict_of_samples},
                              index=prediction_index)
            dataframe = pd.DataFrame({
                'meter_id': np.array([meterids[i] for x in range(df['0.9'].count())]),
                'datetime': df.index.values,
                'consumption': df['0.9'].values
            })
            i = i + 1
            results = results.append(dataframe)

    results.to_csv('/tmp/forecast.csv', index=False)
    boto3.Session().resource('s3').Bucket(S3_BUCKET).Object(
        os.path.join('meteranalytics',
                     'forecast/{}/batch_{}_{}.csv'.format(
                         FORECAST_START, BATCH_START, BATCH_END))).upload_file(
                             '/tmp/forecast.csv')
def create_connection(self):
    # pyathena.connect will do the role resolving
    # aws_credentials = self.aws_credentials.resolve_role('soda_scan')
    return pyathena.connect(
        aws_access_key_id=self.aws_credentials.access_key_id,
        aws_secret_access_key=self.aws_credentials.secret_access_key,
        s3_staging_dir=self.athena_staging_dir,
        region_name=self.aws_credentials.region_name,
        role_arn=self.aws_credentials.role_arn)
def run_query(self, query, user):
    cursor = pyathena.connect(
        s3_staging_dir=self.configuration['s3_staging_dir'],
        region_name=self.configuration['region'],
        aws_access_key_id=self.configuration.get('aws_access_key', None),
        aws_secret_access_key=self.configuration.get('aws_secret_key', None),
        schema_name=self.configuration.get('schema', 'default'),
        encryption_option=self.configuration.get('encryption_option', None),
        kms_key=self.configuration.get('kms_key', None),
        formatter=SimpleFormatter()).cursor()

    try:
        cursor.execute(query)
        column_tuples = [(i[0], _TYPE_MAPPINGS.get(i[1], None))
                         for i in cursor.description]
        columns = self.fetch_columns(column_tuples)
        rows = [dict(zip(([c['name'] for c in columns]), r))
                for i, r in enumerate(cursor.fetchall())]
        qbytes = None
        athena_query_id = None
        try:
            qbytes = cursor.data_scanned_in_bytes
        except AttributeError as e:
            logger.debug("Athena Upstream can't get data_scanned_in_bytes: %s", e)
        try:
            athena_query_id = cursor.query_id
        except AttributeError as e:
            logger.debug("Athena Upstream can't get query_id: %s", e)
        data = {
            'columns': columns,
            'rows': rows,
            'metadata': {
                'data_scanned': qbytes,
                'athena_query_id': athena_query_id
            }
        }
        json_data = simplejson.dumps(data, ignore_nan=True, cls=SimpleJSONEncoder)
        error = None
    except KeyboardInterrupt:
        if cursor.query_id:
            cursor.cancel()
        error = "Query cancelled by user."
        json_data = None
    except Exception as ex:
        if cursor.query_id:
            cursor.cancel()
        error = str(ex)
        json_data = None

    return json_data, error
def authenticate(self):
    conn = connect(
        # jvm_path='/Library/Java/JavaVirtualMachines/jdk1.8.0_161.jdk/Contents/MacOS/libjli.dylib',
        profile_name=self.profile_name,
        aws_access_key_id=self.credentials.access_key,
        aws_secret_access_key=self.credentials.secret_key,
        aws_session_token=self.credentials.token,
        region_name=self.region_name,
        s3_staging_dir=self.s3_staging_dir)
    return conn
def __init__(self):
    self.cursor = connect(
        aws_access_key_id=settings.EXTERNAL_SERVICES["AWS"]["ATHENA"]["ACCESS_KEY"],
        aws_secret_access_key=settings.EXTERNAL_SERVICES["AWS"]["ATHENA"]["SECRET_KEY"],
        region_name=settings.EXTERNAL_SERVICES["AWS"]["ATHENA"]["REGION_NAME"],
        s3_staging_dir=settings.EXTERNAL_SERVICES["AWS"]["ATHENA"]["STAGING_DIR"]).cursor()
def athena_add_partition(cluster, d_yyyy, d_mm, d_dd):
    cursor = connect(aws_access_key_id=conf.athena_access_key,
                     aws_secret_access_key=conf.athena_secret_access_key,
                     s3_staging_dir=conf.s3_staging_dir,
                     region_name=conf.region_name).cursor()
    command = "ALTER TABLE " + conf.athena_schema + "." + conf.athena_table + \
              " ADD IF NOT EXISTS PARTITION "
    command = command + "(cluster='" + cluster + "', yyyy='" + d_yyyy + \
              "', mm='" + d_mm + "', dd='" + d_dd + "') "
    command = command + "location 's3://" + conf.s3_path + cluster + "/" + \
              d_yyyy + "/" + d_mm + "/" + d_dd + "'"
    Mylogger.info("Athena_add_partition:" + command)
    cursor.execute(command)
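# Usage sketch for athena_add_partition above, assuming a `conf` module that
# provides the Athena/S3 settings; the cluster name and date are placeholders.
athena_add_partition(cluster="example-cluster", d_yyyy="2024", d_mm="01", d_dd="15")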
def create_connection():
    # Create a database connection based on all of the environment information set above.
    app.logger.info("creating connection...")
    aws_credentials = get_raven_athena_credentials()
    return connect(
        s3_staging_dir=ATHENA_BUCKET,
        region_name=AWS_REGION,
        aws_access_key_id=aws_credentials["AccessKeyId"],
        aws_secret_access_key=aws_credentials["SecretAccessKey"],
        aws_session_token=aws_credentials["SessionToken"],
    )
def _get_client(self):
    """
    Connect to Athena lazily

    :rtype: pyathena.connection.Connection
    """
    self.logger.info('Connecting to Athena in "%s"...',
                     self._connection_params['region_name'])

    # https://pypi.org/project/PyAthena/
    return connect(**self._connection_params)
def connect_athena(path='configs/athena.yaml'):
    """Gets an Athena connection given an Athena configuration file.

    Returns
    -------
    pyathena.connection.Connection
        Athena database connection
    """
    with open(path, 'r') as f:
        athena_config = yaml.safe_load(f)
    return connect(**athena_config)
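# Usage sketch for connect_athena above, assuming configs/athena.yaml holds
# keyword arguments accepted by pyathena.connect (e.g. s3_staging_dir,
# region_name); the query below is a placeholder.
import pandas as pd

conn = connect_athena()
df = pd.read_sql("SELECT 1 AS ok", conn)
print(df)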
def run_query(self, query, user): cursor = pyathena.connect( s3_staging_dir=self.configuration["s3_staging_dir"], schema_name=self.configuration.get("schema", "default"), encryption_option=self.configuration.get("encryption_option", None), kms_key=self.configuration.get("kms_key", None), work_group=self.configuration.get("work_group", "primary"), formatter=SimpleFormatter(), **self._get_iam_credentials(user=user) ).cursor() try: cursor.execute(query) column_tuples = [ (i[0], _TYPE_MAPPINGS.get(i[1], None)) for i in cursor.description ] columns = self.fetch_columns(column_tuples) rows = [ dict(zip(([c["name"] for c in columns]), r)) for i, r in enumerate(cursor.fetchall()) ] qbytes = None athena_query_id = None try: qbytes = cursor.data_scanned_in_bytes except AttributeError as e: logger.debug("Athena Upstream can't get data_scanned_in_bytes: %s", e) try: athena_query_id = cursor.query_id except AttributeError as e: logger.debug("Athena Upstream can't get query_id: %s", e) data = { "columns": columns, "rows": rows, "metadata": { "data_scanned": qbytes, "athena_query_id": athena_query_id, }, } json_data = json_dumps(data, ignore_nan=True) error = None except (KeyboardInterrupt, InterruptException): if cursor.query_id: cursor.cancel() error = "Query cancelled by user." json_data = None except Exception as ex: if cursor.query_id: cursor.cancel() error = str(ex) json_data = None return json_data, error
def connect(self, database=None):
    conn = pyathena.connect(
        aws_access_key_id=self.aws_access_key_id,
        aws_secret_access_key=self.aws_secret_access_key,
        region_name=self.region_name,
        s3_staging_dir=self.s3_staging_dir,
        schema_name=database or self.database)
    self.database = database or self.database
    if hasattr(self, 'conn'):
        self.conn.close()
    self.conn = conn
def run_query(sql):
    from pyathena import connect

    cursor = connect(
        profile_name=aws_profile,
        s3_staging_dir=result_uri,
        schema_name="learn_athena",
    ).cursor()
    cursor.execute(sql)
    for row in cursor.fetchall():
        print(row)
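# Usage sketch for run_query above, assuming module-level aws_profile and
# result_uri are already defined; the table name is a placeholder.
run_query("SELECT * FROM example_table LIMIT 5")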
def lambda_handler(event, context):
    ATHENA_OUTPUT_BUCKET = os.environ['Athena_bucket']
    DB_SCHEMA = os.environ['Db_schema']
    USE_WEATHER_DATA = int(os.environ['With_weather_data'])  # environment values are strings

    pathParameter = event["pathParameters"]
    queryParameter = event["queryStringParameters"]
    if ("meter_id" not in pathParameter) \
            or ("data_start" not in queryParameter) \
            or ("data_end" not in queryParameter) \
            or ("outlier_only" not in queryParameter):
        return {
            'statusCode': 500,
            'body': "error: meter_id, data_start, data_end and outlier_only need to be provided."
        }

    METER_ID = pathParameter['meter_id']
    DATA_START = queryParameter['data_start']
    DATA_END = queryParameter['data_end']
    OUTLIER_ONLY = int(queryParameter['outlier_only'])  # query string values are strings

    connection = connect(
        s3_staging_dir='s3://{}/'.format(ATHENA_OUTPUT_BUCKET),
        region_name=REGION)

    if USE_WEATHER_DATA == 1:
        query = '''with weather_daily as (
                       select date_trunc('day', date_parse(time,'%Y-%m-%d %H:%i:%s')) as datetime,
                              avg(temperature) as temperature,
                              avg(apparenttemperature) as apparenttemperature,
                              avg(humidity) as humidity
                       from "{}".weather
                       group by 1
                   )
                   select ma.*, mw.temperature, mw.apparenttemperature
                   from "{}".anomaly ma, weather_daily mw
                   where meter_id = '{}'
                     and cast(ma.ds as timestamp) >= timestamp '{}'
                     and cast(ma.ds as timestamp) < timestamp '{}'
                     and cast(ma.ds as timestamp) = mw.datetime
                '''.format(DB_SCHEMA, DB_SCHEMA, METER_ID, DATA_START, DATA_END)
    else:
        query = '''select *
                   from "{}".anomaly
                   where meter_id = '{}'
                     and cast(ds as timestamp) >= timestamp '{}'
                     and cast(ds as timestamp) < timestamp '{}'
                '''.format(DB_SCHEMA, METER_ID, DATA_START, DATA_END)

    if OUTLIER_ONLY == 1:
        query = query + ' and anomaly <> 0'

    df_consumption = pd.read_sql(query, connection)

    return {"statusCode": 200, "body": df_consumption.to_json()}
def connect(self):
    if not self._connected:
        # Check for database
        if not athena_exist(self._dbname, self._s3bucket, self._remotepath, verbose=False):
            self._connected = False
            pm.print_error('Athena [{0}] does not exist'.format(self._dbname), exit_code=1)

        if check_aws_env():
            try:
                self._connection = pyathena.connect(
                    aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
                    aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
                    s3_staging_dir=self._output_location,
                    region_name=os.environ['AWS_REGION'])
            except Exception as e:
                pm.print_error('Error connecting to database')
                pm.print_separator()
                pm.print_error(str(e))
                pm.print_separator()
                self._connected = False
        else:
            try:
                self._connection = pyathena.connect(
                    s3_staging_dir=self._output_location,
                    region_name=os.environ['AWS_REGION'])
            except Exception as e:
                pm.print_error('Error connecting to database')
                pm.print_separator()
                pm.print_error(str(e))
                pm.print_separator()
                self._connected = False

        self._connected = True
def make_athena_connection(s3_staging_dir=None, region_name=None,
                           schema_name=None, work_group=None):
    """
    Function that connects to an Athena database.

    Parameters
    ----------
    s3_staging_dir : str
        s3 path to data in Athena. Starts with 's3://'.
    region_name : str
        AWS region, e.g., 'us-west-2'
    schema_name : str
        name of the Athena database schema (a.k.a. database_name in AthenaEngine)
    work_group : None or str
        name of the AWS work_group

    Returns
    -------
    con : pyathena.connection.Connection
    cur : pyathena.cursor.Cursor
    engine : sqlalchemy.engine.Engine
    """
    # consolidate direct and configured arguments
    kwargs = _get_aws_connection_kwargs(s3_staging_dir=s3_staging_dir,
                                        region_name=region_name,
                                        schema_name=schema_name,
                                        work_group=work_group)
    con = pyathena.connect(**kwargs)
    cur = con.cursor()

    engine = None
    if kwargs['schema_name'] is not None:
        assert kwargs['region_name'] is not None, kwargs
        conn_str = make_athena_connection.CONNECTION_STRING.format(
            region_name=kwargs['region_name'],
            database_name=kwargs['schema_name'])
        sep = '?'
        if kwargs['s3_staging_dir'] is not None:
            conn_str += '{sep}s3_staging_dir={s3_staging_dir}'.format(
                sep=sep, s3_staging_dir=kwargs['s3_staging_dir'])
            sep = '&'
        if kwargs['work_group'] is not None:
            conn_str += '{sep}work_group={work_group}'.format(
                sep=sep, work_group=kwargs['work_group'])
        logger.debug(
            f"Connecting to AWS Athena with connection string:{conn_str}")
        engine = sa.create_engine(conn_str)

    return con, cur, engine
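# Usage sketch for make_athena_connection above; the staging dir, region, and
# schema are hypothetical placeholders, and the SQLAlchemy engine is only built
# when a schema_name is resolved.
con, cur, engine = make_athena_connection(
    s3_staging_dir="s3://example-athena-results/staging/",
    region_name="us-west-2",
    schema_name="example_db",
)
cur.execute("SELECT 1")
print(cur.fetchall())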
def run_query(self, query, user):
    cursor = pyathena.connect(
        s3_staging_dir=self.configuration['s3_staging_dir'],
        region_name=self.configuration['region'],
        aws_access_key_id=self.configuration.get('aws_access_key', None),
        aws_secret_access_key=self.configuration.get('aws_secret_key', None),
        schema_name=self.configuration.get('schema', 'default'),
        encryption_option=self.configuration.get('encryption_option', None),
        kms_key=self.configuration.get('kms_key', None),
        work_group=self.configuration.get('work_group', 'primary'),
        formatter=SimpleFormatter()).cursor()

    try:
        cursor.execute(query)
        column_tuples = [(i[0], _TYPE_MAPPINGS.get(i[1], None))
                         for i in cursor.description]
        columns = self.fetch_columns(column_tuples)
        rows = [dict(zip(([c['name'] for c in columns]), r))
                for i, r in enumerate(cursor.fetchall())]
        qbytes = None
        athena_query_id = None
        try:
            qbytes = cursor.data_scanned_in_bytes
        except AttributeError as e:
            logger.debug("Athena Upstream can't get data_scanned_in_bytes: %s", e)
        try:
            athena_query_id = cursor.query_id
        except AttributeError as e:
            logger.debug("Athena Upstream can't get query_id: %s", e)
        data = {
            'columns': columns,
            'rows': rows,
            'metadata': {
                'data_scanned': qbytes,
                'athena_query_id': athena_query_id
            }
        }
        json_data = json_dumps(data, ignore_nan=True)
        error = None
    except (KeyboardInterrupt, InterruptException):
        if cursor.query_id:
            cursor.cancel()
        error = "Query cancelled by user."
        json_data = None
    except Exception as ex:
        if cursor.query_id:
            cursor.cancel()
        error = str(ex)
        json_data = None

    return json_data, error