def test_as_pandas_boolean_na_values(self, cursor):
    cursor.execute("""
    SELECT * FROM boolean_na_values
    """)
    df = as_pandas(cursor)
    rows = [tuple([row["a"], row["b"]]) for _, row in df.iterrows()]
    self.assertEqual(rows, [(True, False), (False, None), (None, None)])

def run_query(self, sql: str, as_df: bool = False, use_cache: bool = False) -> AthenaQueryResult:
    """Run query on Athena.

    Args:
        sql: SQL query.
        as_df (optional): Whether to return the result as a DataFrame.
        use_cache (optional): Whether to reuse cached results of recently run
            queries (requires a workgroup to be configured).

    Returns:
        Query result as a DataFrame or as a list of dicts keyed by column name.
    """
    cache_size = 0
    if use_cache:
        if self.work_group is None:
            raise ValueError("Workgroup must be specified to use cache.")
        cache_size = 50
    self.cursor.execute(sql, cache_size=cache_size)
    if as_df:
        return as_pandas(self.cursor)
    res = []
    desc = self.cursor.description
    for row in self.cursor:
        r = {}
        for i, c in enumerate(desc):
            r[c[0]] = row[i]
        res.append(r)
    return res

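# A minimal sketch of the pattern the method above wraps: a PyAthena cursor
# executed with cache_size so Athena reuses results of recent queries in the
# same workgroup. Bucket, region, and workgroup names here are assumptions,
# and the as_pandas import path differs between PyAthena versions
# (pyathena.util in older releases, pyathena.pandas.util in newer ones).
from pyathena import connect
from pyathena.util import as_pandas

cursor = connect(
    s3_staging_dir="s3://example-athena-results/",  # assumed bucket
    region_name="us-east-1",                        # assumed region
    work_group="primary",                           # assumed workgroup
).cursor()
cursor.execute("SELECT 1 AS x", cache_size=50)
df = as_pandas(cursor)
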
def get_device_data(data={}, devices=[]):
    added = 0
    start = int(get_start().strftime('%s')) * 1000
    stop = int(get_stop().strftime('%s')) * 1000
    c = connect(s3_staging_dir='s3://parsyl-athena-output-production-useast1',
                region_name='us-east-1')
    for device_id in tqdm_notebook(devices, desc='device data loaded'):
        if device_id in data:
            continue
        stmt = """
        SELECT time, temperature, humidity
        FROM parsyl_device_data_database.parsyl_data_lake_production_useast1_v3
        WHERE device=%(device_id)s
        AND temperature IS NOT NULL
        AND time >= %(start)d
        AND time <= %(stop)d
        ORDER BY time
        """
        try:
            with c.cursor() as cursor:
                cursor.execute(stmt, {
                    'device_id': device_id,
                    'start': start,
                    'stop': stop
                })
                data[device_id] = as_pandas(cursor)
                added += 1
        except Exception as e:
            print('ERROR querying device data - {}'.format(e))
    c.close()
    return data, added

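# PyAthena binds the %(device_id)s / %(start)d placeholders above from the dict
# passed to execute() (pyformat paramstyle). A minimal binding sketch; the
# bucket, region, database, and table names below are assumptions.
from pyathena import connect
from pyathena.util import as_pandas

conn = connect(s3_staging_dir='s3://example-athena-results/', region_name='us-east-1')
with conn.cursor() as cursor:
    cursor.execute(
        """
        SELECT time, temperature
        FROM example_db.device_readings
        WHERE device = %(device_id)s AND time >= %(start)d
        """,
        {'device_id': 'abc-123', 'start': 1546300800000},
    )
    sample_df = as_pandas(cursor)
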
def read_athena(conn_func: Callable[[], AthenaConn],
                query: str,
                limit: Optional[int] = None) -> pd.DataFrame:
    """Read in clinical notes joined on icd codes."""
    # define query string
    full_query: str = query + (f"\nORDER BY RAND()\nLIMIT {limit};" if limit else ";")

    # retrieve notes from AWS Athena
    with conn_func() as conn:
        cursor = conn.cursor()
        df: pd.DataFrame = as_pandas(cursor.execute(full_query))
    return df.dropna()

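# Hypothetical usage of read_athena; make_conn, the query, and the limit are
# illustrative only, and the staging bucket and region are assumptions.
from pyathena import connect

def make_conn():
    return connect(
        s3_staging_dir='s3://example-athena-results/',  # assumed bucket
        region_name='us-east-1',                        # assumed region
    )

notes_df = read_athena(make_conn, "SELECT note_id, note_text FROM notes", limit=1000)
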
def run_query(database_name, table_name):
    Athena_output_bucket = 'YOUR_S3_BUCKET/path/to/'
    region_name = 'us-west-2'
    cursor = connect(s3_staging_dir='s3://' + Athena_output_bucket,
                     region_name=region_name).cursor()
    query = \
        """
        SELECT * FROM "{0}"."{1}"
        """.format(database_name, table_name)
    cursor.execute(query)
    df = as_pandas(cursor)
    print(df.describe())

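# Hypothetical call; the database and table names are placeholders, and the
# s3_staging_dir inside run_query must point at a bucket the caller can write to.
run_query('my_database', 'my_table')
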
def test_as_pandas_integer_na_values(self, cursor):
    cursor.execute("""
    SELECT * FROM integer_na_values
    """)
    df = as_pandas(cursor, coerce_float=True)
    rows = [tuple([row["a"], row["b"]]) for _, row in df.iterrows()]
    # TODO AssertionError: Lists differ:
    #   [(1.0, 2.0), (1.0, nan), (nan, nan)] != [(1.0, 2.0), (1.0, nan), (nan, nan)]
    # self.assertEqual(rows, [
    #     (1.0, 2.0),
    #     (1.0, np.nan),
    #     (np.nan, np.nan),
    # ])
    np.testing.assert_array_equal(rows, [(1, 2), (1, np.nan), (np.nan, np.nan)])

def test_as_pandas(self, cursor):
    cursor.execute("""
    SELECT
      col_boolean
      , col_tinyint
      , col_smallint
      , col_int
      , col_bigint
      , col_float
      , col_double
      , col_string
      , col_timestamp
      , CAST(col_timestamp AS time) AS col_time
      , col_date
      , col_binary
      , col_array
      , CAST(col_array AS json) AS col_array_json
      , col_map
      , CAST(col_map AS json) AS col_map_json
      , col_struct
      , col_decimal
    FROM one_row_complex
    """)
    df = as_pandas(cursor)
    rows = [
        tuple([
            row["col_boolean"],
            row["col_tinyint"],
            row["col_smallint"],
            row["col_int"],
            row["col_bigint"],
            row["col_float"],
            row["col_double"],
            row["col_string"],
            row["col_timestamp"],
            row["col_time"],
            row["col_date"],
            row["col_binary"],
            row["col_array"],
            row["col_array_json"],
            row["col_map"],
            row["col_map_json"],
            row["col_struct"],
            row["col_decimal"],
        ])
        for _, row in df.iterrows()
    ]
    expected = [(
        True,
        127,
        32767,
        2147483647,
        9223372036854775807,
        0.5,
        0.25,
        "a string",
        datetime(2017, 1, 1, 0, 0, 0),
        datetime(2017, 1, 1, 0, 0, 0).time(),
        date(2017, 1, 2),
        b"123",
        "[1, 2]",
        [1, 2],
        "{1=2, 3=4}",
        {"1": 2, "3": 4},
        "{a=1, b=2}",
        Decimal("0.1"),
    )]
    self.assertEqual(rows, expected)

def query_aws(query: str) -> pd.DataFrame:
    """Execute a query on the Athena database and return as pandas."""
    with get_conn() as conn:
        cursor = conn.cursor()
        df = as_pandas(cursor.execute(query))
    return df

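# Hypothetical usage; get_conn is assumed to return a PyAthena Connection that
# supports the context-manager protocol, and the query is illustrative. Passing
# cursor.execute(query) straight into as_pandas works because execute() returns
# the cursor itself.
df = query_aws("SELECT * FROM my_database.my_table LIMIT 100")
print(df.head())
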
            }
        },
    },
    'TableType': 'EXTERNAL_TABLE',
    'Parameters': {
        'classification': 'csv'
    }
})

# Making a query
cursor = connect(region_name=region, s3_staging_dir='s3://teste-nina/').cursor()
cursor.execute('SELECT * FROM "{0}"."{1}" LIMIT 10;'.format(database_name, table_name))
df_sql = as_pandas(cursor)

# Creating an aggregation dataset with the totals of confirmed cases, deaths,
# recovered cases, and cases that are still active
df_agg = df_sql.agg({'Confirmed': ['sum'], 'Deaths': ['sum'], 'Recovered': ['sum'], 'Active': ['sum']})

# Calculating the lethality of covid in Brazil
def calc_letality(row):
    return row['Deaths'] / row['Confirmed'] * 100

df_agg['Letality'] = df_agg.apply(calc_letality, axis=1)
df_agg.to_csv('covid-brasil-agg.csv', index=False, header=False)

# Creating a new bucket to upload the aggregation
bucket_name_agg = 'teste-nina-agg'
s3_client.create_bucket(Bucket=bucket_name_agg)