Ejemplo n.º 1
0
 def test_as_pandas_boolean_na_values(self, cursor):
     cursor.execute("""
         SELECT * FROM boolean_na_values
         """)
     df = as_pandas(cursor)
     rows = [tuple([row["a"], row["b"]]) for _, row in df.iterrows()]
     self.assertEqual(rows, [(True, False), (False, None), (None, None)])
Ejemplo n.º 2
0
    def run_query(self,
                  sql: str,
                  as_df: bool = False,
                  use_cache: bool = False) -> AthenaQueryResult:
        """Run query on Athena.

        Args:
            sql: SQL query.
            as_df (optional): Whether to return the result as DataFrame.

        Returns:
            Query result as DataFrame or List[Tuple].
        """
        cache_size = 0
        if use_cache:
            if self.work_group is None:
                raise ValueError("Workgroup must be specified to use cache.")
            cache_size = 50
        self.cursor.execute(sql, cache_size=cache_size)

        if as_df:
            return as_pandas(self.cursor)

        res = []
        desc = self.cursor.description
        for row in self.cursor:
            r = {}
            for i, c in enumerate(desc):
                r[c[0]] = row[i]
            res.append(r)
        return res
Ejemplo n.º 3
0
def get_device_data(data={}, devices=[]):
    added = 0
    start = int(get_start().strftime('%s')) * 1000
    stop = int(get_stop().strftime('%s')) * 1000

    c = connect(s3_staging_dir='s3://parsyl-athena-output-production-useast1',
                region_name='us-east-1')

    for device_id in tqdm_notebook(devices, desc='device data loaded'):
        if device_id in data:
            continue

        stmt = """
            SELECT time, temperature, humidity
            FROM parsyl_device_data_database.parsyl_data_lake_production_useast1_v3
            WHERE device=%(device_id)s AND temperature IS NOT NULL AND time >= %(start)d AND time <= %(stop)d
            ORDER BY time
        """
        try:
            with c.cursor() as cursor:
                cursor.execute(stmt, {
                    'device_id': device_id,
                    'start': start,
                    'stop': stop
                })
                data[device_id] = as_pandas(cursor)
                added += 1
        except Exception as e:
            print('ERROR querying device data - {}'.format(e))

    c.close()

    return data, added
Ejemplo n.º 4
0
def read_athena(conn_func: Callable[[], AthenaConn],
                query: str,
                limit: Optional[int] = None) -> pd.DataFrame:
    """Read in clinical notes joined on icd codes."""
    # define query string
    full_query: str = query + (f"\nORDER BY RAND()\nLIMIT {limit};"
                               if limit else ";")

    # retrieve notes from AWS Athena
    with conn_func() as conn:
        cursor = conn.cursor()
        df: pd.DataFrame = as_pandas(cursor.execute(full_query))
    return df.dropna()
Ejemplo n.º 5
0
def run_query(database_name,table_name):

    Athena_output_bucket='YOUR_S3_BUCKET/path/to/'
    region_name='us-west-2'

    cursor = connect(s3_staging_dir='s3://'+Athena_output_bucket,
                 region_name=region_name).cursor()
    query = \
        """
        SELECT * FROM "{0}"."{1}" 
        """.format(database_name,table_name)
    
    cursor.execute(query)
    df = as_pandas(cursor)
    print(df.describe())
Ejemplo n.º 6
0
 def test_as_pandas_integer_na_values(self, cursor):
     cursor.execute("""
         SELECT * FROM integer_na_values
         """)
     df = as_pandas(cursor, coerce_float=True)
     rows = [tuple([row["a"], row["b"]]) for _, row in df.iterrows()]
     # TODO AssertionError: Lists differ:
     #  [(1.0, 2.0), (1.0, nan), (nan, nan)] != [(1.0, 2.0), (1.0, nan), (nan, nan)]
     # self.assertEqual(rows, [
     #     (1.0, 2.0),
     #     (1.0, np.nan),
     #     (np.nan, np.nan),
     # ])
     np.testing.assert_array_equal(rows, [(1, 2), (1, np.nan),
                                          (np.nan, np.nan)])
Ejemplo n.º 7
0
 def test_as_pandas(self, cursor):
     cursor.execute("""
         SELECT
           col_boolean
           , col_tinyint
           , col_smallint
           , col_int
           , col_bigint
           , col_float
           , col_double
           , col_string
           , col_timestamp
           , CAST(col_timestamp AS time) AS col_time
           , col_date
           , col_binary
           , col_array
           , CAST(col_array AS json) AS col_array_json
           , col_map
           , CAST(col_map AS json) AS col_map_json
           , col_struct
           , col_decimal
         FROM one_row_complex
         """)
     df = as_pandas(cursor)
     rows = [
         tuple([
             row["col_boolean"],
             row["col_tinyint"],
             row["col_smallint"],
             row["col_int"],
             row["col_bigint"],
             row["col_float"],
             row["col_double"],
             row["col_string"],
             row["col_timestamp"],
             row["col_time"],
             row["col_date"],
             row["col_binary"],
             row["col_array"],
             row["col_array_json"],
             row["col_map"],
             row["col_map_json"],
             row["col_struct"],
             row["col_decimal"],
         ]) for _, row in df.iterrows()
     ]
     expected = [(
         True,
         127,
         32767,
         2147483647,
         9223372036854775807,
         0.5,
         0.25,
         "a string",
         datetime(2017, 1, 1, 0, 0, 0),
         datetime(2017, 1, 1, 0, 0, 0).time(),
         date(2017, 1, 2),
         b"123",
         "[1, 2]",
         [1, 2],
         "{1=2, 3=4}",
         {
             "1": 2,
             "3": 4
         },
         "{a=1, b=2}",
         Decimal("0.1"),
     )]
     self.assertEqual(rows, expected)
Ejemplo n.º 8
0
def query_aws(query: str) -> pd.DataFrame:
    """Execute a query on the Athena database and return as pandas."""
    with get_conn() as conn:
        cursor = conn.cursor()
        df = as_pandas(cursor.execute(query))
    return df
Ejemplo n.º 9
0
 def test_as_pandas(self, cursor):
     cursor.execute("""
     SELECT
       col_boolean
       ,col_tinyint
       ,col_smallint
       ,col_int
       ,col_bigint
       ,col_float
       ,col_double
       ,col_string
       ,col_timestamp
       ,CAST(col_timestamp AS time) AS col_time
       ,col_date
       ,col_binary
       ,col_array
       ,CAST(col_array AS json) AS col_array_json
       ,col_map
       ,CAST(col_map AS json) AS col_map_json
       ,col_struct
       ,col_decimal
     FROM one_row_complex
     """)
     df = as_pandas(cursor)
     rows = [
         tuple([
             row['col_boolean'],
             row['col_tinyint'],
             row['col_smallint'],
             row['col_int'],
             row['col_bigint'],
             row['col_float'],
             row['col_double'],
             row['col_string'],
             row['col_timestamp'],
             row['col_time'],
             row['col_date'],
             row['col_binary'],
             row['col_array'],
             row['col_array_json'],
             row['col_map'],
             row['col_map_json'],
             row['col_struct'],
             row['col_decimal'],
         ]) for _, row in df.iterrows()
     ]
     expected = [(
         True,
         127,
         32767,
         2147483647,
         9223372036854775807,
         0.5,
         0.25,
         'a string',
         datetime(2017, 1, 1, 0, 0, 0),
         datetime(2017, 1, 1, 0, 0, 0).time(),
         date(2017, 1, 2),
         b'123',
         '[1, 2]',
         [1, 2],
         '{1=2, 3=4}',
         {
             '1': 2,
             '3': 4
         },
         '{a=1, b=2}',
         Decimal('0.1'),
     )]
     self.assertEqual(rows, expected)
Ejemplo n.º 10
0
                }
            },
        },
        'TableType': 'EXTERNAL_TABLE',
        'Parameters': {
            'classification': 'csv'
        }
    }
)


#Making a query 
cursor = connect(region_name=region, s3_staging_dir='s3://teste-nina/').cursor()
cursor.execute('SELECT * FROM "nina-teste"."covid data" limit 10;'.format(database_name, table_name))

df_sql = as_pandas(cursor)

#Creating aggregation dataset fot the total Deaths, Total cases, Total of recovers and all that still active
df_agg = df.agg({'Confirmed': ['sum'], 'Deaths': ['sum'], 'Recovered': ['sum'], 'Active': ['sum']})

#Calculating the letality of covid in Brazil
def calc_letality(row):
    return row['Deaths'] / row['Confirmed'] * 100

df_agg['Letality'] = df_agg.apply(calc_letality, axis=1)

df_agg.to_csv('covid-brasil-agg.csv', index=False, header=False)

#Creating a new bucket to upload the aggregation
bucket_name_agg = 'teste-nina-agg'
s3_client.create_bucket(Bucket=bucket_name_agg)