Example #1
def scan_athena(query, env):
    """Run `query` on the Athena connection configured for `env`; returns a DataFrame."""
    if env == Env.KCL_PREP.value:
        conn = connect(profile_name='health-customers',
                       s3_staging_dir=
                       's3://aws-athena-query-results-eu-west-1-036573440528/',
                       region_name='eu-west-1',
                       schema_name='kclprep')
        return pd.read_sql(query, conn)
    if env == Env.KCL_TEST.value:
        conn = connect(profile_name='health',
                       s3_staging_dir=
                       's3://aws-athena-query-results-458907533143-us-west-2/',
                       region_name='us-west-2',
                       schema_name='test')
        return pd.read_sql(query, conn)
    if env == Env.SHEBA_TEST.value:
        conn = connect(profile_name='health',
                       s3_staging_dir=
                       's3://aws-athena-query-results-458907533143-us-west-2/',
                       region_name='us-west-2',
                       schema_name='shebatest')
        return pd.read_sql(query, conn)
    if env == Env.SHEBA_PREP.value:
        conn = connect(profile_name='health-customers',
                       s3_staging_dir=
                       's3://aws-athena-query-results-eu-west-1-036573440528/',
                       region_name='eu-west-1',
                       schema_name='kclprep')
        return pd.read_sql(query, conn)
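
This snippet assumes an Env enum plus pandas and pyathenajdbc imports that are not shown. A minimal sketch of that surrounding setup, where the enum member names come from the code above and the string values are illustrative guesses:

# Minimal sketch of the assumed setup; the string values are illustrative.
from enum import Enum

import pandas as pd
from pyathenajdbc import connect


class Env(Enum):
    KCL_PREP = 'kcl_prep'
    KCL_TEST = 'kcl_test'
    SHEBA_TEST = 'sheba_test'
    SHEBA_PREP = 'sheba_prep'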
Example #2
def query_from_athena_using_jdbc():
    conn = connect(
        s3_staging_dir=
        's3://krux-attribute-metrics-staging/stream-processor/pixel_attributes_stream/',
        region_name='us-east-1',
        driver_path=
        '/home/vagrant/.virtualenvs/pycharm/lib/python2.7/site-packages/pyathenajdbc/AthenaJDBC41-1.1.0.jar'
    )

    try:
        with conn.cursor() as cursor:
            cursor.execute("""
                select
                  process_time
                  ,sum(views)
                from spectrum_db.pixel_attributes_stream
                where organization = 'c819846f-b6e4-4ef4-b357-28399862b473'
                  and process_time > '2017-10-30_18:55'
                group by process_time
                order by 1 desc
                limit 10
            """)
            print(cursor.description)
            print(cursor.fetchall())
    finally:
        conn.close()
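
Rather than printing raw tuples, the cursor can be read straight into a DataFrame with pyathenajdbc's as_pandas helper (the same helper Example #15 below uses); a minimal self-contained sketch with an illustrative bucket and query:

from pyathenajdbc import connect
from pyathenajdbc.util import as_pandas

conn = connect(s3_staging_dir='s3://my-staging/',  # illustrative bucket
               region_name='us-east-1')
try:
    with conn.cursor() as cursor:
        cursor.execute('SELECT 1 AS n')  # illustrative query
        df = as_pandas(cursor)  # full result set as a pandas DataFrame
finally:
    conn.close()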
Example #3
    def _execute_query(self, params, user, chart_query, db):

        query, bind_params = jinjasql.prepare_query(chart_query, {
            "params": params,
            "user": user
        })
        conn = connections[str(db)]
        if conn.settings_dict['NAME'] == 'Athena':
            conn = connect(driver_path=os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                'athena-jdbc/AthenaJDBC41-1.0.0.jar'))
        with conn.cursor() as cursor:
            cursor.execute(query, bind_params)
            rows = []
            cols = [desc[0] for desc in cursor.description]
            for db_row in cursor:
                row_list = []
                for col in db_row:
                    value = col
                    if isinstance(value, str):
                        # Encode strings as UTF-8 bytes so non-English
                        # characters survive serialization
                        value = value.encode('utf-8')
                    row_list.append(value)
                rows.append(row_list)
        return Table(columns=cols, data=rows)
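
The module-level jinjasql.prepare_query call above implies a preconfigured JinjaSql instance. A minimal sketch of how that is typically set up with the jinjasql library (the template, parameter names, and param_style are illustrative):

from jinjasql import JinjaSql

# Hypothetical setup; param_style should match what the DB driver expects.
jinjasql = JinjaSql(param_style='pyformat')

template = 'SELECT * FROM events WHERE user_id = {{ params.user_id }}'
query, bind_params = jinjasql.prepare_query(template, {'params': {'user_id': 42}})
# query holds the rewritten SQL; bind_params holds the values to bind.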
Example #4
    def run_query(self, query, user):
        conn = connect(
            s3_staging_dir=self.configuration['s3_staging_dir'],
            region_name=self.configuration['region'],
            access_key=self.configuration.get('aws_access_key', None),
            secret_key=self.configuration.get('aws_secret_key', None),
            schema_name=self.configuration.get('schema', 'default'),
            jvm_path=self.configuration.get('jvm_path', None),
            jvm_options=self.configuration.get('jvm_options').split(',')
            if self.configuration.get('jvm_options', None) else None)
        try:
            with conn.cursor() as cursor:
                cursor.execute(query)
                column_tuples = [(i[0], _ATHENA_TYPES_MAPPING.get(i[1], None))
                                 for i in cursor.description]
                columns = self.fetch_columns(column_tuples)
                rows = [
                    dict(zip(([c['name'] for c in columns]), r))
                    for i, r in enumerate(cursor.fetchall())
                ]
                data = {'columns': columns, 'rows': rows}
                json_data = json.dumps(data, cls=JSONEncoder)
                error = None
        except Exception as e:
            json_data = None
            error = str(e)
        finally:
            if conn:
                conn.close()

        return json_data, error
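
_ATHENA_TYPES_MAPPING is not shown above; it maps the type codes reported in cursor.description to the caller's own type constants. A plausible sketch assuming string type names (depending on the driver, the keys may instead be numeric JDBC type codes):

# Illustrative only; the real mapping lives alongside run_query in the
# source project and targets that project's own type constants.
_ATHENA_TYPES_MAPPING = {
    'integer': 'integer',
    'bigint': 'integer',
    'double': 'float',
    'boolean': 'boolean',
    'varchar': 'string',
    'timestamp': 'datetime',
}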
Example #5
    def refresh_partitions(self,
                           temp_athena_staging_dir=None,
                           database_name=None):
        """
        Refresh the partitions in a table, if they exist
        """

        if self.partitions:
            if not temp_athena_staging_dir:
                if self.database:
                    temp_athena_staging_dir = self.database.s3_athena_temp_folder
                else:
                    raise ValueError(
                        "You must provide a path to a directory in s3 for Athena to cache query results"
                    )

            conn = connect(s3_staging_dir=temp_athena_staging_dir,
                           region_name='eu-west-1')

            if not database_name:
                if self.database:
                    database_name = self.database.name
                else:
                    raise KeyError(
                        "You must provide a database name, or register a database object against the table"
                    )

            sql = "MSCK REPAIR TABLE {}.{}".format(database_name, self.name)

            try:
                with conn.cursor() as cursor:
                    cursor.execute(sql)
            finally:
                conn.close()
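
A hypothetical call, where table is an instance of the surrounding class with partitions, name, and an optional registered database:

# Hypothetical usage; the table object and staging bucket are illustrative.
table.refresh_partitions(
    temp_athena_staging_dir='s3://my-athena-staging/temp/',
    database_name='my_database')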
Example #6
    def execute_query(self, query, data=None):
        """Run a SELECT statement.

        Args:
            query: The SELECT statement to be executed
            data[Optional]: The data to be used for a parametrized query

        Returns:
            The result as a pandas DataFrame
        """
        with contextlib.closing(connect()) as conn:
            with conn.cursor() as cursor:
                cursor.execute(query, data)
                result_file = cursor.query_id + '.csv'

                self.__query_id = cursor.query_id

        # Query executed; now load the result CSV into a pandas DataFrame
        bucket_name = os.environ["AWS_ATHENA_S3_STAGING_DIR"]
        s3_client = boto3.client('s3')
        # Strip the 's3://' prefix to get the bare bucket name
        obj = s3_client.get_object(Bucket=bucket_name[5:], Key=result_file)
        df = pd.read_csv(io.BytesIO(obj['Body'].read()), encoding='utf8')
        return df
Example #7
    def _new_connection(self):
        config = self.config
        if self.mode == 'jdbc':
            conn_params = {
                'Workgroup': config.workgroup,
                'AwsRegion': config.region,
                'S3OutputLocation': config.output_location
            }
            if config.default_database is not None:
                conn_params['Schema'] = config.default_database

            conn = pyathenajdbc.connect(**conn_params)

        else:  # rest, csv
            assert self.mode in ['rest', 'csv']

            conn_params = {
                'work_group': config.workgroup,
                'region_name': config.region,
                'output_location': config.output_location
            }
            if config.default_database is not None:
                conn_params['schema_name'] = config.default_database
            if self.mode == 'csv':
                conn_params['cursor_class'] = PandasCursor
            conn = pyathena.connect(**conn_params)

        return conn
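
When the 'csv' mode above selects PyAthena's PandasCursor, results can be read directly into a DataFrame. A minimal sketch with an illustrative bucket and query (on older PyAthena releases the import path is pyathena.pandas_cursor instead):

from pyathena import connect
from pyathena.pandas.cursor import PandasCursor

cursor = connect(s3_staging_dir='s3://my-staging/',  # illustrative bucket
                 region_name='us-east-1',
                 cursor_class=PandasCursor).cursor()
df = cursor.execute('SELECT 1 AS n').as_pandas()  # result set as a DataFrame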
Example #8
    def get_conn(self):
        """Returns a connection object"""
        db = self.get_connection(self.athena_conn_id)
        return pyathenajdbc.connect(
            s3_staging_dir=db.extra_dejson.get('s3_staging_dir', None),
            region_name=db.host,
            access_key=db.login,
            secret_key=db.password,
            catalog=db.extra_dejson.get('catalog', 'hive'),
        )
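
This Airflow-style hook reads the region from the connection's host field, credentials from login/password, and the rest from the extra JSON; an illustrative connection layout:

# Illustrative Airflow connection fields consumed by get_conn above:
#   host     -> AWS region, e.g. 'eu-west-1'
#   login    -> access key
#   password -> secret key
#   extra    -> '{"s3_staging_dir": "s3://my-results/", "catalog": "hive"}'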
Example #9
def get(query):
    print("running query...")
    with contextlib.closing(
            connect(s3_staging_dir=os.getenv('ATHENA_BUCKET', 'UNSET'),
                    region_name=os.getenv('AWS_REGION', 'UNSET'))) as conn:
        with conn.cursor() as cursor:
            cursor.execute(query)
            df = as_pandas(cursor)

    return df.values.tolist()
Example #10
    def check_table(schema_name, table_name):
        """Check if a table exists in the given Athena schema."""
        query = "SHOW TABLES FROM {0}".format(schema_name)
        with contextlib.closing(connect()) as conn:
            with conn.cursor() as cursor:
                cursor.execute(query)
                result = cursor.fetchall()
        # SHOW TABLES yields one-column rows; compare against the first field
        return table_name in [row[0] for row in result]
Example #11
def execute_sql_athena(query_str):
    results = None
    conn = connect(
        s3_staging_dir='s3://aws-athena-query-results-827860021338-eu-west-1/',
        region_name='eu-west-1')
    try:
        with conn.cursor() as cursor:
            cursor.execute(query_str)
            results = cursor.fetchall()
    finally:
        conn.close()
    return results
Example #12
def make_partitions(db_name, table_name, temp_dir):
    """
    Temp dir: e.g. 's3://alpha-dag-data-warehouse-template/temp_delete/'
    """

    conn = connect(s3_staging_dir=temp_dir, region_name='eu-west-1')

    sql = """
    MSCK REPAIR TABLE {}.{};
    """.format(db_name, table_name)

    try:
        with conn.cursor() as cursor:
            cursor.execute(sql)
        conn.commit()
    finally:
        conn.close()
Example #13
    def __init__(self):
        self.s3 = boto3.resource("s3",
                                 aws_access_key_id=S3_KEY,
                                 aws_secret_access_key=S3_SECRET_ACCESS_KEY)

        self.s3_client = boto3.client(
            "s3",
            aws_access_key_id=S3_KEY,
            aws_secret_access_key=S3_SECRET_ACCESS_KEY)

        self.conn = connect(
            access_key=S3_KEY,
            secret_key=S3_SECRET_ACCESS_KEY,
            region_name='us-east-1',
            s3_staging_dir='s3://{bucket}/athena/aijus/query_stage/'.format(
                bucket=DB_BUCKET))
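
The constructor above relies on module-level S3_KEY, S3_SECRET_ACCESS_KEY, and DB_BUCKET constants that are not shown; a plausible sketch reading them from the environment:

import os

# Illustrative stand-ins for the snippet's module-level constants.
S3_KEY = os.environ['AWS_ACCESS_KEY_ID']
S3_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
DB_BUCKET = 'my-db-bucket'  # hypothetical bucket name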
Example #14
    def execute_non_query(self, query, data=None):
        """Run an INSERT/SELECT/DDL statement without fetching results.

        Args:
            query: The statement to be executed
            data[Optional]: The data to be used for a parametrized query
        """
        with contextlib.closing(connect()) as conn:
            with conn.cursor() as cursor:
                cursor.execute(query, data)
Example #15
def _select_pd_jdbc(sql: str,
                    aws_region: str,
                    database: Optional[str] = None,
                    params: Union[dict, list, None] = None,
                    workgroup: Optional[str] = None,
                    s3_output_location: Optional[str] = None) -> pd.DataFrame:
    sql_one_line = ' '.join(sql.split())
    klog.trace(f'running query on athena, method jdbc: {sql_one_line}')
    plain_sql, _ = kdb.convert_paramstyle(sql_one_line,
                                          params,
                                          in_style=paramstyle,
                                          out_style='plain')
    klog.trace(f'plain query: {plain_sql}')
    if klog.log_level > 0:
        klog.debug('running query on athena, method jdbc')

    _sql, _params = kdb.convert_paramstyle(sql_one_line,
                                           params,
                                           in_style=paramstyle,
                                           out_style='pyformat')

    connection_params = {
        'Workgroup': workgroup,
        'AwsRegion': aws_region,
        'S3OutputLocation': s3_output_location
    }
    if database is not None:
        connection_params['Schema'] = database

    with contextlib.closing(pyathenajdbc.connect(**connection_params)) as conn:
        with contextlib.closing(conn.cursor()) as cursor:
            results = cursor.execute(_sql, _params)
            klog.trace('query executed')

            if klog.log_level > 0:
                klog.debug('query execution completed.')
            df = pyathenajdbc.util.as_pandas(results)
            klog.trace('query results read into dataframe with {} rows.',
                       len(df))
            return df
Example #16
def query_from_athena_using_jdbc():
    conn = connect(s3_staging_dir='s3://krux-attribute-metrics-staging/stream-processor/pixel_attributes_stream/',
                   region_name='us-east-1',
                   driver_path='/home/vagrant/.virtualenvs/pycharm/lib/python2.7/site-packages/pyathenajdbc/AthenaJDBC41-1.1.0.jar')

    try:
        with conn.cursor() as cursor:
            cursor.execute("""
                select
                  process_time
                  ,sum(views)
                from spectrum_db.pixel_attributes_stream
                where organization = 'c819846f-b6e4-4ef4-b357-28399862b473'
                  and process_time > '2017-10-30_18:55'
                group by process_time
                order by 1 desc
                limit 10
            """)
            print(cursor.description)
            print(cursor.fetchall())
    finally:
        conn.close()
Example #17

from pyathenajdbc import connect

import pandas as pd


def processCategories(line):  # split an underscore-delimited category string into a list
    line = list(map(str, line.split("_")))
    line = list(filter(None, line))
    return line


if __name__ == "__main__":
    conn = connect(
        access_key=ACCESS_KEY,
        secret_key=AWS_SECRET_ACCESS_KEY,
        s3_staging_dir='s3://athena-bucket-reviews',
        region_name='us-east-2',
    )

    df_false = pd.read_sql(
        """
		SELECT product_id as asin, product_category, count(review_headline) as count_reviews, avg(star_rating) as rating 
		FROM amazon_reviews_parquet 
		GROUP BY product_id, product_category 
		ORDER BY RANDOM() LIMIT 5000;""", conn)
    df_false['product_category'] = df_false['product_category'].apply(
        processCategories)

    data_true = "/reviews/products.csv"
    data_true = pd.read_csv(data_true, engine='python')
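
For reference, processCategories splits on underscores and drops empty fragments:

# Worked example (illustrative input):
#   processCategories('Books_Fiction_')  ->  ['Books', 'Fiction']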
Example #18
    def connect(self, work_group=None):
        from pyathenajdbc import connect

        return connect(Schema=SCHEMA, Workgroup=work_group)
Example #19
    def connect(self):
        return connect()
Example #20
def teardown_module(module):
    with contextlib.closing(connect()) as conn:
        with conn.cursor() as cursor:
            _drop_database(cursor)
Example #21
def setup_module(module):
    with contextlib.closing(connect()) as conn:
        with conn.cursor() as cursor:
            _create_database(cursor)
            _create_table(cursor)
Example #22
def _teardown_session():
    with contextlib.closing(connect()) as conn:
        with conn.cursor() as cursor:
            _drop_database(cursor)
Example #23
def _setup_session(request):
    request.addfinalizer(_teardown_session)
    with contextlib.closing(connect()) as conn:
        with conn.cursor() as cursor:
            _create_database(cursor)
            _create_table(cursor)
Example #24
# Using JDBC: https://github.com/laughingman7743/PyAthenaJDBC/
from pyathenajdbc import connect

conn = connect(S3OutputLocation='s3://YOUR_S3_BUCKET/path/1/',
               AwsRegion='us-west-2',
               Profile='YOUR_PROFILE_NAME')
try:
    with conn.cursor() as cursor:
        cursor.execute("""
        SELECT col_string FROM one_row_complex
        WHERE col_string = %(param)s
        """, {'param': 'a string'})
        print(cursor.fetchall())
finally:
    conn.close()


conn = connect(S3OutputLocation='s3://YOUR_S3_BUCKET/path/2/',
               AwsRegion='us-west-2')
try:
    with conn.cursor() as cursor:
        cursor.execute("""
        SELECT * FROM many_rows LIMIT 10
        """)
        for row in cursor:
            print(row)
finally:
    conn.close()
Example #25
    def execute_other_thread():
        with contextlib.closing(connect(schema_name=SCHEMA)) as conn:
            with conn.cursor() as cursor:
                cursor.execute('SELECT * FROM one_row')
                return cursor.fetchall()
Example #26
def connection_athena(access_key, secret_key, s3_staging_dir, region_name):
    conn = connect(access_key=access_key,
                   secret_key=secret_key,
                   s3_staging_dir=s3_staging_dir,
                   region_name=region_name)
    return conn
Example #27
    def connect(self):
        return connect(schema_name=SCHEMA)
Example #28
    def connect(self):
        self.conn = pyathenajdbc.connect(
            s3_staging_dir="s3://athena-internship",
            region_name="us-east-1",
            log_path=log_path,
        )