def test_get_columns_dict(self):
    """Verify CaravelDataFrame column metadata for the multiformat table.

    The expected metadata is identical across backends except for the
    dtype of ``ds2``: sqlite returns it as a string-typed datetime
    column, other backends as a native ``datetime64[ns]`` column.
    The expected list is therefore built once, parameterized on that
    single difference, instead of duplicating the whole structure.
    """
    main_db = db.session.query(
        models.Database).filter_by(database_name='main').first()
    df = main_db.get_df("SELECT * FROM multiformat_time_series", None)
    cdf = dataframe.CaravelDataFrame(df)

    def expected_columns(ds2_type):
        # Single source of truth for the expected column metadata;
        # only the ds2 dtype varies between backends.
        return [
            {'is_date': True, 'type': 'datetime_string',
             'name': 'ds', 'is_dim': False},
            {'is_date': True, 'type': ds2_type,
             'name': 'ds2', 'is_dim': False},
            {'agg': 'sum', 'is_date': False, 'type': 'int64',
             'name': 'epoch_ms', 'is_dim': False},
            {'agg': 'sum', 'is_date': False, 'type': 'int64',
             'name': 'epoch_s', 'is_dim': False},
            {'is_date': True, 'type': 'datetime_string',
             'name': 'string0', 'is_dim': False},
            {'is_date': False, 'type': 'object',
             'name': 'string1', 'is_dim': True},
            {'is_date': True, 'type': 'datetime_string',
             'name': 'string2', 'is_dim': False},
            {'is_date': False, 'type': 'object',
             'name': 'string3', 'is_dim': True},
        ]

    if main_db.sqlalchemy_uri.startswith('sqlite'):
        self.assertEqual(
            expected_columns('datetime_string'), cdf.columns_dict)
    else:
        self.assertEqual(
            expected_columns('datetime64[ns]'), cdf.columns_dict)
def get_sql_results(self, query_id, return_results=True, store_results=False):
    """Executes the sql query returns the results.

    Loads the Query row identified by ``query_id``, optionally rewrites
    the SQL (CREATE TABLE AS, limit wrapping, Jinja templating), runs it
    against the query's database, and records status/progress/row-count
    back onto the Query row.

    :param query_id: primary key of the ``models.Query`` row to execute
    :param return_results: when True, return the JSON-serialized payload
        (data, columns, query dict); otherwise return None
    :param store_results: when True (and a results backend is configured),
        also store the compressed payload under a fresh UUID key
    """
    # Celery workers get their own engine/session; a direct call reuses
    # the Flask-scoped session instead.
    if not self.request.called_directly:
        engine = sqlalchemy.create_engine(
            app.config.get('SQLALCHEMY_DATABASE_URI'), poolclass=NullPool)
        session_class = sessionmaker()
        session_class.configure(bind=engine)
        session = session_class()
    else:
        session = db.session()
        session.commit()  # HACK
    query = session.query(models.Query).filter_by(id=query_id).one()
    database = query.database
    # Trailing semicolons would break the SQL-rewriting steps below.
    executed_sql = query.sql.strip().strip(';')
    db_engine_spec = database.db_engine_spec

    def handle_error(msg):
        """Local method handling error while processing the SQL"""
        # Marks the query FAILED, persists the error, then aborts the task.
        query.error_message = msg
        query.status = QueryStatus.FAILED
        query.tmp_table_name = None
        session.commit()
        raise Exception(query.error_message)

    # Limit enforced only for retrieving the data, not for the CTA queries.
    is_select = is_query_select(executed_sql)
    if not is_select and not database.allow_dml:
        handle_error(
            "Only `SELECT` statements are allowed against this database")
    if query.select_as_cta:
        if not is_select:
            handle_error(
                "Only `SELECT` statements can be used with the CREATE TABLE "
                "feature.")
        if not query.tmp_table_name:
            # Derive a unique temp table name from user id + start time.
            start_dttm = datetime.fromtimestamp(query.start_time)
            query.tmp_table_name = 'tmp_{}_table_{}'.format(
                query.user_id,
                start_dttm.strftime('%Y_%m_%d_%H_%M_%S'))
        executed_sql = create_table_as(
            executed_sql, query.tmp_table_name, database.force_ctas_schema)
        query.select_as_cta_used = True
    elif (query.limit and is_select and
            db_engine_spec.limit_method == LimitMethod.WRAP_SQL):
        # Engines without cursor-side limits get the SQL wrapped instead.
        executed_sql = database.wrap_sql_limit(executed_sql, query.limit)
        query.limit_used = True
    engine = database.get_sqla_engine(schema=query.schema)
    try:
        # Render Jinja templating (macros, parameters) before execution.
        template_processor = get_template_processor(
            database=database, query=query)
        executed_sql = template_processor.process_template(executed_sql)
    except Exception as e:
        logging.exception(e)
        msg = "Template rendering failed: " + utils.error_msg_from_exception(e)
        handle_error(msg)
    try:
        query.executed_sql = executed_sql
        logging.info("Running query: \n{}".format(executed_sql))
        result_proxy = engine.execute(query.executed_sql, schema=query.schema)
    except Exception as e:
        logging.exception(e)
        handle_error(utils.error_msg_from_exception(e))
    cursor = result_proxy.cursor
    query.status = QueryStatus.RUNNING
    session.flush()
    # Engine-specific progress tracking / cancellation hook (e.g. Presto).
    db_engine_spec.handle_cursor(cursor, query, session)
    cdf = None
    if result_proxy.cursor:
        column_names = [col[0] for col in result_proxy.cursor.description]
        if db_engine_spec.limit_method == LimitMethod.FETCH_MANY:
            # Limit applied cursor-side rather than in the SQL text.
            data = result_proxy.fetchmany(query.limit)
        else:
            data = result_proxy.fetchall()
        cdf = dataframe.CaravelDataFrame(
            pd.DataFrame(data, columns=column_names))
    query.rows = result_proxy.rowcount
    query.progress = 100
    query.status = QueryStatus.SUCCESS
    if query.rows == -1 and cdf:
        # Presto doesn't provide result_proxy.row_count
        query.rows = cdf.size
    if query.select_as_cta:
        # Callers re-query the CTA temp table through this SELECT.
        query.select_sql = '{}'.format(database.select_star(
            query.tmp_table_name, limit=query.limit))
    query.end_time = utils.now_as_float()
    session.flush()
    payload = {
        'query_id': query.id,
        'status': query.status,
        'data': [],
    }
    payload['data'] = cdf.data if cdf else []
    payload['columns'] = cdf.columns_dict if cdf else []
    payload['query'] = query.to_dict()
    payload = json.dumps(payload, default=utils.json_iso_dttm_ser)
    if store_results and results_backend:
        key = '{}'.format(uuid.uuid4())
        logging.info("Storing results in results backend, key: {}".format(key))
        # NOTE(review): zlib.compress on a str works on Python 2 only;
        # on Python 3 the payload would need encoding first — confirm
        # target runtime.
        results_backend.set(key, zlib.compress(payload))
        query.results_key = key
        session.flush()
    session.commit()
    if return_results:
        return payload
def get_sql_results(query_id, return_results=True):
    """Executes the sql query returns the results.

    Older, non-task variant: always uses the Flask-scoped session,
    always applies the limit via SQL wrapping, and polls Presto
    progress inline rather than through a db_engine_spec hook.

    :param query_id: primary key of the ``models.Query`` row to execute
    :param return_results: when True, return a payload dict with data and
        columns (or the error message on failure); otherwise return None
    """
    session = db.session()
    session.commit()  # HACK
    query = session.query(models.Query).filter_by(id=query_id).one()
    database = query.database
    # Trailing semicolons would break the SQL-rewriting steps below.
    executed_sql = query.sql.strip().strip(';')

    def handle_error(msg):
        """Local method handling error while processing the SQL"""
        # Marks the query FAILED, persists the error, then aborts.
        query.error_message = msg
        query.status = QueryStatus.FAILED
        query.tmp_table_name = None
        session.commit()
        raise Exception(query.error_message)

    # Limit enforced only for retrieving the data, not for the CTA queries.
    is_select = is_query_select(executed_sql)
    if not is_select and not database.allow_dml:
        handle_error(
            "Only `SELECT` statements are allowed against this database")
    if query.select_as_cta:
        if not is_select:
            handle_error(
                "Only `SELECT` statements can be used with the CREATE TABLE "
                "feature.")
        if not query.tmp_table_name:
            # Derive a unique temp table name from user id + start time.
            start_dttm = datetime.fromtimestamp(query.start_time)
            query.tmp_table_name = 'tmp_{}_table_{}'.format(
                query.user_id, start_dttm.strftime('%Y_%m_%d_%H_%M_%S'))
        executed_sql = create_table_as(
            executed_sql, query.tmp_table_name, database.force_ctas_schema)
        query.select_as_cta_used = True
    elif query.limit and is_select:
        executed_sql = database.wrap_sql_limit(executed_sql, query.limit)
        query.limit_used = True
    engine = database.get_sqla_engine(schema=query.schema)
    try:
        query.executed_sql = executed_sql
        logging.info("Running query: \n{}".format(executed_sql))
        result_proxy = engine.execute(query.executed_sql, schema=query.schema)
    except Exception as e:
        logging.exception(e)
        handle_error(utils.error_msg_from_exception(e))
    cursor = result_proxy.cursor
    query.status = QueryStatus.RUNNING
    session.flush()
    if database.backend == 'presto':
        polled = cursor.poll()
        # poll returns dict -- JSON status information or ``None``
        # if the query is done
        # https://github.com/dropbox/PyHive/blob/
        # b34bdbf51378b3979eaf5eca9e956f06ddc36ca0/pyhive/presto.py#L178
        while polled:
            # Update the object and wait for the kill signal.
            stats = polled.get('stats', {})
            if stats:
                completed_splits = float(stats.get('completedSplits'))
                total_splits = float(stats.get('totalSplits'))
                if total_splits and completed_splits:
                    # Progress is monotonic; never move it backwards.
                    progress = 100 * (completed_splits / total_splits)
                    if progress > query.progress:
                        query.progress = progress
                    session.commit()
            time.sleep(1)
            polled = cursor.poll()
    cdf = None
    if result_proxy.cursor:
        column_names = [col[0] for col in result_proxy.cursor.description]
        data = result_proxy.fetchall()
        cdf = dataframe.CaravelDataFrame(
            pd.DataFrame(data, columns=column_names))
        # TODO consider generating tuples instead of dicts to send
        # less data through the wire. The command bellow does that,
        # but we'd need to align on the client side.
        # data = df.values.tolist()
    query.rows = result_proxy.rowcount
    query.progress = 100
    query.status = QueryStatus.SUCCESS
    if query.rows == -1 and cdf:
        # Presto doesn't provide result_proxy.row_count
        query.rows = cdf.size
    if query.select_as_cta:
        # Callers re-query the CTA temp table through this SELECT.
        query.select_sql = '{}'.format(database.select_star(
            query.tmp_table_name, limit=query.limit))
    query.end_time = utils.now_as_float()
    session.commit()
    if return_results:
        payload = {
            'query_id': query.id,
            'status': query.status,
            'data': [],
        }
        if query.status == models.QueryStatus.SUCCESS:
            payload['data'] = cdf.data if cdf else []
            payload['columns'] = cdf.columns_dict if cdf else []
        else:
            payload['error'] = query.error_message
        return payload
'''