def _read_table(*, connection: tab_api.Connection, table: TableType) -> pd.DataFrame:
    """Read an entire Hyper table into a pandas DataFrame.

    Parameters
    ----------
    connection : tab_api.Connection
        Open connection to the Hyper database.
    table : TableType
        Table to read; a plain string is promoted to a ``TableName``.

    Returns
    -------
    pd.DataFrame
        The table contents with dtypes mapped from the Hyper schema.
    """
    if isinstance(table, str):
        table = tab_api.TableName(table)

    table_def = connection.catalog.get_table_definition(table)
    # Map each column name to the pandas dtype its Hyper type translates to.
    dtypes: Dict[str, str] = {
        col.name.unescaped: _tableau_to_pandas_type(
            pantab_types._ColumnType(col.type, col.nullability)
        )
        for col in table_def.columns
    }

    with connection.execute_query(f"SELECT * from {table}") as result:
        df = pd.DataFrame(result)
        df.columns = dtypes.keys()

    # The tableauhyperapi.Timestamp class is not implicitly convertible to a
    # datetime, so convert the affected columns cell-by-cell before the
    # astype() below.
    for name, dtype in dtypes.items():
        if dtype == "datetime64[ns]":
            df[name] = df[name].apply(lambda x: x._to_datetime())
        elif dtype == "datetime64[ns, UTC]":
            df[name] = df[name].apply(lambda x: x._to_datetime()).dt.tz_localize(
                "UTC")
        elif dtype == "timedelta64[ns]":
            df[name] = df[name].apply(_interval_to_timedelta)

    df = df.astype(dtypes)
    return df.fillna(value=np.nan)  # Replace any appearances of None
class Hyper(System):
    """Thin wrapper around a Tableau Hyper process and a single connection."""

    def __init__(self, filename):
        """Start a Hyper process and create/open the database `filename`."""
        self.db = HyperProcess(Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU)
        self.conn = Connection(self.db.endpoint, filename, CreateMode.CREATE)

    def create(self, ddl):
        """Execute a DDL statement; returns the affected-row count in a list."""
        count = self.conn.execute_command(ddl)
        return [count]

    def load(self, filename):
        """Bulk-load a CSV file (with header row) into the `logs` table."""
        # Double embedded single quotes so a path containing ' cannot break
        # out of the SQL string literal (the previous naive concatenation
        # produced invalid/injectable SQL for such paths).
        escaped = filename.replace("'", "''")
        count = self.conn.execute_command(
            "COPY logs FROM '" + escaped + "' WITH (FORMAT csv, HEADER)")
        return [count]

    def query(self, sql):
        """Run a query and return the open result object; caller must close it."""
        #schema = result.schema()
        return self.conn.execute_query(sql)
def test_to_dss_date(self):
    """Read a sample date from the bundled Hyper file and convert it to a DSS datetime.

    Also checks that the table's columns can be converted to DSS columns.
    """
    schema_converter = SchemaConversion()
    path_to_hyper = "data/superstore_sample.hyper"
    # Context managers guarantee the Hyper process, connection and result are
    # torn down even when a conversion below raises (the previous version
    # leaked all of them on failure).
    with HyperProcess(Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper:
        with Connection(hyper.endpoint, path_to_hyper) as connection:
            hyper_table = TableName('public', 'Orders')
            hyper_table_def = connection.catalog.get_table_definition(hyper_table)
            row = None
            with connection.execute_query(f'SELECT * FROM {hyper_table}') as result:
                for row in result:
                    pass  # keep only the last row as the sample
            # Guard: `row` was previously unbound when the table was empty.
            assert row is not None, "expected at least one row in public.Orders"
            sample_date = row[2].to_date()
            dss_date = datetime.datetime(sample_date.year, sample_date.month,
                                         sample_date.day)
    dss_columns = schema_converter.hyper_columns_to_dss_columns(
        hyper_table_def.columns)
    return True
def _read_table(*, connection: tab_api.Connection, table: TableType) -> pd.DataFrame:
    """Read the named Hyper table into a pandas DataFrame.

    Parameters
    ----------
    connection : tab_api.Connection
        Open connection to the Hyper database.
    table : TableType
        Table to read; a plain string is promoted to a ``TableName``.

    Raises
    ------
    TypeError
        When a column uses a Hyper type/nullability pair with no pandas
        equivalent.
    """
    if isinstance(table, str):
        table = tab_api.TableName(table)

    table_def = connection.catalog.get_table_definition(table)

    # Resolve the pandas dtype for every column up front so unsupported
    # columns fail before any data is fetched.
    dtypes: Dict[str, str] = {}
    for col in table_def.columns:
        col_type = pantab_types._ColumnType(col.type, col.nullability)
        try:
            dtypes[col.name.unescaped] = pantab_types._pandas_types[col_type]
        except KeyError as e:
            raise TypeError(
                f"Column {col.name} has unsupported datatype {col.type} "
                f"with nullability {col.nullability}") from e

    query = f"SELECT * from {table}"
    with connection.execute_query(query) as result:
        return _read_query_result(result, dtypes)
class TableauTableReader(object):
    """Read one table from a Tableau Hyper file that arrives as a byte stream.

    The stream is spooled to a temporary .hyper file, opened with a local
    Hyper process, and read back in batches of `self.limit` rows.
    """

    def __init__(self, schema_name, table_name):
        """
        Wrapper for the Tableau Hyper formatter

        :param schema_name : name of the schema as stored in the Tableau Hyper file
        :param table_name : name of the table as stored in the Tableau Hyper file
        """
        self.table_name = table_name
        self.schema_name = schema_name

        self.hyper_table = None
        self.hyper_columns = None
        self.hyper_storage_types = None
        self.dss_columns = None
        self.dss_storage_types = None

        # Buffer of not-yet-delivered rows for the current batch; filled by
        # fetch_rows (stored last-first, see there) and drained by read_row.
        self.rows = []
        self.row_index = 0

        self.path_to_hyper = None
        self.hyper = None
        self.connection = None

        self.schema_converter = SchemaConversion()

        # Handle batch querying
        self.offset = 0
        self.limit = 10000
        self.end_read = False

    def create_tmp_hyper_file(self):
        """
        Create a temporary file to store the streaming buffer

        :return: self.path_to_hyper: path to the temporary file
        """
        cache_dir = get_cache_location_from_user_config()
        # Set the delete parameter to False imperatively to avoid early deletion
        self.path_to_hyper = tempfile.NamedTemporaryFile(
            suffix=".hyper", prefix="tmp_hyper_file_", delete=False,
            dir=cache_dir).name
        logger.info(
            "Creating temporary file to store future buffer stream from Hyper: {} "
            .format(self.path_to_hyper))

    def read_buffer(self, stream):
        """
        Read and store the full stream

        :param stream: stream coming from the Tableau Hyper file
        :return:
        """
        line = True
        with open(self.path_to_hyper, "ab") as f:
            while line:
                # 1 KiB chunks; the loop ends on the empty read at EOF.
                line = stream.read(1024)
                f.write(line)
        logger.info("Stored the full stream as bytes")

    def open_connection(self):
        """
        Open the connection to the Tableau Hyper file and the database
        """
        self.hyper = HyperProcess(Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU)
        self.connection = Connection(self.hyper.endpoint, self.path_to_hyper)
        logger.info("Opened the connection to Tableau Hyper file")

    def read_hyper_columns(self):
        """
        Read from the Tableau Hyper file the columns and schema of the table

        :return: self.hyper_storage_types
        """
        logger.info("Trying to read Tableau Hyper table {}.{} ...".format(
            self.schema_name, self.table_name))
        hyper_table = TableName(self.schema_name, self.table_name)
        self.hyper_table = hyper_table
        try:
            table_def = self.connection.catalog.get_table_definition(
                hyper_table)
        except HyperException as e:
            # Previously the warning swapped table and schema; log schema.table
            # consistently with the exception below.
            logger.warning(
                "The target table does not exist in this hyper file. Requested table: {}.{}"
                .format(self.schema_name, self.table_name))
            # Chain the HyperException so the root cause stays visible.
            raise Exception("Table does not exist: {}.{}".format(
                self.schema_name, self.table_name)) from e

        self.hyper_columns = table_def.columns
        self.hyper_storage_types = [
            column.type.tag for column in self.hyper_columns
        ]
        self.dss_columns = self.schema_converter.hyper_columns_to_dss_columns(
            self.hyper_columns)
        self.dss_storage_types = [
            column['type'] for column in self.dss_columns
        ]

        self.schema_converter.set_dss_storage_types(self.dss_storage_types)
        self.schema_converter.set_hyper_storage_types(self.hyper_storage_types)

    def fetch_rows(self, offset, limit):
        """
        Retrieve the next batch of rows from the Tableau Hyper file
        """
        sql_hyper_query = f'SELECT {build_query(self.hyper_columns)} FROM {self.hyper_table} OFFSET {offset} LIMIT {limit}'
        logger.warning("SQL query: {} ".format(sql_hyper_query))
        try:
            result = self.connection.execute_query(sql_hyper_query)
        except Exception as err:
            logger.fatal("Tried to execute query but was unsuccessful.")
            raise err
        for row in result:
            self.rows.append(row)
        # BUGFIX: read_row drains the buffer with pop() from the END, which
        # used to hand every batch back in reversed order. Store the batch
        # last-first so pop() is O(1) AND preserves query order.
        self.rows.reverse()

    def close_connection(self):
        """
        Close the connection to the Tableau Hyper file
        """
        self.connection.close()
        self.hyper.close()
        if os.path.exists(self.path_to_hyper):
            os.remove(self.path_to_hyper)

    def read_schema(self):
        """
        Access schema
        """
        logger.info("Send to dss during read_schema: {}".format(
            self.dss_columns))
        return self.dss_columns

    def read_row(self):
        """
        Read one row from the stored data

        :return: a {column_name: value} dict, or None when exhausted.
        """
        if self.end_read:
            return None
        if len(self.rows) == 0:
            self.fetch_rows(self.offset, self.limit)
            self.offset += self.limit
        if len(self.rows) == 0:
            # Empty batch means the table is exhausted; release resources.
            self.close_connection()
            self.end_read = True
            logger.info("Finished reading rows from hyper file...")
            return None
        else:
            # fetch_rows stored the batch reversed, so pop() yields rows in
            # query order.
            hyper_row = self.rows.pop()
            dss_row = self.schema_converter.prepare_row_to_dss(hyper_row)
            row = {}
            for column, value in zip(self.dss_columns, dss_row):
                row[column["name"]] = value
            self.row_index += 1
            return row
class HyperKernel(Kernel):
    """Jupyter kernel that executes SQL against an embedded Tableau Hyper database.

    Regular cells are sent to Hyper as SQL; cells starting with a backslash
    are client-side commands (\\i, \\o, \\attach, \\detach, \\?).
    """

    implementation = 'Hyper'
    implementation_version = '0.0'
    language = 'sql'
    language_version = '0.0'
    language_info = {
        'name': 'sql',
        'mimetype': 'text/sql',
        'file_extension': '.sql',
    }
    banner = "Hyper 🚀 - Your friendly neighborhood SQL database.\n" +\
        "Type '\\?' for help."

    def __init__(self, *args, **kwargs):
        super(HyperKernel, self).__init__(*args, **kwargs)
        # One Hyper process and one connection for the kernel's lifetime.
        self._hyper_process = HyperProcess(Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU, 'jupyter_sql_kernel')
        self._connection = Connection(self._hyper_process.endpoint)
        self._output_func = self._display_output

    def do_shutdown(self, restart):
        self._connection.close()
        self._hyper_process.close()
        return {'status': 'ok', 'restart': restart}

    def _success_response(self, payloads=None):
        """Build an 'ok' execute_reply; `payloads` defaults to no payloads."""
        # BUGFIX: `payloads=[]` was a mutable default argument shared across
        # calls; use the None sentinel instead.
        return {
            'status': 'ok',
            # The base class increments the execution count for us already
            'execution_count': self.execution_count,
            'payload': payloads if payloads is not None else [],
            'user_expressions': {},
        }

    def _error_response(self, ename, evalue, traceback):
        # Format & send the error message
        error_response = {
            'ename': ename,
            'evalue': evalue,
            'traceback': traceback
        }
        self.send_response(self.iopub_socket, 'error', error_response)
        error_response['status'] = 'error'
        error_response['execution_count'] = self.execution_count
        return error_response

    def _send_text(self, txt):
        self.send_response(self.iopub_socket, 'display_data', {'data': {'text/plain': txt}, 'metadata': {}})

    def _format_hyper_error(self, e):
        formatted = f"Error:\n{e.main_message}"
        if e.hint:
            # Newline so the hint is not glued onto the main message.
            formatted += f"\nHINT: {e.hint}"
        return formatted

    def _display_output(self, sql_result, silent):
        if not silent:
            column_names = [c.name for c in sql_result.schema.columns]
            result = list(sql_result)
            if column_names or result:
                response_data = {
                    'text/plain': tabulate(result, headers=column_names),
                    'text/html': tabulate(result, headers=column_names, tablefmt='html'),
                }
                # Integration with the "@tableau/query-graphs-jupyterlab-extension" extension for plan rendering in JupyterLab
                if column_names == ["plan"]:
                    try:
                        response_data['application/vnd.tableau.hyper-queryplan'] = json.loads("".join(row[0] for row in result))
                    except json.JSONDecodeError:
                        pass
                # Support for "Vega output" form Hyper.
                # In case the user is skilled enough to write a SQL query which outputs a Vega visualizations, go ahead and display the visualization in JupyterLab.
                if len(column_names) == 1 and len(result) == 1 and isinstance(result[0][0], str):
                    try:
                        parsed = json.loads(result[0][0])
                        if isinstance(parsed, dict):
                            if parsed.get("$schema", "").startswith('https://vega.github.io/schema/vega/'):
                                response_data['application/vnd.vega.v5+json'] = parsed
                                del response_data['text/html']
                            if parsed.get("$schema", "").startswith('https://vega.github.io/schema/vega-lite/'):
                                response_data['application/vnd.vegalite.v3+json'] = parsed
                                del response_data['text/html']
                    except json.JSONDecodeError:
                        pass
                self.send_response(self.iopub_socket, 'display_data', {'source': 'sql', 'data': response_data, 'metadata': {}})

    def _create_file_output_func(self, filename):
        def _file_output(self, sql_result, silent):
            with open(filename, "a") as f:
                column_names = [c.name for c in sql_result.schema.columns]
                result = list(sql_result)
                f.write(tabulate(result, headers=column_names))
                f.write("\n")
        # Bind as a method so the signature matches _display_output.
        return _file_output.__get__(self, HyperKernel)

    def _discard_output(self, sql_result, silent):
        if sql_result is not None and sql_result.schema is not None:
            # We still want to fetch the whole result (to not screw up timing measurements)
            for i in sql_result:
                pass

    def execute_sql(self, code, silent):
        "Execute a SQL query and display the results to the user"
        start_time = time.perf_counter()
        try:
            with self._connection.execute_query(code) as sql_result:
                self._output_func(sql_result, silent)
        except HyperException as e:
            # Format & send the error message
            return self._error_response(str("HyperException"), str(e.args[0]), [self._format_hyper_error(e)])
        end_time = time.perf_counter()
        elapsed = end_time - start_time
        self._send_text('{:.3f}s elapsed'.format(elapsed))
        return self._success_response()

    def _command_input_sql(self, args):
        """ Read SQL query from a file and execute it """
        if len(args) != 1:
            return self._error_response("InvalidClientCommandArguments", repr(args), ["Unexpected number of arguments"])
        filename = args[0]
        try:
            with open(filename) as f:
                file_content = f.read()
        except OSError:
            # Catch only I/O failures (the old bare `except:` swallowed
            # everything) and name the actual file in the message.
            return self._error_response("IOError", repr(args), [f"Unable to read file '{filename}'"])
        self.execute_sql(file_content, silent=False)

    def _command_redirect_output(self, args):
        """ Redirect output into a file """
        if len(args) > 1:
            return self._error_response("InvalidClientCommandArguments", repr(args), ["Unexpected number of arguments"])
        if len(args) == 0:
            self._output_func = self._display_output
        elif args[0] == "-":
            self._output_func = self._discard_output
        else:
            filename = args[0]
            # Truncate the file & create if it does not exist
            try:
                with open(filename, "w"):
                    pass
            except OSError:
                # This is the write path: report an accurate message with the
                # actual filename (was a dead "Unable to read file '(unknown)'").
                return self._error_response("IOError", repr(args), [f"Unable to write file '{filename}'"])
            self._output_func = self._create_file_output_func(filename)

    def _command_attach(self, args):
        """ Open a Hyper file """
        if len(args) != 2:
            return self._error_response("InvalidClientCommandArguments", repr(args), ["Unexpected number of arguments"])
        database_path = args[0]
        alias = args[1]
        try:
            self._connection.catalog.attach_database(database_path, alias)
        except HyperException as e:
            # Format & send the error message
            return self._error_response(str("HyperException"), str(e.args[0]), [self._format_hyper_error(e)])

    def _command_detach(self, args):
        """ Close a Hyper file """
        if len(args) != 1:
            return self._error_response("InvalidClientCommandArguments", repr(args), ["Unexpected number of arguments"])
        alias = args[0]
        try:
            self._connection.catalog.detach_database(alias)
        except HyperException as e:
            # Format & send the error message
            return self._error_response(str("HyperException"), str(e.args[0]), [self._format_hyper_error(e)])

    def _process_client_command(self, code, silent):
        "Execute a client command"
        commands = {
            "i": self._command_input_sql,
            "o": self._command_redirect_output,
            "attach": self._command_attach,
            "detach": self._command_detach,
        }
        # Tokenize command line
        code = code.lstrip()
        assert code[0] == '\\'
        code = code[1:]
        args = list(shlex.split(code, posix=True))
        cmd = args.pop(0)
        if cmd == "?" or cmd == "help":
            help_text = 'SQL command reference: https://help.tableau.com/current/api/hyper_api/en-us/reference/sql/sql-commands.html\n'
            help_text += 'Additional client-side commands:\n'
            help_text += tabulate((["\\" + c[0], c[1].__doc__] for c in commands.items()), tablefmt='plain')
            help_text += '\n'
            help_text += 'Parameters are parsed in POSIX shell manner.\n'
            self._send_text(help_text)
            return self._success_response()
        if cmd not in commands:
            # `\\{cmd}` escapes the backslash explicitly; the old `\{cmd}`
            # relied on an invalid escape sequence (SyntaxWarning on modern
            # Python). Output is unchanged: a literal backslash + command.
            return self._error_response("UnknownClientCommand", cmd, [f"Unknown client command \\{cmd}"])
        response = commands[cmd](args)
        return response if response is not None else self._success_response()

    def do_execute(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False):
        # startswith() instead of [0]: an empty/whitespace-only cell used to
        # raise IndexError here.
        if code.lstrip().startswith('\\'):
            return self._process_client_command(code, silent)
        else:
            return self.execute_sql(code, silent)