def __init__(self):
    self.location = "./output/meli_challenge_result.hyper"
    self.test_location = "../output/meli_challenge_result.hyper"
    self.searchResult_table = TableDefinition('results', [
        TableDefinition.Column('id', SqlType.text(), Nullability.NOT_NULLABLE),
        TableDefinition.Column('site_id', SqlType.text(), Nullability.NOT_NULLABLE),
        TableDefinition.Column('title', SqlType.text(), Nullability.NOT_NULLABLE),
        TableDefinition.Column('seller', SqlType.text(), Nullability.NOT_NULLABLE),
        TableDefinition.Column('price', SqlType.text(), Nullability.NOT_NULLABLE),
        TableDefinition.Column('prices', SqlType.json(), Nullability.NOT_NULLABLE),
        TableDefinition.Column('sale_price', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('currency_id', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('available_quantity', SqlType.int(), Nullability.NULLABLE),
        TableDefinition.Column('sold_quantity', SqlType.int(), Nullability.NULLABLE),
        TableDefinition.Column('buying_mode', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('listing_type_id', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('stop_time', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('condition', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('permalink', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('thumbnail', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('accepts_mercadopago', SqlType.bool(), Nullability.NULLABLE),
        TableDefinition.Column('installments', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('address', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('shipping', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('seller_address', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('attributes', SqlType.text(), Nullability.NOT_NULLABLE),
        TableDefinition.Column('original_price', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('category_id', SqlType.text(), Nullability.NOT_NULLABLE),
        TableDefinition.Column('official_store_id', SqlType.int(), Nullability.NULLABLE),
        TableDefinition.Column('domain_id', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('catalog_product_id', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('tags', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('catalog_listing', SqlType.bool(), Nullability.NULLABLE),
        TableDefinition.Column('order_backend', SqlType.int(), Nullability.NULLABLE),
    ])
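# A minimal usage sketch, assuming an instance `result` of the class above
# (the owning class name is not shown in the original). It creates the empty
# .hyper file from the table definition; `create_empty_extract` is a
# hypothetical helper name.
from tableauhyperapi import HyperProcess, Telemetry, Connection, CreateMode

def create_empty_extract(result):
    with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper:
        with Connection(hyper.endpoint, result.location, CreateMode.CREATE_AND_REPLACE) as connection:
            # The table lands in the default 'public' schema under the name 'results'
            connection.catalog.create_table(result.searchResult_table)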
def convert_datatype(coldatatype):
    """
    Converts the datatype of a dataframe column to the equivalent
    Tableau Hyper extract datatype.

    Args:
        coldatatype: Datatype of the column, as a string.

    Returns:
        A tuple of the Hyper-compatible SqlType and a default value
        to substitute in NaN cases.
    """
    datatype = SqlType.text()
    def_value = ''

    if 'datetime' in coldatatype.lower():
        datatype = SqlType.timestamp()
    elif 'str' in coldatatype.lower():
        datatype = SqlType.text()
    elif 'boolean' in coldatatype.lower():
        datatype = SqlType.bool()
    elif 'int' in coldatatype.lower():
        datatype = SqlType.int()
        def_value = 0
    elif 'float' in coldatatype.lower():
        datatype = SqlType.double()
        def_value = 0
    elif 'period' in coldatatype.lower():
        datatype = SqlType.interval()
    elif 'object' in coldatatype.lower():
        datatype = SqlType.text()
    else:
        datatype = SqlType.text()

    return (datatype, def_value)
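# A short sketch (assumed usage, sample data made up) of how convert_datatype
# can drive a Hyper table definition built from a pandas DataFrame's dtypes:
import pandas as pd
from tableauhyperapi import TableDefinition, TableName

df = pd.DataFrame({'name': ['a', 'b'], 'qty': [1, 2], 'price': [1.5, 2.5]})
columns = []
for col in df.columns:
    sql_type, default = convert_datatype(str(df[col].dtype))
    df[col] = df[col].fillna(default)  # apply the NaN default returned above
    columns.append(TableDefinition.Column(col, sql_type))
table_def = TableDefinition(TableName('Extract', 'Extract'), columns)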
def convert_struct_field(column: StructField) -> TableDefinition.Column:
    """Converts a Spark StructField to a Tableau Hyper TableDefinition.Column."""
    if column.dataType == IntegerType():
        sql_type = SqlType.int()
    elif column.dataType == LongType():
        sql_type = SqlType.big_int()
    elif column.dataType == ShortType():
        sql_type = SqlType.small_int()
    elif column.dataType == DoubleType():
        sql_type = SqlType.double()
    elif column.dataType == FloatType():
        sql_type = SqlType.double()
    elif column.dataType == BooleanType():
        sql_type = SqlType.bool()
    elif column.dataType == DateType():
        sql_type = SqlType.date()
    elif column.dataType == TimestampType():
        sql_type = SqlType.timestamp()
    elif column.dataType == StringType():
        sql_type = SqlType.text()
    else:
        # Trap the DecimalType case
        if str(column.dataType).startswith("DecimalType"):
            # The Tableau Hyper API only supports numeric precision up to 18
            precision = column.dataType.precision if column.dataType.precision <= 18 else 18
            scale = column.dataType.scale
            sql_type = SqlType.numeric(precision, scale)
        else:
            raise ValueError(f'Invalid StructField datatype for column `{column.name}` : {column.dataType}')
    nullable = NULLABLE if column.nullable else NOT_NULLABLE
    return TableDefinition.Column(name=column.name, type=sql_type, nullability=nullable)
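# A minimal sketch (assumed usage): mapping an entire Spark schema to a Hyper
# table definition by applying convert_struct_field to each field. `df` is
# assumed to be an existing Spark DataFrame.
from tableauhyperapi import TableDefinition, TableName

hyper_columns = [convert_struct_field(field) for field in df.schema.fields]
table_def = TableDefinition(TableName('Extract', 'Extract'), hyper_columns)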
def __init__(self):
    """
    Handler for conversion of storage types between DSS and Tableau Hyper.

    DSS storage types:
        "string", "date", "geopoint", "geometry", "array", "map", "object",
        "double", "boolean", "float", "bigint", "int", "smallint", "tinyint"

    Tableau Hyper storage types:
        TypeTag.BOOL, TypeTag.BIG_INT, TypeTag.SMALL_INT, TypeTag.INT,
        TypeTag.NUMERIC, TypeTag.DOUBLE, TypeTag.OID, TypeTag.BYTES,
        TypeTag.TEXT, TypeTag.VARCHAR, TypeTag.CHAR, TypeTag.JSON,
        TypeTag.DATE, TypeTag.INTERVAL, TypeTag.TIME, TypeTag.TIMESTAMP,
        TypeTag.TIMESTAMP_TZ, TypeTag.GEOGRAPHY
    """
    handle_null = lambda f: lambda x: None if pd.isna(x) else f(x)

    # Mapping DSS to Tableau Hyper types
    self.mapping_dss_to_hyper = {
        'array': (SqlType.text(), handle_null(str)),
        'bigint': (SqlType.big_int(), handle_null(int)),
        'boolean': (SqlType.bool(), handle_null(bool)),
        'date': (SqlType.timestamp(), handle_null(to_hyper_timestamp)),
        'double': (SqlType.double(), handle_null(float)),
        'float': (SqlType.double(), handle_null(float)),
        'geometry': (SqlType.text(), handle_null(str)),
        'geopoint': (SqlType.geography(), handle_null(to_hyper_geography)),
        'int': (SqlType.int(), handle_null(int)),
        'map': (SqlType.text(), handle_null(str)),
        'object': (SqlType.text(), handle_null(str)),
        'smallint': (SqlType.small_int(), handle_null(int)),
        'string': (SqlType.text(), handle_null(str)),
        'tinyint': (SqlType.small_int(), handle_null(int)),
    }

    # Mapping Tableau Hyper to DSS types
    self.mapping_hyper_to_dss = {
        TypeTag.BIG_INT: ('bigint', handle_null(int)),
        TypeTag.BYTES: ('string', handle_null(str)),
        TypeTag.BOOL: ('boolean', handle_null(bool)),
        TypeTag.CHAR: ('string', handle_null(str)),
        TypeTag.DATE: ('date', handle_null(to_dss_date)),
        TypeTag.DOUBLE: ('double', handle_null(float)),
        TypeTag.GEOGRAPHY: ('geopoint', handle_null(to_dss_geopoint)),
        TypeTag.INT: ('int', handle_null(int)),
        TypeTag.INTERVAL: ('string', handle_null(str)),
        TypeTag.JSON: ('string', handle_null(str)),
        TypeTag.NUMERIC: ('double', handle_null(float)),
        TypeTag.OID: ('string', handle_null(str)),
        TypeTag.SMALL_INT: ('smallint', handle_null(int)),
        TypeTag.TEXT: ('string', handle_null(str)),
        TypeTag.TIME: ('string', handle_null(str)),
        TypeTag.TIMESTAMP: ('date', handle_null(to_dss_timestamp)),
        TypeTag.TIMESTAMP_TZ: ('string', handle_null(str)),
        TypeTag.VARCHAR: ('string', handle_null(str))
    }
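# A minimal sketch (assumed; `TypeConverter` is a hypothetical name for the
# class defining this __init__) of looking up a DSS type and converting a
# column's values NaN-safely via the mapping above:
converter = TypeConverter()
sql_type, to_hyper = converter.mapping_dss_to_hyper['bigint']
values = [1, None, 3]
hyper_values = [to_hyper(v) for v in values]  # -> [1, None, 3]; None survives pd.isna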
def df_to_extract(df, output_path):
    '''
    Converts a Pandas dataframe to a Tableau Extract.

    Parameters
    ----------
    df (pandas dataframe): Dataframe to turn into a Tableau extract
    output_path (str): Where to create the Tableau extract
    '''
    # Replace NaNs with 0
    df = df.replace(np.nan, 0.0, regex=True)

    print('Creating Tableau data extract...')
    with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper:
        with Connection(hyper.endpoint, output_path, CreateMode.CREATE_AND_REPLACE) as connection:
            # Create schema
            connection.catalog.create_schema('Extract')

            # Create list of column definitions, based on the datatypes in the pandas dataframe
            dtype_map = {
                'int32': SqlType.int(),
                'int64': SqlType.big_int(),
                'float32': SqlType.double(),
                'float64': SqlType.double(),
                'datetime64[ns]': SqlType.date(),
                'object': SqlType.text()
            }
            table_def = []

            # Get column headers to loop through them
            df_columns = list(df)
            for col_header in df_columns:
                dtype_str = str(df[col_header].dtype)
                # Use dtype_str to look up the appropriate SqlType from dtype_map
                # and append a new column to the table definition
                table_def.append(TableDefinition.Column(col_header, dtype_map[dtype_str]))

            # Define table
            extract_table = TableDefinition(TableName('Extract', 'Extract'), table_def)

            # Create table
            connection.catalog.create_table(extract_table)

            # Insert data
            with Inserter(connection, extract_table) as inserter:
                for idx, row in df.iterrows():
                    inserter.add_row(row)
                inserter.execute()
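# A brief usage sketch (file name and sample data assumed):
import pandas as pd

sales = pd.DataFrame({'region': ['East', 'West'], 'revenue': [1200.0, 950.5]})
df_to_extract(sales, 'sales.hyper')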
def _hyper_sql_type(self, source_column):
    """
    Finds the correct Hyper column type for source_column

    source_column (obj): Source column (Instance of google.cloud.bigquery.schema.SchemaField)

    Returns a tableauhyperapi.SqlType Object
    """
    source_column_type = source_column.field_type
    return_sql_type = {
        "BOOL": SqlType.bool(),
        "BYTES": SqlType.bytes(),
        "DATE": SqlType.date(),
        "DATETIME": SqlType.timestamp(),
        "INT64": SqlType.big_int(),
        "INTEGER": SqlType.int(),
        "NUMERIC": SqlType.numeric(18, 9),
        "FLOAT64": SqlType.double(),
        "STRING": SqlType.text(),
        "TIME": SqlType.time(),
        "TIMESTAMP": SqlType.timestamp_tz(),
    }.get(source_column_type)

    if return_sql_type is None:
        error_message = "No Hyper SqlType defined for BigQuery source type: {}".format(
            source_column_type
        )
        logger.error(error_message)
        raise LookupError(error_message)

    logger.debug(
        "Translated source column type {} to Hyper SqlType {}".format(
            source_column_type, return_sql_type
        )
    )
    return return_sql_type
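# A minimal sketch (assumed usage; `converter` is a hypothetical instance of
# the class defining _hyper_sql_type) of building a Hyper table definition
# from a BigQuery schema:
from google.cloud import bigquery
from tableauhyperapi import TableDefinition, TableName

schema = [
    bigquery.SchemaField("name", "STRING"),
    bigquery.SchemaField("amount", "NUMERIC"),
]
columns = [
    TableDefinition.Column(field.name, converter._hyper_sql_type(field))
    for field in schema
]
table_def = TableDefinition(TableName("Extract", "Extract"), columns)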
query_res['column_ex2'] = query_res['column_ex2'].astype(str)
query_res['column_ex3'] = query_res['column_ex3'].astype(str)
query_res['column_ex4'] = query_res['column_ex4'].astype(str)
query_res['column_ex5'] = query_res['column_ex5'].astype(str)
query_res['column_ex6'] = query_res['column_ex6'].astype(str)

with HyperProcess(telemetry=Telemetry.SEND_USAGE_DATA_TO_TABLEAU) as hyper:
    with Connection(endpoint=hyper.endpoint,
                    database=path_to_database,
                    create_mode=CreateMode.CREATE_AND_REPLACE) as connection:
        connection.catalog.create_schema('Extract')

        # Define the hyper table (same definition as the full refresh)
        example_table = TableDefinition(TableName('Extract', 'Extract'), [
            TableDefinition.Column('column_ex', SqlType.varchar(500)),
            TableDefinition.Column('column_ex2', SqlType.varchar(500)),
            TableDefinition.Column('column_ex_int', SqlType.int()),
            TableDefinition.Column('column_ex3', SqlType.varchar(500)),
            TableDefinition.Column('column_ex_int2', SqlType.int()),
            TableDefinition.Column('column_ex4', SqlType.varchar(500)),
            TableDefinition.Column('column_ex5', SqlType.varchar(500)),
            TableDefinition.Column('column_ex_int3', SqlType.int()),
            TableDefinition.Column('column_ex6', SqlType.varchar(500)),
        ])
        print("The table is defined.")
        connection.catalog.create_table(table_definition=example_table)

        # Insert data from the dataframe into the hyper table,
        # one row at a time, in table-definition order
        with Inserter(connection, example_table) as inserter:
            for i in range(len(query_res)):
                inserter.add_row([
                    query_res['column_ex'][i],
                    query_res['column_ex2'][i],
                    query_res['column_ex_int'][i],
                    query_res['column_ex3'][i],
                    query_res['column_ex_int2'][i],
                    query_res['column_ex4'][i],
                    query_res['column_ex5'][i],
                    query_res['column_ex_int3'][i],
                    query_res['column_ex6'][i],
                ])
            inserter.execute()
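# An alternative sketch (assumed): the per-cell loop above can be replaced by
# Inserter.add_rows over the dataframe's rows, provided the dataframe's column
# order matches the table definition. This would stand in for the Inserter
# block inside the connection scope above.
with Inserter(connection, example_table) as inserter:
    inserter.add_rows(rows=query_res.itertuples(index=False, name=None))
    inserter.execute()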
# Step 1: Start the Hyper process
with HyperProcess(Telemetry.SEND_USAGE_DATA_TO_TABLEAU) as hyper:
    # Step 2: Open a connection and create the .hyper file
    with Connection(endpoint=hyper.endpoint,
                    create_mode=CreateMode.CREATE_AND_REPLACE,
                    database=PATH_TO_HYPER) as connection:
        print(connection.execute_scalar_query(
            query="SELECT 'securing connections with endpoint!'"))

        # Create the table definition with a TableName
        empdetails = TableDefinition(
            table_name=TableName('Extract', 'empdetails'),
            columns=[
                TableDefinition.Column('EMP_ID', SqlType.int()),
                TableDefinition.Column('EMP_NAME', SqlType.text()),
                TableDefinition.Column('MONTHY_SALARY', SqlType.int()),
                TableDefinition.Column('UNI_FLAG', SqlType.int())
            ])

        # Create the table in the connection catalog
        connection.catalog.create_schema('Extract')
        connection.catalog.create_table(empdetails)

        print('Validating the data...')
        cmd = f"COPY {empdetails.table_name} from {escape_string_literal(PATH_TO_CSV)} " \
              "with (format csv, NULL 'NULL', delimiter ',', HEADER)"
        table_count = connection.execute_command(cmd)

total_time = time.time() - start
total_time = "{:.2f}".format(total_time)
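# A small follow-up sketch (variable names taken from the snippet above):
# execute_command returns the number of rows affected by the COPY, so the
# load can be reported directly.
print(f"Copied {table_count} rows from {PATH_TO_CSV} in {total_time} seconds.")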
def run_insert_data_with_expressions():
    """
    An example of how to push down computations to Hyper during insertion with expressions.
    """
    print("EXAMPLE - Push down computations to Hyper during insertion with expressions")
    path_to_database = Path("orders.hyper")

    # Starts the Hyper Process with telemetry enabled to send data to Tableau.
    # To opt out, simply set telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU.
    with HyperProcess(telemetry=Telemetry.SEND_USAGE_DATA_TO_TABLEAU) as hyper:

        # Creates a new Hyper file "orders.hyper".
        # Replaces the file with CreateMode.CREATE_AND_REPLACE if it already exists.
        with Connection(endpoint=hyper.endpoint,
                        database=path_to_database,
                        create_mode=CreateMode.CREATE_AND_REPLACE) as connection:

            connection.catalog.create_schema(schema=extract_table.table_name.schema_name)
            connection.catalog.create_table(table_definition=extract_table)

            # Hyper API's Inserter allows users to transform data during insertion.
            # To make use of data transformation during insertion, the inserter requires the following inputs:
            #   1. The connection to the Hyper instance containing the table.
            #   2. The table name or table definition into which data is inserted.
            #   3. A list of Inserter.ColumnMapping.
            #      This list informs the inserter how each column in the target table is transformed.
            #      The list must contain all the columns into which data is inserted.
            #      "Inserter.ColumnMapping" maps a valid SQL expression (if any) to a column in the target table.
            #      For example, Inserter.ColumnMapping('target_column_name', f'{escape_name("colA")}*{escape_name("colB")}')
            #      means the column "target_column_name" contains the product of "colA" and "colB" after successful insertion.
            #      The SQL expression string is optional in Inserter.ColumnMapping.
            #      For a column without any transformation, only the column name is required.
            #      For example, Inserter.ColumnMapping('no_data_transformation_column')
            #   4. The column definition of all input values provided to the Inserter.

            # The inserter definition contains the column definitions for the values that are inserted.
            inserter_definition = [
                TableDefinition.Column(name='Order ID', type=SqlType.int(), nullability=NOT_NULLABLE),
                TableDefinition.Column(name='Ship Timestamp Text', type=SqlType.text(), nullability=NOT_NULLABLE),
                TableDefinition.Column(name='Ship Mode', type=SqlType.text(), nullability=NOT_NULLABLE),
                TableDefinition.Column(name='Ship Priority Text', type=SqlType.text(), nullability=NOT_NULLABLE)]

            # Column 'Order ID' is inserted into "Extract"."Extract" as-is.
            # Column 'Ship Timestamp' in "Extract"."Extract" of timestamp type is computed from
            #   column 'Ship Timestamp Text' of text type using 'to_timestamp()'.
            # Column 'Ship Mode' is inserted into "Extract"."Extract" as-is.
            # Column 'Ship Priority' in "Extract"."Extract" of integer type is computed from
            #   column 'Ship Priority Text' of text type using a 'CASE' statement.
            shipPriorityAsIntCaseExpression = f'CASE {escape_name("Ship Priority Text")} ' \
                                              f'WHEN {escape_string_literal("Urgent")} THEN 1 ' \
                                              f'WHEN {escape_string_literal("Medium")} THEN 2 ' \
                                              f'WHEN {escape_string_literal("Low")} THEN 3 END'

            column_mappings = [
                'Order ID',
                Inserter.ColumnMapping(
                    'Ship Timestamp',
                    f'to_timestamp({escape_name("Ship Timestamp Text")}, {escape_string_literal("YYYY-MM-DD HH24:MI:SS")})'),
                'Ship Mode',
                Inserter.ColumnMapping('Ship Priority', shipPriorityAsIntCaseExpression)
            ]

            # Data to be inserted
            data_to_insert = [
                [399, '2012-09-13 10:00:00', 'Express Class', 'Urgent'],
                [530, '2012-07-12 14:00:00', 'Standard Class', 'Low']
            ]

            # Insert data into the "Extract"."Extract" table with expressions
            with Inserter(connection, extract_table, column_mappings,
                          inserter_definition=inserter_definition) as inserter:
                inserter.add_rows(rows=data_to_insert)
                inserter.execute()

            print("The data was added to the table.")
        print("The connection to the Hyper file has been closed.")
    print("The Hyper process has been shut down.")
from pathlib import Path

from tableauhyperapi import HyperProcess, Telemetry, \
    Connection, CreateMode, \
    NOT_NULLABLE, NULLABLE, SqlType, TableDefinition, \
    Inserter, \
    escape_name, escape_string_literal, \
    TableName, Name, \
    HyperException

# The table is called "Extract" and will be created in the "Extract" schema.
# This has historically been the default table name and schema for extracts created by Tableau.
extract_table = TableDefinition(
    table_name=TableName("Extract", "Extract"),
    columns=[
        TableDefinition.Column(name='Order ID', type=SqlType.int(), nullability=NOT_NULLABLE),
        TableDefinition.Column(name='Ship Timestamp', type=SqlType.timestamp(), nullability=NOT_NULLABLE),
        TableDefinition.Column(name='Ship Mode', type=SqlType.text(), nullability=NOT_NULLABLE),
        TableDefinition.Column(name='Ship Priority', type=SqlType.int(), nullability=NOT_NULLABLE)
    ]
)


def run_insert_data_with_expressions():
    """
    An example of how to push down computations to Hyper during insertion with expressions.
    """
    print("EXAMPLE - Push down computations to Hyper during insertion with expressions")
    path_to_database = Path("orders.hyper")

    # Starts the Hyper Process with telemetry enabled to send data to Tableau.
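# A plausible entry point for the example above (assumed; modeled on the
# structure of the Hyper API samples), with basic error handling:
if __name__ == '__main__':
    try:
        run_insert_data_with_expressions()
    except HyperException as ex:
        print(ex)
        exit(1)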
TableDefinition.Column('party_group_description', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('party_type_code', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('party_type_description', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('party_subtype_code', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('party_subtype_description', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('party_service_code', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('party_service_description', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('br_segment', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('br_sub_segment', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('channel_name', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('market_normalized_name', SqlType.varchar(100), NULLABLE),
TableDefinition.Column('brand_normalized_name', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('form_strength_normalized_name', SqlType.varchar(35), NULLABLE),
TableDefinition.Column('normalized_name', SqlType.varchar(35), NULLABLE),
TableDefinition.Column('competitor_flag', SqlType.bool(), NULLABLE),
TableDefinition.Column('year', SqlType.int(), NULLABLE),
TableDefinition.Column('semester_number', SqlType.int(), NULLABLE),
TableDefinition.Column('quarter_number', SqlType.int(), NULLABLE),
TableDefinition.Column('month_number', SqlType.int(), NULLABLE),
TableDefinition.Column('week_number', SqlType.int(), NULLABLE),
TableDefinition.Column('transaction_timestamp', SqlType.date(), NULLABLE),
TableDefinition.Column('ddd_source_units', SqlType.int(), NULLABLE),
TableDefinition.Column('ddd_source_units_uom', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('ddd_units', SqlType.int(), NULLABLE),
TableDefinition.Column('ddd_units_uom', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('ddd_dollars', SqlType.numeric(10, 4), NULLABLE),
TableDefinition.Column('ddd_dot', SqlType.numeric(10, 5), NULLABLE),
TableDefinition.Column('ddd_mcg', SqlType.numeric(10, 5), NULLABLE),
TableDefinition.Column('ddd_normalized_units', SqlType.int(), NULLABLE),
TableDefinition.Column('ddd_normalized_units_uom', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('chargeback_source_units', SqlType.int(), NULLABLE),
def sparkConnect():
    # fetching DF from spark filestore
    if cf.file_type == 'csv':
        df = spark.read.format(cf.file_type) \
            .option("inferSchema", cf.infer_schema) \
            .option("header", cf.first_row_is_header) \
            .option("sep", cf.delimiter) \
            .load(cf.input_file_path)
        # print('\n', cf.input_file_path, '\n', cf.schema, '\n')

    # fetching table from db from databricks
    elif cf.file_type == 'jdbc':
        df = spark.read.format("jdbc") \
            .option("driver", cf.driver) \
            .option("url", cf.url) \
            .option("dbtable", cf.table) \
            .option("user", cf.user) \
            .option("password", cf.password) \
            .option("inferSchema", cf.infer_schema) \
            .option("header", cf.first_row_is_header) \
            .load()

        df.write.format("csv") \
            .option("encoding", cf.charset) \
            .option("header", cf.first_row_is_header) \
            .option("sep", cf.delimiter) \
            .save('/home/hari/HyperConverter/test')

        # pdf = df.select('*').toPandas()
        # path = '/home/hari/HyperConverter/test.csv'
        # pdf.to_csv(path, sep=',', index=False)

        path = glob.glob('/home/hari/HyperConverter/test/part*.csv')
        cf.input_file_path = path
        print('\n', cf.input_file_path, '\n')

    # Translate Spark dtypes to Hyper type names via the type_ lookup table
    col = list(df.dtypes)
    print(col)
    print(len(col))
    for i in range(len(col)):
        col[i] = list(col[i])
        col[i][1] = type_[col[i][1]]
    # print('\n', col, '\n')

    x = []
    for i, j in col:
        print(i, j)
        if j == 'varchar':
            # Size the varchar from the longest value currently in the column
            max_length = df.agg({i: "max"}).collect()[0]
            xyz = max_length["max({})".format(i)]
            if xyz is not None:
                max_length = len(xyz)
                if 19 <= max_length <= 40:
                    max_length = 100
                else:
                    max_length = 30
            else:
                max_length = 35
            print(max_length)
            x.append(TableDefinition.Column(i, SqlType.varchar(max_length), NULLABLE))
        elif j == 'int':
            x.append(TableDefinition.Column(i, SqlType.int(), NULLABLE))
        elif j == 'date':
            x.append(TableDefinition.Column(i, SqlType.date(), NULLABLE))
        elif j == 'numeric':
            x.append(TableDefinition.Column(i, SqlType.numeric(10, 4), NULLABLE))
        elif j == 'bool':
            x.append(TableDefinition.Column(i, SqlType.bool(), NULLABLE))
        elif j == 'big_int':
            x.append(TableDefinition.Column(i, SqlType.big_int(), NULLABLE))
        elif j == 'double':
            x.append(TableDefinition.Column(i, SqlType.double(), NULLABLE))
        elif j == 'text':
            x.append(TableDefinition.Column(i, SqlType.text(), NULLABLE))

    print(x)
    print(len(x))
    return x
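# A brief usage sketch (assumed; output file name made up): creating the
# extract from the column definitions sparkConnect returns.
columns = sparkConnect()
table_def = TableDefinition(TableName('Extract', 'Extract'), columns)
with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper:
    with Connection(hyper.endpoint, 'output.hyper', CreateMode.CREATE_AND_REPLACE) as connection:
        connection.catalog.create_schema('Extract')
        connection.catalog.create_table(table_def)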