def __init__(self):
    self.location = "./output/meli_challenge_result.hyper"
    self.test_location = "../output/meli_challenge_result.hyper"
    self.searchResult_table = TableDefinition('results', [
        TableDefinition.Column('id', SqlType.text(), Nullability.NOT_NULLABLE),
        TableDefinition.Column('site_id', SqlType.text(), Nullability.NOT_NULLABLE),
        TableDefinition.Column('title', SqlType.text(), Nullability.NOT_NULLABLE),
        TableDefinition.Column('seller', SqlType.text(), Nullability.NOT_NULLABLE),
        TableDefinition.Column('price', SqlType.text(), Nullability.NOT_NULLABLE),
        TableDefinition.Column('prices', SqlType.json(), Nullability.NOT_NULLABLE),
        TableDefinition.Column('sale_price', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('currency_id', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('available_quantity', SqlType.int(), Nullability.NULLABLE),
        TableDefinition.Column('sold_quantity', SqlType.int(), Nullability.NULLABLE),
        TableDefinition.Column('buying_mode', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('listing_type_id', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('stop_time', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('condition', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('permalink', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('thumbnail', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('accepts_mercadopago', SqlType.bool(), Nullability.NULLABLE),
        TableDefinition.Column('installments', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('address', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('shipping', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('seller_address', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('attributes', SqlType.text(), Nullability.NOT_NULLABLE),
        TableDefinition.Column('original_price', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('category_id', SqlType.text(), Nullability.NOT_NULLABLE),
        TableDefinition.Column('official_store_id', SqlType.int(), Nullability.NULLABLE),
        TableDefinition.Column('domain_id', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('catalog_product_id', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('tags', SqlType.text(), Nullability.NULLABLE),
        TableDefinition.Column('catalog_listing', SqlType.bool(), Nullability.NULLABLE),
        TableDefinition.Column('order_backend', SqlType.int(), Nullability.NULLABLE),
    ])
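# A minimal usage sketch, not part of the original source: materializing the
# table definition above into the .hyper file at self.location. The method
# name `create_extract` is hypothetical; HyperProcess, Telemetry, Connection,
# and CreateMode are standard tableauhyperapi names.
from tableauhyperapi import HyperProcess, Telemetry, Connection, CreateMode

def create_extract(self):
    with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper:
        with Connection(endpoint=hyper.endpoint,
                        database=self.location,
                        create_mode=CreateMode.CREATE_AND_REPLACE) as connection:
            # Creates the empty 'results' table in the new extract
            connection.catalog.create_table(table_definition=self.searchResult_table)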
def test_get_table_def(self):
    data = [
        (1001, 1, "Jane", "Doe", "2000-05-01", 29.0, False),
        (1002, 2, "John", "Doe", "1988-05-03", 33.0, False),
        (2201, 3, "Elonzo", "Smith", "1990-05-03", 21.0, True),
        (None, None, None, None, None, None, None)  # Test nulls
    ]
    # createOrReplaceTempView returns None, so register the view first,
    # then query it to build the DataFrame.
    get_spark_session() \
        .createDataFrame(data, ["id", "dept_id", "first_name", "last_name", "dob", "age", "is_temp"]) \
        .createOrReplaceTempView("employees")
    df = get_spark_session().sql(
        "select id, cast(dept_id as short), first_name, "
        "last_name, dob, age, is_temp from employees")

    table_def = get_table_def(df, "Extract", "Extract")

    # Ensure that the table name matches
    assert table_def.table_name.name == Name("Extract")

    # Ensure that the TableDefinition column names match
    assert table_def.get_column(0).name == Name("id")
    assert table_def.get_column(1).name == Name("dept_id")
    assert table_def.get_column(2).name == Name("first_name")
    assert table_def.get_column(3).name == Name("last_name")
    assert table_def.get_column(4).name == Name("dob")
    assert table_def.get_column(5).name == Name("age")
    assert table_def.get_column(6).name == Name("is_temp")

    # Ensure that the column data types were converted correctly
    assert table_def.get_column(0).type == SqlType.big_int()
    assert table_def.get_column(1).type == SqlType.small_int()
    assert table_def.get_column(2).type == SqlType.text()
    assert table_def.get_column(3).type == SqlType.text()
    assert table_def.get_column(4).type == SqlType.text()
    assert table_def.get_column(5).type == SqlType.double()
    assert table_def.get_column(6).type == SqlType.bool()
def convert_datatype(coldatatype):
    """
    Converts the datatype of a dataframe column to a Tableau Hyper
    extract compatible datatype.

    Args:
        coldatatype: Datatype of the column, as a string.

    Returns:
        A tuple of the Tableau Hyper compatible datatype and a default
        value to substitute for NaN cases.
    """
    datatype = SqlType.text()
    def_value = ''

    if 'datetime' in coldatatype.lower():
        datatype = SqlType.timestamp()
    elif 'str' in coldatatype.lower():
        datatype = SqlType.text()
    elif 'boolean' in coldatatype.lower():
        datatype = SqlType.bool()
    elif 'int' in coldatatype.lower():
        datatype = SqlType.int()
        def_value = 0
    elif 'float' in coldatatype.lower():
        datatype = SqlType.double()
        def_value = 0
    elif 'period' in coldatatype.lower():
        datatype = SqlType.interval()
    elif 'object' in coldatatype.lower():
        datatype = SqlType.text()
    else:
        datatype = SqlType.text()

    return (datatype, def_value)
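# A minimal usage sketch (assumption: the dtype strings come from a pandas
# DataFrame; the DataFrame below is illustrative only).
import pandas as pd

df = pd.DataFrame({"name": ["a", "b"], "qty": [1, 2], "price": [1.5, 2.5]})
for col in df.columns:
    hyper_type, default = convert_datatype(str(df[col].dtype))
    print(col, hyper_type, repr(default))
# 'qty' (int64) maps to SqlType.int() with default 0; 'price' (float64) to
# SqlType.double(); 'name' (object) to SqlType.text() with default ''.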
def convert_struct_field(column: StructField) -> TableDefinition.Column:
    """Converts a Spark StructField to a Tableau Hyper TableDefinition.Column"""
    if column.dataType == IntegerType():
        sql_type = SqlType.int()
    elif column.dataType == LongType():
        sql_type = SqlType.big_int()
    elif column.dataType == ShortType():
        sql_type = SqlType.small_int()
    elif column.dataType == DoubleType():
        sql_type = SqlType.double()
    elif column.dataType == FloatType():
        sql_type = SqlType.double()
    elif column.dataType == BooleanType():
        sql_type = SqlType.bool()
    elif column.dataType == DateType():
        sql_type = SqlType.date()
    elif column.dataType == TimestampType():
        sql_type = SqlType.timestamp()
    elif column.dataType == StringType():
        sql_type = SqlType.text()
    else:
        # Trap the DecimalType case
        if str(column.dataType).startswith("DecimalType"):
            # Max precision is only up to 18 decimal places in Tableau Hyper API
            precision = column.dataType.precision if column.dataType.precision <= 18 else 18
            scale = column.dataType.scale
            sql_type = SqlType.numeric(precision, scale)
        else:
            raise ValueError(f'Invalid StructField datatype for column `{column.name}` : {column.dataType}')

    nullable = NULLABLE if column.nullable else NOT_NULLABLE
    return TableDefinition.Column(name=column.name, type=sql_type, nullability=nullable)
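# A minimal sketch, not from the original source, of applying
# convert_struct_field across a full Spark schema to build a Hyper table
# definition; the schema below is illustrative.
from pyspark.sql.types import StructType, StructField, LongType, StringType

schema = StructType([
    StructField("id", LongType(), nullable=False),
    StructField("name", StringType(), nullable=True),
])
columns = [convert_struct_field(field) for field in schema.fields]
table_def = TableDefinition("Extract", columns)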
def __init__(self):
    """
    Handler for conversion of storage types between DSS and Tableau Hyper

    DSS storage types:
        "string", "date", "geopoint", "geometry", "array", "map", "object",
        "double", "boolean", "float", "bigint", "int", "smallint", "tinyint"

    Tableau Hyper storage types:
        TypeTag.BOOL, TypeTag.BIG_INT, TypeTag.SMALL_INT, TypeTag.INT,
        TypeTag.NUMERIC, TypeTag.DOUBLE, TypeTag.OID, TypeTag.BYTES,
        TypeTag.TEXT, TypeTag.VARCHAR, TypeTag.CHAR, TypeTag.JSON,
        TypeTag.DATE, TypeTag.INTERVAL, TypeTag.TIME, TypeTag.TIMESTAMP,
        TypeTag.TIMESTAMP_TZ, TypeTag.GEOGRAPHY
    """
    # Wrap each converter so missing values pass through as None
    handle_null = lambda f: lambda x: None if pd.isna(x) else f(x)

    # Mapping DSS to Tableau Hyper types
    self.mapping_dss_to_hyper = {
        'array': (SqlType.text(), handle_null(str)),
        'bigint': (SqlType.big_int(), handle_null(int)),
        'boolean': (SqlType.bool(), handle_null(bool)),
        'date': (SqlType.timestamp(), handle_null(to_hyper_timestamp)),
        'double': (SqlType.double(), handle_null(float)),
        'float': (SqlType.double(), handle_null(float)),
        'geometry': (SqlType.text(), handle_null(str)),
        'geopoint': (SqlType.geography(), handle_null(to_hyper_geography)),
        'int': (SqlType.int(), handle_null(int)),
        'map': (SqlType.text(), handle_null(str)),
        'object': (SqlType.text(), handle_null(str)),
        'smallint': (SqlType.small_int(), handle_null(int)),
        'string': (SqlType.text(), handle_null(str)),
        'tinyint': (SqlType.small_int(), handle_null(int)),
    }

    # Mapping Tableau Hyper to DSS types
    self.mapping_hyper_to_dss = {
        TypeTag.BIG_INT: ('bigint', handle_null(int)),
        TypeTag.BYTES: ('string', handle_null(str)),
        TypeTag.BOOL: ('boolean', handle_null(bool)),
        TypeTag.CHAR: ('string', handle_null(str)),
        TypeTag.DATE: ('date', handle_null(to_dss_date)),
        TypeTag.DOUBLE: ('double', handle_null(float)),
        TypeTag.GEOGRAPHY: ('geopoint', handle_null(to_dss_geopoint)),
        TypeTag.INT: ('int', handle_null(int)),
        TypeTag.INTERVAL: ('string', handle_null(str)),
        TypeTag.JSON: ('string', handle_null(str)),
        TypeTag.NUMERIC: ('double', handle_null(float)),
        TypeTag.OID: ('string', handle_null(str)),
        TypeTag.SMALL_INT: ('smallint', handle_null(int)),
        TypeTag.TEXT: ('string', handle_null(str)),
        TypeTag.TIME: ('string', handle_null(str)),
        TypeTag.TIMESTAMP: ('date', handle_null(to_dss_timestamp)),
        TypeTag.TIMESTAMP_TZ: ('string', handle_null(str)),
        TypeTag.VARCHAR: ('string', handle_null(str))
    }
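# A minimal usage sketch; the class name `TypeConverter` and the DSS schema
# list below are hypothetical (only mapping_dss_to_hyper comes from the source).
converter = TypeConverter()
dss_schema = [{'name': 'price', 'type': 'double'}, {'name': 'sold_at', 'type': 'date'}]
columns = []
for col in dss_schema:
    # Each entry yields the Hyper SqlType plus a null-safe cast function
    sql_type, to_hyper = converter.mapping_dss_to_hyper[col['type']]
    columns.append(TableDefinition.Column(col['name'], sql_type))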
def fn_convert_to_hyper_types(given_type):
    switcher = {
        'empty': SqlType.text(),
        'bool': SqlType.bool(),
        'int': SqlType.big_int(),
        'float-dot': SqlType.double(),
        'date-YMD': SqlType.date(),
        'date-MDY': SqlType.date(),
        'date-DMY': SqlType.date(),
        'time-24': SqlType.time(),
        'time-12': SqlType.time(),
        'datetime-24-YMD': SqlType.timestamp(),
        'datetime-12-MDY': SqlType.timestamp(),
        'datetime-24-DMY': SqlType.timestamp(),
        'str': SqlType.text(),
    }
    # Fall back to text for any unrecognized type label
    return switcher.get(given_type, SqlType.text())
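# A quick usage sketch of the lookup and its fallback (SqlType instances
# support equality comparison):
assert fn_convert_to_hyper_types('date-DMY') == SqlType.date()
assert fn_convert_to_hyper_types('no-such-label') == SqlType.text()  # default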
def _hyper_sql_type(self, source_column):
    """
    Finds the correct Hyper column type for source_column

    source_column (obj): Source column (Instance of
        google.cloud.bigquery.schema.SchemaField)

    Returns a tableauhyperapi.SqlType Object
    """
    source_column_type = source_column.field_type
    return_sql_type = {
        "BOOL": SqlType.bool(),
        "BYTES": SqlType.bytes(),
        "DATE": SqlType.date(),
        "DATETIME": SqlType.timestamp(),
        "INT64": SqlType.big_int(),
        "INTEGER": SqlType.int(),
        "NUMERIC": SqlType.numeric(18, 9),
        "FLOAT64": SqlType.double(),
        "STRING": SqlType.text(),
        "TIME": SqlType.time(),
        "TIMESTAMP": SqlType.timestamp_tz(),
    }.get(source_column_type)

    if return_sql_type is None:
        error_message = "No Hyper SqlType defined for BigQuery source type: {}".format(
            source_column_type
        )
        logger.error(error_message)
        raise LookupError(error_message)

    logger.debug(
        "Translated source column type {} to Hyper SqlType {}".format(
            source_column_type, return_sql_type
        )
    )
    return return_sql_type
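# A minimal usage sketch (assumption: `extractor` is an instance of the class
# that defines _hyper_sql_type; SchemaField is the real BigQuery schema type).
from google.cloud.bigquery import SchemaField

price_field = SchemaField("price", "NUMERIC")
hyper_type = extractor._hyper_sql_type(price_field)  # -> SqlType.numeric(18, 9)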
from tableauhyperapi import (
    Connection,
    SqlType,
    TableDefinition,
    CreateMode,
    TableName,
    Inserter,
)

dtype_mapper = {
    "string": SqlType.text(),
    "str": SqlType.text(),
    "object": SqlType.text(),
    "O": SqlType.text(),
    "int64": SqlType.big_int(),
    "float64": SqlType.double(),
    "bool": SqlType.bool(),
    "datetime64[ns]": SqlType.timestamp(),
    "timedelta[ns]": SqlType.interval(),
    "category": SqlType.text(),
}


def read_hyper(path_to_hyper_file, custom_schema="Extract"):
    """Read a Tableau Hyper file and turn it into a Pandas DataFrame.

    Currently can only read single table extracts, which is Tableau's
    default way of creating an extract.

    Args:
        path_to_hyper_file: Specify the path to the .hyper file
        custom_schema: If you need to change the schema name. Defaults to "Extract"
    """
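# A minimal sketch, not part of the original module, of using dtype_mapper to
# build a Hyper table definition from a pandas DataFrame; the DataFrame is
# illustrative only.
import pandas as pd

frame = pd.DataFrame({"name": ["a"], "count": [1], "ratio": [0.5]})
columns = [
    TableDefinition.Column(col, dtype_mapper[str(dtype)])
    for col, dtype in frame.dtypes.items()
]
table_def = TableDefinition(TableName("Extract", "Extract"), columns)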
TableDefinition.Column('party_group_code', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('party_group_description', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('party_type_code', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('party_type_description', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('party_subtype_code', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('party_subtype_description', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('party_service_code', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('party_service_description', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('br_segment', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('br_sub_segment', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('channel_name', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('market_normalized_name', SqlType.varchar(100), NULLABLE),
TableDefinition.Column('brand_normalized_name', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('form_strength_normalized_name', SqlType.varchar(35), NULLABLE),
TableDefinition.Column('normalized_name', SqlType.varchar(35), NULLABLE),
TableDefinition.Column('competitor_flag', SqlType.bool(), NULLABLE),
TableDefinition.Column('year', SqlType.int(), NULLABLE),
TableDefinition.Column('semester_number', SqlType.int(), NULLABLE),
TableDefinition.Column('quarter_number', SqlType.int(), NULLABLE),
TableDefinition.Column('month_number', SqlType.int(), NULLABLE),
TableDefinition.Column('week_number', SqlType.int(), NULLABLE),
TableDefinition.Column('transaction_timestamp', SqlType.date(), NULLABLE),
TableDefinition.Column('ddd_source_units', SqlType.int(), NULLABLE),
TableDefinition.Column('ddd_source_units_uom', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('ddd_units', SqlType.int(), NULLABLE),
TableDefinition.Column('ddd_units_uom', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('ddd_dollars', SqlType.numeric(10, 4), NULLABLE),
TableDefinition.Column('ddd_dot', SqlType.numeric(10, 5), NULLABLE),
TableDefinition.Column('ddd_mcg', SqlType.numeric(10, 5), NULLABLE),
TableDefinition.Column('ddd_normalized_units', SqlType.int(), NULLABLE),
TableDefinition.Column('ddd_normalized_units_uom', SqlType.varchar(30), NULLABLE),
def sparkConnect():
    # fetching DF from spark filestore
    if cf.file_type == 'csv':
        df = spark.read.format(cf.file_type) \
            .option("inferSchema", cf.infer_schema) \
            .option("header", cf.first_row_is_header) \
            .option("sep", cf.delimiter) \
            .load(cf.input_file_path)
        # print('\n', cf.input_file_path, '\n', cf.schema, '\n')

    # fetching table from db from databricks
    elif cf.file_type == 'jdbc':
        df = spark.read.format("jdbc") \
            .option("driver", cf.driver) \
            .option("url", cf.url) \
            .option("dbtable", cf.table) \
            .option("user", cf.user) \
            .option("password", cf.password) \
            .option("inferSchema", cf.infer_schema) \
            .option("header", cf.first_row_is_header) \
            .load()

        df.write.format("csv") \
            .option("encoding", cf.charset) \
            .option("header", cf.first_row_is_header) \
            .option("sep", cf.delimiter) \
            .save('/home/hari/HyperConverter/test')

        # pdf = df.select('*').toPandas()
        # path = '/home/hari/HyperConverter/test.csv'
        # pdf.to_csv(path, sep=',', index=False)

        # The glob may match several part files written by Spark
        path = glob.glob('/home/hari/HyperConverter/test/part*.csv')
        cf.input_file_path = path
        print('\n', cf.input_file_path, '\n')

    # Translate Spark dtypes to Hyper type labels via the type_ lookup table
    col = list(df.dtypes)
    print(col)
    print(len(col))
    for i in range(len(col)):
        col[i] = list(col[i])
        col[i][1] = type_[col[i][1]]
    # print('\n', col, '\n')

    x = []
    for i, j in col:
        print(i, j)
        if j == 'varchar':
            # Size varchar columns from the longest value observed in the column
            max_length = df.agg({i: "max"}).collect()[0]
            xyz = max_length["max({})".format(i)]
            if xyz is not None:
                max_length = len(xyz)
                if 19 <= max_length <= 40:
                    max_length = 100
                else:
                    max_length = 30
            else:
                max_length = 35
            print(max_length)
            x.append(TableDefinition.Column(i, SqlType.varchar(max_length), NULLABLE))
        elif j == 'int':
            x.append(TableDefinition.Column(i, SqlType.int(), NULLABLE))
        elif j == 'date':
            x.append(TableDefinition.Column(i, SqlType.date(), NULLABLE))
        elif j == 'numeric':
            x.append(TableDefinition.Column(i, SqlType.numeric(10, 4), NULLABLE))
        elif j == 'bool':
            x.append(TableDefinition.Column(i, SqlType.bool(), NULLABLE))
        elif j == 'big_int':
            x.append(TableDefinition.Column(i, SqlType.big_int(), NULLABLE))
        elif j == 'double':
            x.append(TableDefinition.Column(i, SqlType.double(), NULLABLE))
        elif j == 'text':
            print("this is the culprit", i, j)
            x.append(TableDefinition.Column(i, SqlType.text(), NULLABLE))

    print(x)
    print(len(x))
    return x
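# A short follow-on sketch (the "Extract" schema and table names are
# assumptions; sparkConnect and the column list it returns come from the
# code above):
columns = sparkConnect()
table_def = TableDefinition(TableName("Extract", "Extract"), columns)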