from pyspark.sql.types import (BooleanType, DateType, DecimalType, DoubleType,
                               FloatType, IntegerType, LongType, ShortType,
                               StringType, StructField, TimestampType)
from tableauhyperapi import NOT_NULLABLE, NULLABLE, SqlType, TableDefinition


def convert_struct_field(column: StructField) -> TableDefinition.Column:
    """Converts a Spark StructField to a Tableau Hyper TableDefinition.Column"""
    if column.dataType == IntegerType():
        sql_type = SqlType.int()
    elif column.dataType == LongType():
        sql_type = SqlType.big_int()
    elif column.dataType == ShortType():
        sql_type = SqlType.small_int()
    elif column.dataType == DoubleType():
        sql_type = SqlType.double()
    elif column.dataType == FloatType():
        # Hyper has no 4-byte float, so widen to double
        sql_type = SqlType.double()
    elif column.dataType == BooleanType():
        sql_type = SqlType.bool()
    elif column.dataType == DateType():
        sql_type = SqlType.date()
    elif column.dataType == TimestampType():
        sql_type = SqlType.timestamp()
    elif column.dataType == StringType():
        sql_type = SqlType.text()
    elif isinstance(column.dataType, DecimalType):
        # The Tableau Hyper API caps numeric precision at 18, so clamp wider decimals
        precision = min(column.dataType.precision, 18)
        scale = column.dataType.scale
        sql_type = SqlType.numeric(precision, scale)
    else:
        raise ValueError(f'Invalid StructField datatype for column `{column.name}`: {column.dataType}')
    nullable = NULLABLE if column.nullable else NOT_NULLABLE
    return TableDefinition.Column(name=column.name, type=sql_type, nullability=nullable)
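# Usage sketch (a minimal illustration, not part of the original source): build a
# Hyper table definition from a Spark DataFrame schema via convert_struct_field.
# `df`, the "Extract" schema/table names, and "out.hyper" are assumptions.
from tableauhyperapi import Connection, CreateMode, HyperProcess, TableName, Telemetry

columns = [convert_struct_field(field) for field in df.schema.fields]
table_def = TableDefinition(table_name=TableName("Extract", "Extract"), columns=columns)
with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper:
    with Connection(endpoint=hyper.endpoint, database="out.hyper",
                    create_mode=CreateMode.CREATE_AND_REPLACE) as connection:
        connection.catalog.create_schema(schema=table_def.table_name.schema_name)
        connection.catalog.create_table(table_definition=table_def)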
def _hyper_sql_type(self, source_column):
    """
    Finds the correct Hyper column type for source_column

    source_column (obj): Source column (instance of google.cloud.bigquery.schema.SchemaField)

    Returns a tableauhyperapi.SqlType object
    """
    source_column_type = source_column.field_type
    return_sql_type = {
        "BOOL": SqlType.bool(),
        "BYTES": SqlType.bytes(),
        "DATE": SqlType.date(),
        "DATETIME": SqlType.timestamp(),
        "INT64": SqlType.big_int(),
        "INTEGER": SqlType.int(),
        "NUMERIC": SqlType.numeric(18, 9),
        "FLOAT64": SqlType.double(),
        "STRING": SqlType.text(),
        "TIME": SqlType.time(),
        "TIMESTAMP": SqlType.timestamp_tz(),
    }.get(source_column_type)

    if return_sql_type is None:
        error_message = "No Hyper SqlType defined for BigQuery source type: {}".format(
            source_column_type
        )
        logger.error(error_message)
        raise LookupError(error_message)

    logger.debug(
        "Translated source column type {} to Hyper SqlType {}".format(
            source_column_type, return_sql_type
        )
    )
    return return_sql_type
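# Usage sketch (assumptions: `extractor` is an instance of the class that defines
# _hyper_sql_type above, google-cloud-bigquery is installed, and the table id is
# hypothetical). SchemaField.mode drives the nullability mapping.
from google.cloud import bigquery
from tableauhyperapi import NOT_NULLABLE, NULLABLE, TableDefinition

bq_client = bigquery.Client()
bq_table = bq_client.get_table("my_project.my_dataset.my_table")
hyper_columns = [
    TableDefinition.Column(
        name=field.name,
        type=extractor._hyper_sql_type(field),
        nullability=NULLABLE if field.mode == "NULLABLE" else NOT_NULLABLE,
    )
    for field in bq_table.schema
]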
TableDefinition.Column('market_normalized_name', SqlType.varchar(100), NULLABLE),
TableDefinition.Column('brand_normalized_name', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('form_strength_normalized_name', SqlType.varchar(35), NULLABLE),
TableDefinition.Column('normalized_name', SqlType.varchar(35), NULLABLE),
TableDefinition.Column('competitor_flag', SqlType.bool(), NULLABLE),
TableDefinition.Column('year', SqlType.int(), NULLABLE),
TableDefinition.Column('semester_number', SqlType.int(), NULLABLE),
TableDefinition.Column('quarter_number', SqlType.int(), NULLABLE),
TableDefinition.Column('month_number', SqlType.int(), NULLABLE),
TableDefinition.Column('week_number', SqlType.int(), NULLABLE),
TableDefinition.Column('transaction_timestamp', SqlType.date(), NULLABLE),
TableDefinition.Column('ddd_source_units', SqlType.int(), NULLABLE),
TableDefinition.Column('ddd_source_units_uom', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('ddd_units', SqlType.int(), NULLABLE),
TableDefinition.Column('ddd_units_uom', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('ddd_dollars', SqlType.numeric(10, 4), NULLABLE),
TableDefinition.Column('ddd_dot', SqlType.numeric(10, 5), NULLABLE),
TableDefinition.Column('ddd_mcg', SqlType.numeric(10, 5), NULLABLE),
TableDefinition.Column('ddd_normalized_units', SqlType.int(), NULLABLE),
TableDefinition.Column('ddd_normalized_units_uom', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('chargeback_source_units', SqlType.int(), NULLABLE),
TableDefinition.Column('chargeback_source_units_uom', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('chargeback_units', SqlType.int(), NULLABLE),
TableDefinition.Column('chargeback_units_uom', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('chargeback_dollars', SqlType.numeric(10, 4), NULLABLE),
TableDefinition.Column('chargeback_dot', SqlType.numeric(10, 5), NULLABLE),
TableDefinition.Column('chargeback_mcg', SqlType.numeric(10, 5), NULLABLE),
TableDefinition.Column('chargeback_normalized_units', SqlType.numeric(10, 5), NULLABLE),
TableDefinition.Column('chargeback_normalized_units_uom', SqlType.varchar(30), NULLABLE),
TableDefinition.Column('self_reported_source_units', SqlType.numeric(10, 5), NULLABLE),
TableDefinition.Column('self_reported_source_units_uom', SqlType.varchar(30), NULLABLE),
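# Sketch of how column definitions like the ones above are typically consumed
# (illustrative only; reuses the `connection` from the earlier sketch, and the
# table name and `rows` iterable are assumptions): collect them into a
# TableDefinition, create the table, then bulk-load with Inserter.
from tableauhyperapi import Inserter

sales_table = TableDefinition(
    table_name=TableName("Extract", "ddd_sales"),
    columns=[
        TableDefinition.Column('market_normalized_name', SqlType.varchar(100), NULLABLE),
        # ... the remaining Column entries listed above
    ],
)
connection.catalog.create_table(table_definition=sales_table)
with Inserter(connection, sales_table) as inserter:
    inserter.add_rows(rows=rows)  # iterable of tuples in column order
    inserter.execute()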
# Assumes module-level objects defined elsewhere: `spark` (SparkSession),
# `cf` (config), and `type_` (mapping from Spark dtype names to Hyper type names).
import glob

from tableauhyperapi import NULLABLE, SqlType, TableDefinition


def sparkConnect():
    # Fetch the DataFrame from the Spark filestore
    if cf.file_type == 'csv':
        df = spark.read.format(cf.file_type) \
            .option("inferSchema", cf.infer_schema) \
            .option("header", cf.first_row_is_header) \
            .option("sep", cf.delimiter) \
            .load(cf.input_file_path)

    # Fetch the table from the database via JDBC (Databricks), then stage it
    # as CSV so the downstream loader has a file to read
    elif cf.file_type == 'jdbc':
        df = spark.read.format("jdbc") \
            .option("driver", cf.driver) \
            .option("url", cf.url) \
            .option("dbtable", cf.table) \
            .option("user", cf.user) \
            .option("password", cf.password) \
            .option("inferSchema", cf.infer_schema) \
            .option("header", cf.first_row_is_header) \
            .load()

        df.write.format("csv") \
            .option("encoding", cf.charset) \
            .option("header", cf.first_row_is_header) \
            .option("sep", cf.delimiter) \
            .save('/home/hari/HyperConverter/test')

        path = glob.glob('/home/hari/HyperConverter/test/part*.csv')
        cf.input_file_path = path[0]  # first (and typically only) part file

    # Map each Spark dtype to its Hyper type name via the type_ lookup table
    col = [(name, type_[dtype]) for name, dtype in df.dtypes]

    x = []
    for i, j in col:
        if j == 'varchar':
            # Size varchar columns from the longest value currently in the data
            max_row = df.agg({i: "max"}).collect()[0]
            longest = max_row["max({})".format(i)]
            if longest is not None:
                length = len(longest)
                max_length = 100 if 19 <= length <= 40 else 30
            else:
                max_length = 35
            x.append(TableDefinition.Column(i, SqlType.varchar(max_length), NULLABLE))
        elif j == 'int':
            x.append(TableDefinition.Column(i, SqlType.int(), NULLABLE))
        elif j == 'date':
            x.append(TableDefinition.Column(i, SqlType.date(), NULLABLE))
        elif j == 'numeric':
            x.append(TableDefinition.Column(i, SqlType.numeric(10, 4), NULLABLE))
        elif j == 'bool':
            x.append(TableDefinition.Column(i, SqlType.bool(), NULLABLE))
        elif j == 'big_int':
            x.append(TableDefinition.Column(i, SqlType.big_int(), NULLABLE))
        elif j == 'double':
            x.append(TableDefinition.Column(i, SqlType.double(), NULLABLE))
        elif j == 'text':
            x.append(TableDefinition.Column(i, SqlType.text(), NULLABLE))
    return x
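# Sketch (not from the original source): the column list returned by sparkConnect()
# can feed a TableDefinition, and the staged CSV can be bulk-loaded with Hyper's
# COPY command. The "output.hyper" database and table names are assumptions.
from tableauhyperapi import (Connection, CreateMode, HyperProcess, TableName,
                             Telemetry, escape_string_literal)

columns = sparkConnect()
table_def = TableDefinition(table_name=TableName("Extract", "Extract"), columns=columns)
with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper:
    with Connection(endpoint=hyper.endpoint, database="output.hyper",
                    create_mode=CreateMode.CREATE_AND_REPLACE) as conn:
        conn.catalog.create_schema(schema=table_def.table_name.schema_name)
        conn.catalog.create_table(table_definition=table_def)
        # Load the CSV produced by sparkConnect() directly into the Hyper table
        conn.execute_command(
            "COPY {} FROM {} WITH (format csv, header, delimiter ',')".format(
                table_def.table_name, escape_string_literal(cf.input_file_path)
            )
        )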