Exemple #1
0
def convert_struct_field(column: StructField) -> TableDefinition.Column:
    """Converts a Spark StructField to a Tableau Hyper SqlType"""
    if column.dataType == IntegerType():
        sql_type = SqlType.int()
    elif column.dataType == LongType():
        sql_type = SqlType.big_int()
    elif column.dataType == ShortType():
        sql_type = SqlType.small_int()
    elif column.dataType == DoubleType():
        sql_type = SqlType.double()
    elif column.dataType == FloatType():
        sql_type = SqlType.double()
    elif column.dataType == BooleanType():
        sql_type = SqlType.bool()
    elif column.dataType == DateType():
        sql_type = SqlType.date()
    elif column.dataType == TimestampType():
        sql_type = SqlType.timestamp()
    elif column.dataType == StringType():
        sql_type = SqlType.text()
    else:
        # Trap the DecimalType case
        if str(column.dataType).startswith("DecimalType"):
            # Max precision is only up to 18 decimal places in Tableau Hyper API
            precision = column.dataType.precision if column.dataType.precision <= 18 else 18
            scale = column.dataType.scale
            sql_type = SqlType.numeric(precision, scale)
        else:
            raise ValueError(f'Invalid StructField datatype for column `{column.name}` : {column.dataType}')
    nullable = NULLABLE if column.nullable else NOT_NULLABLE
    return TableDefinition.Column(name=column.name, type=sql_type, nullability=nullable)
Exemple #2
0
    def _hyper_sql_type(self, source_column):
        """
        Finds the correct Hyper column type for source_column

        source_column (obj): Source column (Instance of google.cloud.bigquery.schema.SchemaField)

        Returns a tableauhyperapi.SqlType Object
        """

        source_column_type = source_column.field_type
        return_sql_type = {
            "BOOL": SqlType.bool(),
            "BYTES": SqlType.bytes(),
            "DATE": SqlType.date(),
            "DATETIME": SqlType.timestamp(),
            "INT64": SqlType.big_int(),
            "INTEGER": SqlType.int(),
            "NUMERIC": SqlType.numeric(18, 9),
            "FLOAT64": SqlType.double(),
            "STRING": SqlType.text(),
            "TIME": SqlType.time(),
            "TIMESTAMP": SqlType.timestamp_tz(),
        }.get(source_column_type)

        if return_sql_type is None:
            error_message = "No Hyper SqlType defined for BigQuery source type: {}".format(
                source_column_type
            )
            logger.error(error_message)
            raise LookupError(error_message)

        logger.debug(
            "Translated source column type {} to Hyper SqlType {}".format(
                source_column_type, return_sql_type
            )
        )
        return return_sql_type
 TableDefinition.Column('market_normalized_name', SqlType.varchar(100), NULLABLE),
 TableDefinition.Column('brand_normalized_name', SqlType.varchar(30), NULLABLE),
 TableDefinition.Column('form_strength_normalized_name', SqlType.varchar(35), NULLABLE),
 TableDefinition.Column('normalized_name', SqlType.varchar(35), NULLABLE),
 TableDefinition.Column('competitor_flag', SqlType.bool(), NULLABLE),
 TableDefinition.Column('year', SqlType.int(), NULLABLE),
 TableDefinition.Column('semester_number', SqlType.int(), NULLABLE),
 TableDefinition.Column('quarter_number', SqlType.int(), NULLABLE),
 TableDefinition.Column('month_number', SqlType.int(), NULLABLE),
 TableDefinition.Column('week_number', SqlType.int(), NULLABLE),
 TableDefinition.Column('transaction_timestamp', SqlType.date(), NULLABLE),
 TableDefinition.Column('ddd_source_units', SqlType.int(), NULLABLE),
 TableDefinition.Column('ddd_source_units_uom', SqlType.varchar(30), NULLABLE),
 TableDefinition.Column('ddd_units', SqlType.int(), NULLABLE),
 TableDefinition.Column('ddd_units_uom', SqlType.varchar(30), NULLABLE),
 TableDefinition.Column('ddd_dollars', SqlType.numeric(10,4), NULLABLE),
 TableDefinition.Column('ddd_dot', SqlType.numeric(10,5), NULLABLE),
 TableDefinition.Column('ddd_mcg', SqlType.numeric(10,5), NULLABLE),
 TableDefinition.Column('ddd_normalized_units', SqlType.int(), NULLABLE),
 TableDefinition.Column('ddd_normalized_units_uom', SqlType.varchar(30), NULLABLE),
 TableDefinition.Column('chargeback_source_units', SqlType.int(), NULLABLE),
 TableDefinition.Column('chargeback_source_units_uom', SqlType.varchar(30), NULLABLE),
 TableDefinition.Column('chargeback_units', SqlType.int(), NULLABLE),
 TableDefinition.Column('chargeback_units_uom', SqlType.varchar(30), NULLABLE),
 TableDefinition.Column('chargeback_dollars', SqlType.numeric(10,4), NULLABLE),
 TableDefinition.Column('chargeback_dot', SqlType.numeric(10,5), NULLABLE),
 TableDefinition.Column('chargeback_mcg', SqlType.numeric(10,5), NULLABLE),
 TableDefinition.Column('chargeback_normalized_units', SqlType.numeric(10,5), NULLABLE),
 TableDefinition.Column('chargeback_normalized_units_uom', SqlType.varchar(30), NULLABLE),
 TableDefinition.Column('self_reported_source_units', SqlType.numeric(10,5), NULLABLE),
 TableDefinition.Column('self_reported_source_units_uom', SqlType.varchar(30), NULLABLE),
def sparkConnect():
    # fetching DF from spark filestore
    if cf.file_type == 'csv':
        df = spark.read.format(cf.file_type) \
            .option("inferSchema", cf.infer_schema) \
            .option("header", cf.first_row_is_header) \
            .option("sep", cf.delimiter) \
            .load(cf.input_file_path)
        # print('\n', cf.input_file_path, '\n', cf.schema, '\n')

    # fetching table from db from databricks
    elif cf.file_type == 'jdbc':
        df = spark.read.format("jdbc") \
            .option("driver", cf.driver) \
            .option("url", cf.url) \
            .option("dbtable", cf.table) \
            .option("user", cf.user) \
            .option("password", cf.password) \
            .option("inferSchema", cf.infer_schema) \
            .option("header", cf.first_row_is_header) \
            .load()

        df.write.format("csv") \
            .option("enoding", cf.charset) \
            .option("header", cf.first_row_is_header) \
            .option("sep", cf.delimiter) \
            .save('/home/hari/HyperConverter/test')

        # pdf = df.select('*').toPandas()
        # path = '/home/hari/HyperConverter/test.csv'
        # pdf.to_csv(path, sep=',', index=False)

        path = glob.glob('/home/hari/HyperConverter/test/part*.csv')
        cf.input_file_path = path[0]
        cf.input_file_path = path
        print('\n', cf.input_file_path, '\n')

    col = list(df.dtypes)
    print(col)
    print(len(col))
    for i in range(len(col)):
        col[i] = list(col[i])
        col[i][1] = type_[col[i][1]]
    # print('\n', col, '\n')

    x = []
    for i, j in col:
        print(i, j)
        if j == 'varchar':
            max_length = df.agg({i: "max"}).collect()[0]
            #print(max_length)
            xyz = max_length["max({})".format(i)]

            if xyz != None:
                max_length = len(xyz)
                if 19 <= max_length <= 40:
                    max_length = 100
                else:
                    max_length = 30
            else:
                max_length = 35
            print(max_length)
            x.append(
                TableDefinition.Column(i, SqlType.varchar(max_length),
                                       NULLABLE))
        elif j == 'int':
            x.append(TableDefinition.Column(i, SqlType.int(), NULLABLE))
        elif j == 'date':
            x.append(TableDefinition.Column(i, SqlType.date(), NULLABLE))
        elif j == 'numeric':
            x.append(
                TableDefinition.Column(i, SqlType.numeric(10, 4), NULLABLE))
        elif j == 'bool':
            x.append(TableDefinition.Column(i, SqlType.bool(), NULLABLE))
        elif j == 'big_int':
            x.append(TableDefinition.Column(i, SqlType.big_int(), NULLABLE))
        elif j == 'double':
            x.append(TableDefinition.Column(i, SqlType.double(), NULLABLE))
        elif j == 'text':
            print("this is culprate", i, j)
            x.append(TableDefinition.Column(i, SqlType.text(), NULLABLE))
    print(x)
    print(len(x))
    return x