Example #1
0
 def __init__(self):
     self.location = "./output/meli_challenge_result.hyper"
     self.test_location = "../output/meli_challenge_result.hyper"
     self.searchResult_table = TableDefinition('results', [
         TableDefinition.Column('id', SqlType.text(), Nullability.NOT_NULLABLE),
         TableDefinition.Column('site_id', SqlType.text(), Nullability.NOT_NULLABLE),
         TableDefinition.Column('title', SqlType.text(), Nullability.NOT_NULLABLE),
         TableDefinition.Column('seller', SqlType.text(), Nullability.NOT_NULLABLE),
         TableDefinition.Column('price', SqlType.text(), Nullability.NOT_NULLABLE),
         TableDefinition.Column('prices', SqlType.json(), Nullability.NOT_NULLABLE),
         TableDefinition.Column('sale_price', SqlType.text(), Nullability.NULLABLE),
         TableDefinition.Column('currency_id', SqlType.text(), Nullability.NULLABLE),
         TableDefinition.Column('available_quantity', SqlType.int(), Nullability.NULLABLE),
         TableDefinition.Column('sold_quantity', SqlType.int(), Nullability.NULLABLE),
         TableDefinition.Column('buying_mode', SqlType.text(), Nullability.NULLABLE),
         TableDefinition.Column('listing_type_id', SqlType.text(), Nullability.NULLABLE),
         TableDefinition.Column('stop_time', SqlType.text(), Nullability.NULLABLE),
         TableDefinition.Column('condition', SqlType.text(), Nullability.NULLABLE),
         TableDefinition.Column('permalink', SqlType.text(), Nullability.NULLABLE),
         TableDefinition.Column('thumbnail', SqlType.text(), Nullability.NULLABLE),
         TableDefinition.Column('accepts_mercadopago', SqlType.bool(), Nullability.NULLABLE),
         TableDefinition.Column('installments', SqlType.text(), Nullability.NULLABLE),
         TableDefinition.Column('address', SqlType.text(), Nullability.NULLABLE),
         TableDefinition.Column('shipping', SqlType.text(), Nullability.NULLABLE),
         TableDefinition.Column('seller_address', SqlType.text(), Nullability.NULLABLE),
         TableDefinition.Column('attributes', SqlType.text(), Nullability.NOT_NULLABLE),
         TableDefinition.Column('original_price', SqlType.text(), Nullability.NULLABLE),
         TableDefinition.Column('category_id', SqlType.text(), Nullability.NOT_NULLABLE),
         TableDefinition.Column('official_store_id', SqlType.int(), Nullability.NULLABLE),
         TableDefinition.Column('domain_id', SqlType.text(), Nullability.NULLABLE),
         TableDefinition.Column('catalog_product_id', SqlType.text(), Nullability.NULLABLE),
         TableDefinition.Column('tags', SqlType.text(), Nullability.NULLABLE),
         TableDefinition.Column('catalog_listing', SqlType.bool(), Nullability.NULLABLE),
         TableDefinition.Column('order_backend', SqlType.int(), Nullability.NULLABLE),
     ])
    def test_get_table_def(self):
        data = [
            (1001, 1, "Jane", "Doe", "2000-05-01", 29.0, False),
            (1002, 2, "John", "Doe", "1988-05-03", 33.0, False),
            (2201, 3, "Elonzo", "Smith", "1990-05-03", 21.0, True),
            (None, None, None, None, None, None, None)  # Test Nulls
        ]
        df = get_spark_session()\
            .createDataFrame(data, ["id", "dept_id", "first_name", "last_name", "dob", "age", "is_temp"])\
            .createOrReplaceTempView("employees")
        df = get_spark_session().sql(
            "select id, cast(dept_id as short), first_name, "
            "last_name, dob, age, is_temp from employees")
        table_def = get_table_def(df, "Extract", "Extract")

        # Ensure that the Table Name matches
        assert (table_def.table_name.name == Name("Extract"))

        # Ensure that the the TableDefinition column names match
        assert (table_def.get_column(0).name == Name("id"))
        assert (table_def.get_column(1).name == Name("dept_id"))
        assert (table_def.get_column(2).name == Name("first_name"))
        assert (table_def.get_column(3).name == Name("last_name"))
        assert (table_def.get_column(4).name == Name("dob"))
        assert (table_def.get_column(5).name == Name("age"))
        assert (table_def.get_column(6).name == Name("is_temp"))

        # Ensure that the column data types were converted correctly
        assert (table_def.get_column(0).type == SqlType.big_int())
        assert (table_def.get_column(1).type == SqlType.small_int())
        assert (table_def.get_column(2).type == SqlType.text())
        assert (table_def.get_column(3).type == SqlType.text())
        assert (table_def.get_column(4).type == SqlType.text())
        assert (table_def.get_column(5).type == SqlType.double())
        assert (table_def.get_column(6).type == SqlType.bool())
def convert_datatype(coldatatype):
    """
    [summary]
        This converts the datatype of the column of a given dataframe.
    
    Args:
        Datatype of the column in string format
    
    Returns:
        The tableau hyper extract compatible datatype after converting the dataframe datatype and a default value for NaN cases
    """

    datatype = SqlType.text()
    def_value = ''

    if 'datetime' in coldatatype.lower():
        datatype = SqlType.timestamp()
    elif 'str' in coldatatype.lower():
        datatype = SqlType.text()
    elif 'boolean' in coldatatype.lower():
        datatype = SqlType.bool()
    elif 'int' in coldatatype.lower():
        datatype = SqlType.int()
        def_value = 0
    elif 'float' in coldatatype.lower():
        datatype = SqlType.double()
        def_value = 0
    elif 'period' in coldatatype.lower():
        datatype = SqlType.interval()
    elif 'object' in coldatatype.lower():
        datatype = SqlType.text()
    else:
        datatype = SqlType.text()

    return (datatype, def_value)
Example #4
0
def convert_struct_field(column: StructField) -> TableDefinition.Column:
    """Converts a Spark StructField to a Tableau Hyper SqlType"""
    if column.dataType == IntegerType():
        sql_type = SqlType.int()
    elif column.dataType == LongType():
        sql_type = SqlType.big_int()
    elif column.dataType == ShortType():
        sql_type = SqlType.small_int()
    elif column.dataType == DoubleType():
        sql_type = SqlType.double()
    elif column.dataType == FloatType():
        sql_type = SqlType.double()
    elif column.dataType == BooleanType():
        sql_type = SqlType.bool()
    elif column.dataType == DateType():
        sql_type = SqlType.date()
    elif column.dataType == TimestampType():
        sql_type = SqlType.timestamp()
    elif column.dataType == StringType():
        sql_type = SqlType.text()
    else:
        # Trap the DecimalType case
        if str(column.dataType).startswith("DecimalType"):
            # Max precision is only up to 18 decimal places in Tableau Hyper API
            precision = column.dataType.precision if column.dataType.precision <= 18 else 18
            scale = column.dataType.scale
            sql_type = SqlType.numeric(precision, scale)
        else:
            raise ValueError(f'Invalid StructField datatype for column `{column.name}` : {column.dataType}')
    nullable = NULLABLE if column.nullable else NOT_NULLABLE
    return TableDefinition.Column(name=column.name, type=sql_type, nullability=nullable)
Example #5
0
    def __init__(self):
        """
        Handler for conversion of storage types between DSS and Tableau Hyper

        DSS storage types:

        "string","date","geopoint","geometry","array","map","object","double",
        "boolean","float","bigint","int","smallint","tinyint"

        Tableau Hyper storage types:

        TypeTag.BOOL, TypeTag.BIG_INT, TypeTag.SMALL_INT, TypeTag.INT, TypeTag.NUMERIC,
        TypeTag.DOUBLE, TypeTag.OID, TypeTag.BYTES, TypeTag.TEXT, TypeTag.VARCHAR, TypeTag.CHAR,
        TypeTag.JSON, TypeTag.DATE, TypeTag.INTERVAL, TypeTag.TIME, TypeTag.TIMESTAMP,
        TypeTag.TIMESTAMP_TZ, TypeTag.GEOGRAPHY

        """
        handle_null = lambda f: lambda x: None if pd.isna(x) else f(x)

        # Mapping DSS to Tableau Hyper types
        self.mapping_dss_to_hyper = {
            'array': (SqlType.text(), handle_null(str)),
            'bigint': (SqlType.big_int(), handle_null(int)),
            'boolean': (SqlType.bool(), handle_null(bool)),
            'date': (SqlType.timestamp(), handle_null(to_hyper_timestamp)),
            'double': (SqlType.double(), handle_null(float)),
            'float': (SqlType.double(), handle_null(float)),
            'geometry': (SqlType.text(), handle_null(str)),
            'geopoint': (SqlType.geography(), handle_null(to_hyper_geography)),
            'int': (SqlType.int(), handle_null(int)),
            'map': (SqlType.text(), handle_null(str)),
            'object': (SqlType.text(), handle_null(str)),
            'smallint': (SqlType.small_int(), handle_null(int)),
            'string': (SqlType.text(), handle_null(str)),
            'tinyint': (SqlType.small_int(), handle_null(int)),
        }

        # Mapping Tableau Hyper to DSS types
        self.mapping_hyper_to_dss = {
            TypeTag.BIG_INT: ('bigint', handle_null(int)),
            TypeTag.BYTES: ('string', handle_null(str)),
            TypeTag.BOOL: ('boolean', handle_null(bool)),
            TypeTag.CHAR: ('string', handle_null(str)),
            TypeTag.DATE: ('date', handle_null(to_dss_date)),
            TypeTag.DOUBLE: ('double', handle_null(float)),
            TypeTag.GEOGRAPHY: ('geopoint', handle_null(to_dss_geopoint)),
            TypeTag.INT: ('int', handle_null(int)),
            TypeTag.INTERVAL: ('string', handle_null(str)),
            TypeTag.JSON: ('string', handle_null(str)),
            TypeTag.NUMERIC: ('double', handle_null(float)),
            TypeTag.OID: ('string', handle_null(str)),
            TypeTag.SMALL_INT: ('smallint', handle_null(int)),
            TypeTag.TEXT: ('string', handle_null(str)),
            TypeTag.TIME: ('string', handle_null(str)),
            TypeTag.TIMESTAMP: ('date', handle_null(to_dss_timestamp)),
            TypeTag.TIMESTAMP_TZ: ('string', handle_null(str)),
            TypeTag.VARCHAR: ('string', handle_null(str))
        }
 def fn_convert_to_hyper_types(given_type):
     switcher = {
         'empty': SqlType.text(),
         'bool': SqlType.bool(),
         'int': SqlType.big_int(),
         'float-dot': SqlType.double(),
         'date-YMD': SqlType.date(),
         'date-MDY': SqlType.date(),
         'date-DMY': SqlType.date(),
         'time-24': SqlType.time(),
         'time-12': SqlType.time(),
         'datetime-24-YMD': SqlType.timestamp(),
         'datetime-12-MDY': SqlType.timestamp(),
         'datetime-24-DMY': SqlType.timestamp(),
         'str': SqlType.text()
     }
     identified_type = switcher.get(given_type)
     if identified_type is None:
         identified_type = SqlType.text()
     return identified_type
Example #7
0
    def _hyper_sql_type(self, source_column):
        """
        Finds the correct Hyper column type for source_column

        source_column (obj): Source column (Instance of google.cloud.bigquery.schema.SchemaField)

        Returns a tableauhyperapi.SqlType Object
        """

        source_column_type = source_column.field_type
        return_sql_type = {
            "BOOL": SqlType.bool(),
            "BYTES": SqlType.bytes(),
            "DATE": SqlType.date(),
            "DATETIME": SqlType.timestamp(),
            "INT64": SqlType.big_int(),
            "INTEGER": SqlType.int(),
            "NUMERIC": SqlType.numeric(18, 9),
            "FLOAT64": SqlType.double(),
            "STRING": SqlType.text(),
            "TIME": SqlType.time(),
            "TIMESTAMP": SqlType.timestamp_tz(),
        }.get(source_column_type)

        if return_sql_type is None:
            error_message = "No Hyper SqlType defined for BigQuery source type: {}".format(
                source_column_type
            )
            logger.error(error_message)
            raise LookupError(error_message)

        logger.debug(
            "Translated source column type {} to Hyper SqlType {}".format(
                source_column_type, return_sql_type
            )
        )
        return return_sql_type
    Connection,
    SqlType,
    TableDefinition,
    CreateMode,
    TableName,
    Inserter,
)

dtype_mapper = {
    "string": SqlType.text(),
    "str": SqlType.text(),
    "object": SqlType.text(),
    "O": SqlType.text(),
    "int64": SqlType.big_int(),
    "float64": SqlType.double(),
    "bool": SqlType.bool(),
    "datetime64[ns]": SqlType.timestamp(),
    "timedelta[ns]": SqlType.interval(),
    "category": SqlType.text(),
}


def read_hyper(path_to_hyper_file, custom_schema="Extract"):
    """Read a Tableau Hyper file and turn it into a Pandas DataFrame.

    Currently can only read single table extracts, which is Tableau's
    default way of creating an extract.

    Args:
        path_to_hyper_file: Specify the path to the .hyper file
        custom_schema: If you need to change the schema name. Defaults to "Extract"
 TableDefinition.Column('party_group_code', SqlType.varchar(30), NULLABLE),
 TableDefinition.Column('party_group_description', SqlType.varchar(30), NULLABLE),
 TableDefinition.Column('party_type_code', SqlType.varchar(30), NULLABLE),
 TableDefinition.Column('party_type_description', SqlType.varchar(30), NULLABLE),
 TableDefinition.Column('party_subtype_code', SqlType.varchar(30), NULLABLE),
 TableDefinition.Column('party_subtype_description', SqlType.varchar(30), NULLABLE),
 TableDefinition.Column('party_service_code', SqlType.varchar(30), NULLABLE),
 TableDefinition.Column('party_service_description', SqlType.varchar(30), NULLABLE),
 TableDefinition.Column('br_segment', SqlType.varchar(30), NULLABLE),
 TableDefinition.Column('br_sub_segment', SqlType.varchar(30), NULLABLE),
 TableDefinition.Column('channel_name', SqlType.varchar(30), NULLABLE),
 TableDefinition.Column('market_normalized_name', SqlType.varchar(100), NULLABLE),
 TableDefinition.Column('brand_normalized_name', SqlType.varchar(30), NULLABLE),
 TableDefinition.Column('form_strength_normalized_name', SqlType.varchar(35), NULLABLE),
 TableDefinition.Column('normalized_name', SqlType.varchar(35), NULLABLE),
 TableDefinition.Column('competitor_flag', SqlType.bool(), NULLABLE),
 TableDefinition.Column('year', SqlType.int(), NULLABLE),
 TableDefinition.Column('semester_number', SqlType.int(), NULLABLE),
 TableDefinition.Column('quarter_number', SqlType.int(), NULLABLE),
 TableDefinition.Column('month_number', SqlType.int(), NULLABLE),
 TableDefinition.Column('week_number', SqlType.int(), NULLABLE),
 TableDefinition.Column('transaction_timestamp', SqlType.date(), NULLABLE),
 TableDefinition.Column('ddd_source_units', SqlType.int(), NULLABLE),
 TableDefinition.Column('ddd_source_units_uom', SqlType.varchar(30), NULLABLE),
 TableDefinition.Column('ddd_units', SqlType.int(), NULLABLE),
 TableDefinition.Column('ddd_units_uom', SqlType.varchar(30), NULLABLE),
 TableDefinition.Column('ddd_dollars', SqlType.numeric(10,4), NULLABLE),
 TableDefinition.Column('ddd_dot', SqlType.numeric(10,5), NULLABLE),
 TableDefinition.Column('ddd_mcg', SqlType.numeric(10,5), NULLABLE),
 TableDefinition.Column('ddd_normalized_units', SqlType.int(), NULLABLE),
 TableDefinition.Column('ddd_normalized_units_uom', SqlType.varchar(30), NULLABLE),
def sparkConnect():
    # fetching DF from spark filestore
    if cf.file_type == 'csv':
        df = spark.read.format(cf.file_type) \
            .option("inferSchema", cf.infer_schema) \
            .option("header", cf.first_row_is_header) \
            .option("sep", cf.delimiter) \
            .load(cf.input_file_path)
        # print('\n', cf.input_file_path, '\n', cf.schema, '\n')

    # fetching table from db from databricks
    elif cf.file_type == 'jdbc':
        df = spark.read.format("jdbc") \
            .option("driver", cf.driver) \
            .option("url", cf.url) \
            .option("dbtable", cf.table) \
            .option("user", cf.user) \
            .option("password", cf.password) \
            .option("inferSchema", cf.infer_schema) \
            .option("header", cf.first_row_is_header) \
            .load()

        df.write.format("csv") \
            .option("enoding", cf.charset) \
            .option("header", cf.first_row_is_header) \
            .option("sep", cf.delimiter) \
            .save('/home/hari/HyperConverter/test')

        # pdf = df.select('*').toPandas()
        # path = '/home/hari/HyperConverter/test.csv'
        # pdf.to_csv(path, sep=',', index=False)

        path = glob.glob('/home/hari/HyperConverter/test/part*.csv')
        cf.input_file_path = path[0]
        cf.input_file_path = path
        print('\n', cf.input_file_path, '\n')

    col = list(df.dtypes)
    print(col)
    print(len(col))
    for i in range(len(col)):
        col[i] = list(col[i])
        col[i][1] = type_[col[i][1]]
    # print('\n', col, '\n')

    x = []
    for i, j in col:
        print(i, j)
        if j == 'varchar':
            max_length = df.agg({i: "max"}).collect()[0]
            #print(max_length)
            xyz = max_length["max({})".format(i)]

            if xyz != None:
                max_length = len(xyz)
                if 19 <= max_length <= 40:
                    max_length = 100
                else:
                    max_length = 30
            else:
                max_length = 35
            print(max_length)
            x.append(
                TableDefinition.Column(i, SqlType.varchar(max_length),
                                       NULLABLE))
        elif j == 'int':
            x.append(TableDefinition.Column(i, SqlType.int(), NULLABLE))
        elif j == 'date':
            x.append(TableDefinition.Column(i, SqlType.date(), NULLABLE))
        elif j == 'numeric':
            x.append(
                TableDefinition.Column(i, SqlType.numeric(10, 4), NULLABLE))
        elif j == 'bool':
            x.append(TableDefinition.Column(i, SqlType.bool(), NULLABLE))
        elif j == 'big_int':
            x.append(TableDefinition.Column(i, SqlType.big_int(), NULLABLE))
        elif j == 'double':
            x.append(TableDefinition.Column(i, SqlType.double(), NULLABLE))
        elif j == 'text':
            print("this is culprate", i, j)
            x.append(TableDefinition.Column(i, SqlType.text(), NULLABLE))
    print(x)
    print(len(x))
    return x