def compute_stats(self, tbl: LuxSQLTable): """ Function which computes the min and max values for each variable within the specified Lux DataFrame's SQL table. Populates the metadata parameters of the specified Lux DataFrame. Parameters ---------- tbl: lux.LuxSQLTable lux.LuxSQLTable object whose metadata will be calculated Returns ------- None """ # precompute statistics tbl.unique_values = {} tbl._min_max = {} length_query = pandas.read_sql(lux.config.query_templates['length_query'].format(table_name = tbl.table_name, where_clause = ""),lux.config.SQLconnection,) tbl._length = list(length_query["length"])[0] self.get_unique_values(tbl) for attribute in tbl.columns: if tbl._data_type[attribute] == "quantitative": min_max_query = pandas.read_sql(lux.config.query_templates['min_max_query'].format(attribute = attribute, table_name = tbl.table_name),lux.config.SQLconnection,) tbl._min_max[attribute] = (list(min_max_query["min"])[0],list(min_max_query["max"])[0],)
def compute_data_type(self, lst: LuxSQLTable): """ Function which the equivalent Pandas data type of each variable within the specified Lux DataFrame's SQL table. Populates the metadata parameters of the specified Lux DataFrame. Parameters ---------- lst: lux.LuxSQLTable lux.LuxSQLTable object whose metadata will be calculated Returns ------- None """ data_type = {} sql_dtypes = {} self.get_cardinality(lst) if "." in lst.table_name: table_name = lst.table_name[lst.table_name.index(".") + 1 :] else: table_name = lst.table_name # get the data types of the attributes in the SQL table for attr in list(lst.columns): datatype_query = "SELECT DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = '{}' AND COLUMN_NAME = '{}'".format( table_name, attr ) datatype = list(pandas.read_sql(datatype_query, lux.config.SQLconnection)["data_type"])[0] sql_dtypes[attr] = datatype for attr in list(lst.columns): if str(attr).lower() in ["month", "year"]: data_type[attr] = "temporal" elif sql_dtypes[attr] in [ "character", "character varying", "boolean", "uuid", "text", ]: data_type[attr] = "nominal" elif sql_dtypes[attr] in [ "integer", "numeric", "decimal", "bigint", "real", "smallint", "smallserial", "serial", "double precision", ]: if lst.cardinality[attr] < 13: data_type[attr] = "nominal" elif check_if_id_like(lst, attr): lst._data_type[attr] = "id" else: data_type[attr] = "quantitative" elif "time" in sql_dtypes[attr] or "date" in sql_dtypes[attr]: data_type[attr] = "temporal" lst._data_type = data_type
def get_unique_values(self, tbl: LuxSQLTable): """ Function which collects the unique values for each variable within the specified Lux DataFrame's SQL table. Populates the metadata parameters of the specified Lux DataFrame. Parameters ---------- tbl: lux.LuxSQLTable lux.LuxSQLTable object whose metadata will be calculated Returns ------- None """ unique_vals = {} for attr in list(tbl.columns): unique_query = 'SELECT Distinct("{}") FROM {} WHERE "{}" IS NOT NULL'.format( attr, tbl.table_name, attr ) unique_data = pandas.read_sql( unique_query, lux.config.SQLconnection, ) unique_vals[attr] = list(unique_data[attr]) tbl.unique_values = unique_vals
def get_cardinality(self, tbl: LuxSQLTable): """ Function which computes the cardinality for each variable within the specified Lux DataFrame's SQL table. Populates the metadata parameters of the specified Lux DataFrame. Parameters ---------- tbl: lux.LuxSQLTable lux.LuxSQLTable object whose metadata will be calculated Returns ------- None """ cardinality = {} for attr in list(tbl.columns): card_query = 'SELECT Count(Distinct("{}")) FROM {} WHERE "{}" IS NOT NULL'.format( attr, tbl.table_name, attr ) card_data = pandas.read_sql( card_query, lux.config.SQLconnection, ) cardinality[attr] = list(card_data["count"])[0] tbl.cardinality = cardinality
def get_SQL_attributes(self, tbl: LuxSQLTable): """ Retrieves the names of variables within a specified Lux DataFrame's Postgres SQL table. Uses these variables to populate the Lux DataFrame's columns list. Parameters ---------- tbl: lux.LuxSQLTable lux.LuxSQLTable object whose columns will be populated Returns ------- None """ if "." in tbl.table_name: table_name = tbl.table_name[self.table_name.index(".") + 1 :] else: table_name = tbl.table_name attr_query = "SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS where TABLE_NAME = '{}'".format( table_name ) attributes = list(pandas.read_sql(attr_query, lux.config.SQLconnection)["column_name"]) for attr in attributes: tbl[attr] = None tbl._setup_done = True
def get_SQL_attributes(self, tbl: LuxSQLTable): """ Retrieves the names of variables within a specified Lux DataFrame's Postgres SQL table. Uses these variables to populate the Lux DataFrame's columns list. Parameters ---------- tbl: lux.LuxSQLTable lux.LuxSQLTable object whose columns will be populated Returns ------- None """ if "." in tbl.table_name: table_name = tbl.table_name[self.table_name.index(".") + 1 :] else: table_name = tbl.table_name attr_query = lux.config.query_templates['table_attributes_query'].format( table_name = table_name, ) attributes = list(pandas.read_sql(attr_query, lux.config.SQLconnection)["column_name"]) for attr in attributes: tbl[attr] = None tbl._setup_done = True
def execute_sampling(tbl: LuxSQLTable): SAMPLE_FLAG = lux.config.sampling SAMPLE_START = lux.config.sampling_start SAMPLE_CAP = lux.config.sampling_cap SAMPLE_FRAC = 0.2 length_query = pandas.read_sql(lux.config.query_templates['length_query'].format(table_name = tbl.table_name, where_clause = ""),lux.config.SQLconnection,) limit = int(list(length_query["length"])[0]) * SAMPLE_FRAC sample_query = lux.config.query_templates['sample_query'].format(table_name = tbl.table_name, where_clause = "", num_rows = str(int(limit))) tbl._sampled = pandas.read_sql(sample_query, lux.config.SQLconnection)
def compute_data_type(self, tbl: LuxSQLTable): """ Function which the equivalent Pandas data type of each variable within the specified Lux DataFrame's SQL table. Populates the metadata parameters of the specified Lux DataFrame. Parameters ---------- tbl: lux.LuxSQLTable lux.LuxSQLTable object whose metadata will be calculated Returns ------- None """ data_type = {} self.get_cardinality(tbl) if "." in tbl.table_name: table_name = tbl.table_name[tbl.table_name.index(".") + 1 :] else: table_name = tbl.table_name # get the data types of the attributes in the SQL table for attr in list(tbl.columns): datatype_query = lux.config.query_templates['datatype_query'].format(table_name = table_name, attribute = attr) datatype = list(pandas.read_sql(datatype_query, lux.config.SQLconnection)["data_type"])[0] if str(attr).lower() in {"month", "year"} or "time" in datatype or "date" in datatype: data_type[attr] = "temporal" elif datatype in { "character", "character varying", "boolean", "uuid", "text", }: data_type[attr] = "nominal" elif datatype in { "integer", "numeric", "decimal", "bigint", "real", "smallint", "smallserial", "serial", "double", "double precision", }: if tbl.cardinality[attr] < 13: data_type[attr] = "nominal" elif check_if_id_like(tbl, attr): data_type[attr] = "id" else: data_type[attr] = "quantitative" tbl._data_type = data_type
def execute_sampling(tbl: LuxSQLTable): SAMPLE_FLAG = lux.config.sampling SAMPLE_START = lux.config.sampling_start SAMPLE_CAP = lux.config.sampling_cap SAMPLE_FRAC = 0.2 length_query = pandas.read_sql( "SELECT COUNT(*) as length FROM {}".format(tbl.table_name), lux.config.SQLconnection, ) limit = int(list(length_query["length"])[0]) * SAMPLE_FRAC tbl._sampled = pandas.read_sql( "SELECT * from {} LIMIT {}".format(tbl.table_name, str(limit)), lux.config.SQLconnection )
def compute_stats(self, tbl: LuxSQLTable): """ Function which computes the min and max values for each variable within the specified Lux DataFrame's SQL table. Populates the metadata parameters of the specified Lux DataFrame. Parameters ---------- tbl: lux.LuxSQLTable lux.LuxSQLTable object whose metadata will be calculated Returns ------- None """ # precompute statistics tbl.unique_values = {} tbl._min_max = {} length_query = pandas.read_sql( "SELECT COUNT(1) as length FROM {}".format(tbl.table_name), lux.config.SQLconnection, ) tbl._length = list(length_query["length"])[0] self.get_unique_values(tbl) for attribute in tbl.columns: if tbl._data_type[attribute] == "quantitative": min_max_query = pandas.read_sql( 'SELECT MIN("{}") as min, MAX("{}") as max FROM {}'.format( attribute, attribute, tbl.table_name ), lux.config.SQLconnection, ) tbl._min_max[attribute] = ( list(min_max_query["min"])[0], list(min_max_query["max"])[0], )
def get_unique_values(self, tbl: LuxSQLTable): """ Function which collects the unique values for each variable within the specified Lux DataFrame's SQL table. Populates the metadata parameters of the specified Lux DataFrame. Parameters ---------- tbl: lux.LuxSQLTable lux.LuxSQLTable object whose metadata will be calculated Returns ------- None """ unique_vals = {} for attr in list(tbl.columns): unique_query = lux.config.query_templates['unique_query'].format(attribute = attr, table_name = tbl.table_name) unique_data = pandas.read_sql(unique_query,lux.config.SQLconnection,) unique_vals[attr] = list(unique_data[attr]) tbl.unique_values = unique_vals
def get_cardinality(self, tbl: LuxSQLTable): """ Function which computes the cardinality for each variable within the specified Lux DataFrame's SQL table. Populates the metadata parameters of the specified Lux DataFrame. Parameters ---------- tbl: lux.LuxSQLTable lux.LuxSQLTable object whose metadata will be calculated Returns ------- None """ cardinality = {} for attr in list(tbl.columns): card_query = lux.config.query_templates['cardinality_query'].format(attribute = attr, table_name = tbl.table_name) card_data = pandas.read_sql(card_query,lux.config.SQLconnection,) cardinality[attr] = list(card_data["count"])[0] tbl.cardinality = cardinality
def compute_dataset_metadata(self, tbl: LuxSQLTable): """ Function which computes the metadata required for the Lux recommendation system. Populates the metadata parameters of the specified Lux DataFrame. Parameters ---------- tbl: lux.LuxSQLTable lux.LuxSQLTable object whose metadata will be calculated Returns ------- None """ if not tbl._setup_done: self.get_SQL_attributes(tbl) tbl._data_type = {} #####NOTE: since we aren't expecting users to do much data processing with the SQL database, should we just keep this ##### in the initialization and do it just once self.compute_data_type(tbl) self.compute_stats(tbl)
def execute_preview(lst: LuxSQLTable): lst._sampled = pandas.read_sql( "SELECT * from {} LIMIT 5".format(lst.table_name), lux.config.SQLconnection )