def _sql_regression_part1(weights, columns, table_name, dbms: str):
    """
    This method creates the portion of the SQL query responsible for the application of the dot product
    between regression weights and features.

    :param weights: the regression weights
    :param columns: the feature names
    :param table_name: the name of the table or the subquery where to read the data
    :param dbms: the name of the dbms
    :return: the portion of the SQL query which implements the regression dot products
    """

    if not isinstance(weights, Iterable):
        raise TypeError("Wrong data type for parameter weights. Only iterable data type is allowed.")

    if not isinstance(columns, Iterable):
        raise TypeError("Wrong data type for parameter columns. Only iterable data type is allowed.")

    if not isinstance(table_name, str):
        raise TypeError("Wrong data type for parameter table_name. Only string data type is allowed.")

    for weight in weights:
        if not isinstance(weight, float):
            raise TypeError("Wrong data type for weights elements. Only float data type is allowed.")

    for col in columns:
        if not isinstance(col, str):
            raise TypeError("Wrong data type for columns elements. Only string data type is allowed.")

    dbms_util = DBMSUtils()

    query = "SELECT "
    for i in range(len(columns)):
        col = dbms_util.get_delimited_col(dbms, columns[i])
        query += "({} * {}) AS {} ,".format(col, weights[i], col)
    query = query[:-1]  # remove the last ','

    query += " FROM {}".format(table_name)

    return query
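# Example (illustrative sketch): assuming DBMSUtils.get_delimited_col returns the column name
# unchanged for the given dbms, a call like
#   _sql_regression_part1([0.5, -1.2], ['age', 'income'], 'patients', 'mysql')
# produces:
#   "SELECT (age * 0.5) AS age ,(income * -1.2) AS income  FROM patients"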
def _sql_regression_part2(bias, columns, table_name, dbms: str):
    """
    This method creates the portion of the SQL query responsible for the application of the linear combination
    over the regression dot products.

    :param bias: the regression bias
    :param columns: the feature names
    :param table_name: the name of the table or the subquery where to read the data
    :param dbms: the name of the dbms
    :return: the portion of the SQL query which implements the regression dot product linear combination
    """

    if not isinstance(bias, float):
        raise TypeError("Wrong data type for parameter bias. Only float data type is allowed.")

    if not isinstance(columns, Iterable):
        raise TypeError("Wrong data type for parameter columns. Only iterable data type is allowed.")

    if table_name is not None:
        if not isinstance(table_name, str):
            raise TypeError("Wrong data type for parameter table_name. Only string data type is allowed.")

    for col in columns:
        if not isinstance(col, str):
            raise TypeError("Wrong data type for columns elements. Only string data type is allowed.")

    dbms_util = DBMSUtils()

    query = "SELECT "
    query += " ( "
    for col in columns:
        col = dbms_util.get_delimited_col(dbms, col)
        query += "{} +".format(col)
    query += "{}".format(bias)
    query += ") AS Score"

    query += " FROM {}".format(table_name)

    return query
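# Example (illustrative sketch, same assumption on get_delimited_col): with the weighted columns
# produced by _sql_regression_part1 available in a table or subquery named 'reg_dot_products'
# (hypothetical name), the call
#   _sql_regression_part2(0.1, ['age', 'income'], 'reg_dot_products', 'mysql')
# produces:
#   "SELECT  ( age +income +0.1) AS Score FROM reg_dot_products"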
def create_temp_table(self, params: dict):
    """
    This method creates the table that will store the regression parameters (i.e., weights and bias).

    :param params: the sdca regressor params
    :return: list of sql statements (create, insert and index) for the regression table creation
    """

    params = SDCARegressorSQL.check_params(params)

    create_stm = f'DROP TABLE IF EXISTS {self.temp_table_name};\n'
    create_stm += f'CREATE TABLE {self.temp_table_name} ({self.temp_table_pk} int, {self.temp_table_weight_col} float);\n'

    insert_stm = f'INSERT INTO {self.temp_table_name} VALUES\n'
    for widx, w in enumerate(params['weights']):
        # SQL Server accepts at most 1000 rows per INSERT statement, so start a new one when needed
        if self.dbms == 'sqlserver':
            if widx > 0 and widx % 1000 == 0:
                insert_stm = insert_stm[:-2]  # remove ',\n'
                insert_stm += ';\n\n'
                insert_stm += f'INSERT INTO {self.temp_table_name} VALUES\n'
        insert_stm += f'({widx}, {w}),\n'
    insert_stm = insert_stm[:-2]  # remove ',\n'
    insert_stm += ';\n'

    index_name = f'{self.temp_table_name}_{self.temp_table_pk}'
    index_stm = DBMSUtils.create_index(self.dbms, index_name, self.temp_table_name, self.temp_table_pk)

    # query = f'{create_stm}{insert_stm}{index_stm}'
    return [create_stm, insert_stm, index_stm]
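# Example (illustrative sketch): assuming the instance attributes are temp_table_name='sdca_weights',
# temp_table_pk='idx' and temp_table_weight_col='w' (hypothetical names), for params whose 'weights'
# entry is [0.5, -1.2] the method returns the create statement
#   "DROP TABLE IF EXISTS sdca_weights;\nCREATE TABLE sdca_weights (idx int, w float);\n"
# the insert statement
#   "INSERT INTO sdca_weights VALUES\n(0, 0.5),\n(1, -1.2);\n"
# and the index statement built by DBMSUtils.create_index.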
def _get_linear_combination(self, weights, bias, columns):
    """
    This method generates the linear combination component of the LogisticRegression function.

    :param weights: the weights for a target class
    :param bias: the bias for a target class
    :param columns: the feature names
    :return: the portion of SQL query responsible for the application of the linear combination component of the
             LogisticRegression function
    """

    dbms_util = DBMSUtils()

    query = ""
    for i in range(len(columns)):
        c = dbms_util.get_delimited_col(self.dbms, columns[i])
        query += "({}*{}) + ".format(c, weights[i])
    query = "{} {}".format(query, bias)

    return query
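# Example (illustrative sketch, assuming get_delimited_col returns names unchanged): for
# weights=[0.5, -1.2], bias=0.1 and columns=['age', 'income'] the generated fragment is
#   "(age*0.5) + (income*-1.2) +  0.1"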
def _get_query_dense_ohe(ohe_params: dict, table_name: str, dbms: str):
    """
    This method creates the SQL query that implements a dense one-hot-encoding transformation.

    :param ohe_params: dictionary containing the parameters extracted from the fitted OneHotEncoder
    :param table_name: the table name or the previous subquery where to read the data
    :param dbms: the name of the dbms
    :return: the SQL query that implements a dense one-hot-encoding transformation
    """

    ohe_params = OneHotEncoderSQL.check_ohe_params(ohe_params)
    assert isinstance(table_name, str)
    assert isinstance(dbms, str)

    ohe_map = ohe_params["ohe_encoding"]
    remaining_features = ohe_params["other_features"]

    dbms_utils = DBMSUtils()

    ohe_query = "SELECT "

    # implement one-hot encoding in SQL
    for feature_after_ohe in ohe_map:
        # feature_after_ohe = ohe_feature_map["feature_after_ohe"]
        fao = dbms_utils.get_delimited_col(dbms, feature_after_ohe)
        ohe_feature_map = ohe_map[feature_after_ohe]
        feature_before_ohe = dbms_utils.get_delimited_col(dbms, ohe_feature_map["feature_before_ohe"])
        value = ohe_feature_map["value"]

        ohe_query += "CASE WHEN {} = '{}' THEN 1 ELSE 0 END AS {},\n".format(feature_before_ohe, value, fao)

    # add the remaining features to the selection
    for f in remaining_features:
        ohe_query += "{},".format(dbms_utils.get_delimited_col(dbms, f))
    ohe_query = ohe_query[:-1]  # remove the last ','

    ohe_query += " FROM {}".format(table_name)

    return ohe_query
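# Example (illustrative sketch, assuming plain column names): for an encoding that maps the column
# 'color' to the one-hot features 'color_red' and 'color_blue', with 'age' as the only remaining
# feature and 'data' as the input table, the generated query is:
#   "SELECT CASE WHEN color = 'red' THEN 1 ELSE 0 END AS color_red,
#    CASE WHEN color = 'blue' THEN 1 ELSE 0 END AS color_blue,
#    age FROM data"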
def query(self, table_name):
    """
    This method creates the SQL query that implements a One Hot Encoding in SQL.

    :param table_name: the table name or the previous subquery where to read the data
    :return: the SQL query that implements the One Hot Encoding
    """

    assert isinstance(table_name, str), "Wrong data type for param 'table_name'."
    assert self.params is not None, "No ohe params extracted."
    assert self.mode is not None, "No mode selected."

    dbms_util = DBMSUtils()
    auto_inc = dbms_util.get_auto_increment_col(self.dbms)

    # if the table provided in input is the result of a previous query
    if len(table_name) > 7 and table_name[-7:] == 'AS data':
        real_tab_name = 'data'
    else:  # only a table name is provided
        real_tab_name = table_name
    self.input_table_name = f'(select {auto_inc} AS {self.ohe_table_pk}, {real_tab_name}.* FROM {table_name}) AS T'

    # create the SQL query that performs the One Hot Encoding
    pre_ohe_queries = None
    if self.mode == 'dense':
        ohe_query = self._get_query_dense_ohe(self.params, table_name, dbms=self.dbms)
    elif self.mode == 'sparse':
        pre_ohe_queries, ohe_query = self._get_query_sparse_ohe(self.params)
    else:
        raise ValueError(f"Wrong mode ({self.mode}).")

    return pre_ohe_queries, ohe_query
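# Illustrative note on the input table handling above: a plain table name is used as-is
# ('patients' -> real_tab_name = 'patients'), while a subquery ending with the alias 'AS data'
# (e.g. '(SELECT ... FROM patients) AS data') is referenced through that alias
# (real_tab_name = 'data') when building input_table_name.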
def get_column(url_connection, table_name, column_name):

    url_connection = check_url_connection(url_connection)

    dbms = 'mysql'
    if 'mssql' in url_connection:
        dbms = 'sqlserver'

    try:
        engine = create_engine(url_connection)
        with engine.connect() as connection:
            res = connection.execute("select {} from {}".format(
                DBMSUtils.get_delimited_col(dbms, column_name), table_name))
            labels = [x[0] for x in res]
        return labels
    except SQLAlchemyError as e:
        logging.error(e)
        return None
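# Example usage (illustrative, with a hypothetical connection string):
#   ages = get_column('mysql+pymysql://user:pwd@localhost/mydb', 'patients', 'age')
# returns the values of the 'age' column as a list, or None if a SQLAlchemyError occurs.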
def create_temp_table(self, params: dict):
    """
    This method creates the tables (one per class) that will store the logistic regression parameters
    (i.e., weights and bias).

    :param params: the logistic regression params
    :return: list of sql statements for the creation of the logistic regression tables
    """

    params = LogisticRegressionSQL.check_params(params)

    query_list = []
    # query = ''
    for class_idx in range(len(params['weights'])):
        tab_name = f'{self.temp_table_name}_{class_idx}'
        class_weights = params['weights'][class_idx]

        create_stm = f'DROP TABLE IF EXISTS {tab_name};\n'
        create_stm += f'CREATE TABLE {tab_name} ({self.temp_table_pk} int, {self.temp_table_weight_col} float);\n'

        insert_stm = f'INSERT INTO {tab_name} VALUES\n'
        for widx, w in enumerate(class_weights):
            # SQL Server accepts at most 1000 rows per INSERT statement, so start a new one when needed
            if self.dbms == 'sqlserver':
                if widx > 0 and widx % 1000 == 0:
                    insert_stm = insert_stm[:-2]  # remove ',\n'
                    insert_stm += ';\n\n'
                    insert_stm += f'INSERT INTO {tab_name} VALUES\n'
            insert_stm += f'({widx}, {w}),\n'
        insert_stm = insert_stm[:-2]  # remove ',\n'
        insert_stm += ';\n'

        index_name = f'{tab_name}_{self.temp_table_pk}'
        index_stm = DBMSUtils.create_index(self.dbms, index_name, tab_name, self.temp_table_pk)

        # query += f'{create_stm}{insert_stm}{index_stm}'
        query_list += [create_stm, insert_stm, index_stm]

    return query_list
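# Illustrative note: for a fitted model with 3 classes, this method returns 9 statements, i.e. a
# (create, insert, index) triple for each of the per-class weight tables
# <temp_table_name>_0, <temp_table_name>_1 and <temp_table_name>_2.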
def _get_query_sparse_ohe(self, ohe_params: dict):
    """
    This method creates an SQL query that implements a sparse ohe transformation. The query is composed of a
    main CASE statement that replicates the OHE mapping. For high dimensional data, it is not possible to encode
    the mapping inside a single CASE statement, because of the limit on the number of WHEN statements that can
    be inserted in a query. In this case multiple queries are generated and for each of them the maximum number
    of WHEN statements allowed is considered. Each query result is saved into a temporary table.

    :param ohe_params: dictionary containing the parameters extracted from the fitted OneHotEncoder
    :return: the SQL query that implements the sparse One Hot Encoding transformation
    """

    ohe_params = OneHotEncoderSQL.check_ohe_params(ohe_params)
    ohe_feature_map = ohe_params['ohe2idx_map']
    original_ohe_features = ohe_params['ohe_features']

    ohe_query = ""

    # considering that each feature after the ohe is used to create a WHEN statement, it is needed to check
    # whether this number is greater than the maximum number of WHEN statements that can be included in a
    # single CASE statement. For SQLSERVER the maximum number of WHEN statements is 9700.
    # https://www.sqlservercentral.com/forums/topic/maximum-number-of-when-then-lines-in-a-case-statement
    sql_max_when_statements = 9700
    # sql_max_when_statements = 100

    # if the OHE is applied directly on the original table then a temporary table is created to store OHE
    # results in a triplet data format
    warn_message = "A temporary table 'ohe_table' will be created."
    logging.warning(warn_message)

    # add to the ohe query the SQL statement for the creation of the intermediate ohe table
    create_ohe_table_query = f"DROP TABLE IF EXISTS {self.ohe_table_name};\n"
    create_ohe_table_query += f"CREATE TABLE {self.ohe_table_name}({self.ohe_table_pk} int, "
    create_ohe_table_query += f"{self.ohe_table_fval_col} float, {self.ohe_table_fidx_col} int);\n\n"
    # create_ohe_table_query += f" PRIMARY KEY({self.ohe_table_pk}, {self.ohe_table_fidx_col}));\n\n"
    # ohe_query += create_ohe_table_query

    # split, if needed, the OHEed features in batches smaller than the SQL limits
    ohe_feature_map_batches = []
    num_batch = 1
    # loop over OHEed columns
    for col in ohe_feature_map:
        feature_map = ohe_feature_map[col]
        num_ohe_features_per_col = len(feature_map)

        # check if the number of features derived from the current OHEed column is greater than the DBMS limits
        if num_ohe_features_per_col > sql_max_when_statements:
            # split the query in multiple batch queries
            batch_size = sql_max_when_statements
            if num_ohe_features_per_col % batch_size == 0:
                num_batch = num_ohe_features_per_col // batch_size
            else:
                num_batch = num_ohe_features_per_col // batch_size + 1

            feature_map_vals = list(feature_map.items())
            # loop over the number of batch
            for i in range(num_batch):
                # select a partition of the features after ohe
                batch_ohe_feature_map = dict(feature_map_vals[i * batch_size:i * batch_size + batch_size])
                ohe_feature_map_batches.append({col: batch_ohe_feature_map})
        else:
            ohe_feature_map_batches.append({col: feature_map})

    # loop over the batches
    ohe_sub_queries = []
    for ohe_feature_map_batch in ohe_feature_map_batches:
        # create the SQL query that applies the One Hot Encoding on the selected features
        batch_mode = False
        if num_batch > 1:
            batch_mode = True
        ohe_batch_query = self._create_ohe_query(ohe_feature_map_batch, original_ohe_features,
                                                 batch_mode=batch_mode)
        ohe_sub_queries.append(ohe_batch_query)

    # optimization: combine multiple ohe batch queries to reduce the total number of INSERT statements
    cum_sum = 0
    current_combined_suq_queries = []
    for j in range(len(ohe_feature_map_batches)):
        suq_query = ohe_sub_queries[j]
        ohe_feature_map_batch = ohe_feature_map_batches[j]
        ohe_batch_query_size = len(list(ohe_feature_map_batch.values())[0])
        cum_sum += ohe_batch_query_size
        current_combined_suq_queries.append(suq_query)

        if cum_sum > sql_max_when_statements:
            cum_sum = ohe_batch_query_size
            list_joint_sub_queries = current_combined_suq_queries[:-1]
            current_combined_suq_queries = [current_combined_suq_queries[-1]]

            joint_sub_queries = ""
            for sub_query in list_joint_sub_queries:
                joint_sub_queries += "{}\n\n UNION ALL \n\n".format(sub_query[:-3])  # remove ';\n\n'
            joint_sub_queries = joint_sub_queries[:-15] + ";"  # remove '\n\n UNION ALL \n\n'

            # the combined sub query results have to be saved in the temporary ohe table with an
            # INSERT statement
            insert_stm = f"INSERT INTO {self.ohe_table_name}\n"

            ohe_query += "{}{}\n\n".format(insert_stm, joint_sub_queries)

    # combine the last ohe sub queries
    joint_sub_queries = ""
    for sub_query in current_combined_suq_queries:
        joint_sub_queries += "{}\n\n UNION ALL \n\n".format(sub_query[:-3])  # remove ';\n\n'
    joint_sub_queries = joint_sub_queries[:-15] + ";"  # remove '\n\n UNION ALL \n\n'

    # the combined sub query results have to be saved in the temporary ohe table with an INSERT statement
    insert_stm = f"INSERT INTO {self.ohe_table_name}\n"

    ohe_query += "{}{}\n\n".format(insert_stm, joint_sub_queries)

    # create an index on the ohe table
    index_ohe = DBMSUtils.create_index(dbms=self.dbms, index_name=f'{self.ohe_table_name}_{self.ohe_table_pk}',
                                       target_table=self.ohe_table_name, target_col=self.ohe_table_pk)
    index_ohe += DBMSUtils.create_index(dbms=self.dbms,
                                        index_name=f'{self.ohe_table_name}_{self.ohe_table_fidx_col}',
                                        target_table=self.ohe_table_name,
                                        target_col=self.ohe_table_fidx_col)
    # ohe_query += index_ohe

    return [create_ohe_table_query, ohe_query, index_ohe], f'select * from {self.ohe_table_name}'
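# Worked example (illustrative): with sql_max_when_statements = 9700, a categorical column with
# 25,000 distinct values is split into ceil(25000 / 9700) = 3 batches of 9700, 9700 and 5600
# WHEN clauses. The batch sub-queries are combined with UNION ALL and inserted into the temporary
# ohe table, which stores the encoding as (row id, feature value, feature index) triplets.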
def post(self, request):
    # Params
    scenario = create_simulation_scenario(request.data)

    try:
        check_simulation_consistency(scenario)
    except ValueError as e:
        return Response({
            'detail': str(e),
        }, status=status.HTTP_400_BAD_REQUEST)

    # Get Dataset
    t_load_start = datetime.now()
    ds = get_table(scenario['db_url'], scenario['table'])
    features = ds.columns.to_list()
    if scenario['labels_type'] == 'column':
        # Remove Label column if exists
        features.remove(scenario['labels'])
    t_load_end = datetime.now()

    # ML Manager
    manager = MLManager()

    t_ml = []
    t_db = []

    # Testing Phase
    for i in range(scenario['batch_number']):
        ds_batch = get_batch(ds, i, scenario['batch_size'])
        if ds_batch.empty:
            break

        # Execute predict using MLManager and ML Library
        t_start = datetime.now()
        _ = manager.predict(ds[features], scenario['pipeline'].file)
        t_end = datetime.now()
        t_ml.append(t_end - t_start)

        # Create Batch for DBMS
        ds_batch.to_sql('batch', con=scenario["db_url"], if_exists="replace", index=False)

        # Generate query using MLManager
        dbms = DBMSUtils.get_dbms_from_str_connection(scenario['db_url'])
        queries, query = manager.generate_query(scenario['pipeline'].file, scenario['table'], features, dbms,
                                                scenario['optimizer'])

        # Execute query
        t_start = datetime.now()
        _ = execute_multi_queries(scenario["db_url"], queries)
        t_end = datetime.now()
        t_db.append(t_end - t_start)

    # Finish Simulation
    return Response(
        {
            'detail': 'Successfully predicted result',
            'ml_results': {
                'execution_time': (np.mean(t_ml) + (t_load_end - t_load_start))
            },
            'dbms_results': {
                'execution_time': np.mean(t_db)
            },
        },
        status=status.HTTP_200_OK)
def post(self, request):
    scenario = create_test_scenario(request.data)
    inference_time = 0

    try:
        check_test_consistency(scenario)
    except ValueError as e:
        return Response({
            'detail': str(e),
        }, status=status.HTTP_400_BAD_REQUEST)

    # Load pipeline model from pipeline file
    pipeline = load_model(scenario.pipeline.file)
    if not pipeline:
        return Response(
            {'detail': 'The selected file isn\'t a pipeline object'},
            status=status.HTTP_400_BAD_REQUEST)

    # Extract pipeline information from loaded model
    pipeline = MLManager.extract_pipeline_components(pipeline)
    if not pipeline:
        return Response(
            {'detail': 'The selected file isn\'t a pipeline object'},
            status=status.HTTP_400_BAD_REQUEST)

    # Model params
    scenario.model = pipeline.get('model')
    scenario.transforms = json.dumps(pipeline.get('transforms', []))

    # Dataset
    if scenario.run_db:
        # Get features from table
        features = get_columns(scenario.db_url, scenario.table)
    else:
        data_extractor_start = datetime.now()
        # Get Dataset
        ds = get_dataset(scenario)
        features = ds.columns.to_list()
        data_extractor_end = datetime.now()
        data_extractor_time = (data_extractor_end - data_extractor_start).total_seconds()
        inference_time += data_extractor_time

    if scenario.labels_type == 'column':
        # Remove Label column if exists
        features.remove(scenario.labels)

    # ML Manager
    manager = MLManager()

    # Testing Phase
    query = None
    if scenario.run_db:
        inference_start = datetime.now()

        # Generate query using MLManager
        dbms = DBMSUtils.get_dbms_from_str_connection(scenario.db_url)
        queries, query = manager.generate_query(scenario.pipeline.file, scenario.table, features, dbms,
                                                scenario.optimizer)

        # Execute query
        y_pred = execute_multi_queries(scenario.db_url, queries)
        y_pred = pd.Series(y_pred.iloc[:, 0], name='Label')

        inference_end = datetime.now()
        inference_time += (inference_end - inference_start).total_seconds()
    else:
        inference_start = datetime.now()

        # Execute predict using MLManager and ML Library
        y_pred = manager.predict(ds[features], scenario.pipeline.file)
        y_pred = pd.Series(y_pred, name='Label')

        inference_end = datetime.now()
        inference_time += (inference_end - inference_start).total_seconds()

    # Label
    labels = []

    # Extract labels
    if scenario.labels_type:
        if scenario.labels_type == 'file':
            # Get labels from file
            labels_document = get_document_object(scenario.labels)
            labels = get_dataframe(labels_document.file)
            if labels is None:
                return Response(
                    {
                        'detail': 'The selected labels file {} isn\'t valid'.format(labels_document.filename)
                    },
                    status=status.HTTP_400_BAD_REQUEST)
            # Get first column from file
            labels = labels.iloc[:, 0].to_list()
        elif scenario.labels_type == 'table':
            # Get labels from table
            labels = get_table(scenario.db_url, scenario.labels)
            if not labels:
                return Response(
                    {
                        'detail': 'The selected table {} isn\'t valid for label'.format(scenario.labels)
                    },
                    status=status.HTTP_400_BAD_REQUEST)
            # Get first column from table
            labels = labels.iloc[:, 0].to_list()
        elif scenario.labels_type == 'column' and not scenario.run_db:
            # Get labels from column
            labels = ds[scenario.labels].to_list()
        elif scenario.labels_type == 'column' and scenario.run_db:
            # Get labels from table
            labels = get_column(scenario.db_url, scenario.table, scenario.labels)
        else:
            return Response({'detail': 'The selected labels type isn\'t valid'},
                            status=status.HTTP_400_BAD_REQUEST)

    # Compute evaluation
    res_evaluation = None
    if labels and scenario.metric:
        res_evaluation = manager.evaluate(scenario.metric, labels, y_pred)

    # Create predictions file
    test_result_name = "test_{}_{}.csv".format(scenario.model, datetime.now())
    test_result_name = test_result_name.replace(' ', '_')
    test_result_name = test_result_name.replace(':', '_')
    y_pred.to_csv(test_result_name, index=False, header=True)

    # Save predictions in Document model
    f = open(test_result_name, 'rb')
    document = Document(file=File(f), filename=test_result_name)
    document.save()
    f.close()

    # Remove temporary predictions file
    os.remove(test_result_name)

    # Save Scenario model
    scenario.save()

    # Save ResultScenario
    result_scenario = ResultScenario()
    result_scenario.scenario = scenario
    result_scenario.execution_time = inference_time
    result_scenario.throughput = result_scenario.execution_time / len(y_pred)
    result_scenario.score = res_evaluation
    result_scenario.file_result = document.filename
    result_scenario.query = query
    result_scenario.save()

    return Response(
        {
            'detail': 'Successfully predicted result',
            'filename': test_result_name,
            'scenario_id': scenario.id
        },
        status=status.HTTP_201_CREATED)
def get_sql_nested_rules(tree: BaseDecisionTree, feature_names: list, is_classification: bool, dbms: str,
                         merge_ohe_features: dict = None):
    """
    This method extracts the rules from a BaseDecisionTree object and converts them into SQL.

    :param tree: BaseDecisionTree object
    :param feature_names: list of feature names
    :param is_classification: boolean flag that indicates whether the DTM is used in classification or regression
    :param dbms: the name of the dbms
    :param merge_ohe_features: (optional) ohe feature map to be merged in the decision rules
    :return: string containing the SQL query
    """

    assert isinstance(tree, BaseDecisionTree), "Only BaseDecisionTree data type is allowed for param 'tree'."
    assert isinstance(feature_names, list), "Only list data type is allowed for param 'feature_names'."
    for f in feature_names:
        assert isinstance(f, str)
    assert isinstance(is_classification, bool), "Only bool data type is allowed for param 'is_classification'."
    if merge_ohe_features is not None:
        DTMSQL._check_merge_ohe_features(merge_ohe_features)

    dbms_util = DBMSUtils()

    # get for each node the left and right child nodes, thresholds and features
    left = tree.tree_.children_left  # left child for each node
    right = tree.tree_.children_right  # right child for each node
    thresholds = tree.tree_.threshold  # test threshold for each node
    features = [feature_names[i] for i in tree.tree_.feature]
    # features = tree.tree_.feature  # indexes of the features used by the tree
    if is_classification:
        classes = tree.classes_

    def visit_tree(node):

        # leaf node
        if left[node] == -1 and right[node] == -1:
            if is_classification:
                return " {} ".format(classes[np.argmax(tree.tree_.value[node][0])])
            else:
                return " {} ".format(tree.tree_.value[node][0][0])

        # internal node
        op = '<='
        feature = features[node]
        thr = thresholds[node]
        if merge_ohe_features is not None:
            # if ohe features have to be merged in the decision tree, the tree conditions are changed:
            #   feature_after_ohe > 0.5   becomes   original_cat_feature = val
            #   feature_after_ohe <= 0.5  becomes   original_cat_feature <> val
            if feature in merge_ohe_features:  # only categorical features should pass this test
                mof = merge_ohe_features[feature]
                feature = mof['feature_before_ohe']
                thr = "'{}'".format(mof['value'])
                op = '<>'

        sql_dtm_rule = f" CASE WHEN {dbms_util.get_delimited_col(dbms, feature)} {op} {thr} THEN"

        # check if the current node has a left child
        if left[node] != -1:
            sql_dtm_rule += visit_tree(left[node])

        sql_dtm_rule += "ELSE"

        # check if the current node has a right child
        if right[node] != -1:
            sql_dtm_rule += visit_tree(right[node])

        sql_dtm_rule += "END "

        return sql_dtm_rule

    # start the tree visit from the root node
    root = 0
    sql_dtm_rules = visit_tree(root)

    return sql_dtm_rules
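# Example (illustrative sketch): assuming DBMSUtils.get_delimited_col returns the column name
# unchanged, a depth-1 classification tree that splits on 'age' at 30.5 and predicts classes 0/1
# is converted into:
#   " CASE WHEN age <= 30.5 THEN 0 ELSE 1 END "
# With merge_ohe_features, a split on a one-hot column such as 'color_red' (threshold 0.5) is
# rewritten as a condition on the original column, e.g. "CASE WHEN color <> 'red' THEN ... ELSE ... END".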
def convert_rules_to_sql(rules: list, dbms: str, merge_ohe_features: dict = None):
    """
    This method converts the rules extracted from a BaseDecisionTree object into SQL case statements.

    :param rules: rules extracted from a BaseDecisionTree object
    :param dbms: the name of the dbms
    :param merge_ohe_features: (optional) ohe feature map to be merged in the decision rules
    :return: string containing the SQL query
    """

    DTMSQL._check_rule_format(rules)
    if merge_ohe_features is not None:
        DTMSQL._check_merge_ohe_features(merge_ohe_features)

    dbms_util = DBMSUtils()

    sql_query = ""
    sql_string = " CASE WHEN "
    # loop over the rules
    for item in rules:

        if not isinstance(item, tuple):  # the item is a leaf score

            sql_string = sql_string[:-5]  # remove the trailing ' and ' (or 'WHEN ' for a leaf-only tree)
            if sql_string == ' CASE ':  # case a tree is composed of only a leaf
                sql_query += str(item)
            else:
                sql_string += " THEN {} ".format(item)
                sql_query += sql_string
                sql_string = "WHEN "

        else:  # the item is a rule condition

            op = item[1]
            thr = item[2]
            if op == 'l':
                op = '<='
            elif op == 'r':
                op = '>'
            else:  # when op is equal to '=' or '<>' the thr is a string
                thr = "'{}'".format(thr)

            feature_name = item[3]

            if merge_ohe_features is not None:
                # if ohe features have to be merged in the decision tree, the tree conditions are changed:
                #   feature_after_ohe > 0.5   becomes   original_cat_feature = val
                #   feature_after_ohe <= 0.5  becomes   original_cat_feature <> val
                if feature_name in merge_ohe_features:  # only categorical features should pass this test
                    mof = merge_ohe_features[feature_name]
                    feature_name = mof['feature_before_ohe']
                    old_op = op[:]
                    op = '='
                    if old_op == '<=':
                        if 0 <= thr:
                            op = '<>'
                    elif old_op == '>':
                        if 0 > thr:
                            op = '<>'
                    else:
                        raise ValueError("Wrong op.")
                    thr = "'{}'".format(mof['value'])

            feature_name = dbms_util.get_delimited_col(dbms, feature_name)

            sql_string += "{} {} {} and ".format(feature_name, op, thr)

    if 'CASE' in sql_query:  # ignore the case where a tree is composed of only a leaf
        sql_query += "END "

    return sql_query
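# Example (illustrative sketch): each tuple is read as (node info, op, threshold, feature), where the
# first element is not used by this method, and 'l'/'r' mark the left/right branch of a split.
# Assuming get_delimited_col returns the name unchanged, the rule list
#   [(0, 'l', 3.0, 'f1'), 0, (0, 'r', 3.0, 'f1'), 1]
# is converted into:
#   " CASE WHEN f1 <= 3.0 THEN 0 WHEN f1 > 3.0 THEN 1 END "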
def main(data_conf, pipeline_conf, str_db_conn, task, optimizer, debug=False):

    data_conf['str_db_conn'] = str_db_conn
    data_conf = check_data_config(data_conf)

    train = data_conf['train']
    y_train = data_conf['y_train']
    test = data_conf['test']
    y_test = data_conf['y_test']
    test_table_name = data_conf['test_table_name']
    features = list(data_conf['train'].columns)
    conn = data_conf['db_conn']

    tasks = ['regression', 'binary_classification', 'multi_classification']
    if task not in tasks:
        raise ValueError(f"Wrong task {task}. Available tasks: {tasks}")

    if task == 'regression':
        eval_fn = evaluate_regression_results
    elif task == 'binary_classification':
        eval_fn = evaluate_binary_classification_results
    else:
        eval_fn = evaluate_multi_classification_results

    check_pipeline_config(pipeline_conf, features)
    model_name = pipeline_conf['model']['name']

    mlmanager = MLManager()
    pipeline = create_pipeline(pipeline_conf)

    # fit
    print("\nStarting training...")
    pipeline.fit(train, y_train)
    _check_fitted_pipeline(pipeline, model_name, train)
    print("Training completed.\n")

    fitted_model = pipeline.steps[1][1]

    # ML predict
    print("\nStarting the ML inference...")
    ml_preds = pipeline.predict(test)
    ml_preds = pd.Series(ml_preds)
    print(ml_preds[:10])
    eval_fn(model_name, y_test, ml_preds)
    print("ML inference completed.\n")

    # SQL conversion
    print("\nStarting the SQL conversion...")
    pipeline = extract_pipeline(pipeline)
    dbms = DBMSUtils.get_dbms_from_str_connection(data_conf['str_db_conn'])
    queries, all_query = create_query(pipeline, mlmanager, features, test_table_name, optimizer, dbms, debug)
    print("SQL Conversion completed.\n")

    # SQL predict
    print("\nStarting the SQL inference...")
    for q in queries[:-1]:
        try:
            for qq in q.split(';'):
                conn.execute(qq)
        except Exception as e:
            pass

    try:
        sql_preds = pd.read_sql(queries[-1], conn)
    except Exception as e:
        logging.error(e.args[0])
        return
    sql_preds = pd.Series(sql_preds.iloc[:, 0])
    print(sql_preds[:10])

    null_val = False
    if sql_preds.isnull().sum() == 0:
        eval_fn(f"{model_name} SQL", y_test, sql_preds)
    else:
        null_val = True
    print("SQL inference completed.\n")

    # Null value test
    if null_val:
        print("\nNull value test")
        null_val_cnt = 0
        for sample_id in sql_preds[sql_preds.isnull()].index:
            print(sample_id)
            for (attr, val) in zip(test.columns, test.iloc[sample_id, :].values):
                print("\t", attr, '=', val)
            null_val_cnt += 1
        print(f"Found {null_val_cnt} null predictions.")

    # Accuracy test
    print("\nAccuracy test")
    equals = False
    for prec in range(10, 0, -1):
        ml_preds = ml_preds.map(lambda x: round(x, prec))
        sql_preds = sql_preds.map(lambda x: round(x, prec))

        if ml_preds.equals(sql_preds):
            print(f"The prediction scores are equal with {prec} decimal precision.")
            print(":)")
            equals = True
            break

    if not equals:
        print("The prediction scores are not equal.")
        print(":(\n")

        ne_preds = 0
        for i in range(len(ml_preds)):
            ml_pred = ml_preds.iloc[i]
            sql_pred = sql_preds.iloc[i]
            if ml_pred != sql_pred:
                if debug:
                    print(i, ml_pred, sql_pred)
                    for (attr, val) in zip(test.columns, test.iloc[i, :].values):
                        print("\t", attr, '=', val)
                ne_preds += 1
        print(f"Found {ne_preds} incorrect predictions.")
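# Worked example for the accuracy test above: if the ML pipeline predicts 0.75 and the SQL query
# returns 0.7500000001, the two values still differ when rounded to 10 decimals, but become equal
# at 9 decimals, so the loop reports "The prediction scores are equal with 9 decimal precision."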
def optimize(self):

    out_pipeline = self.pipeline.copy()
    features = self.features

    # if the optimization is enabled then check if the operator fusion can be applied
    transformers_to_merge = []
    if self.optimization:
        # if the pipeline includes a OneHotEncoder and a tree-based model then apply the one-hot encoding
        # directly in the decision tree rules
        if 'OneHotEncoder' in self.transformer_names and \
                any(key in self.model_name for key in self.tree_based_model_keys):
            transformers_to_merge.append('OneHotEncoder')

    # get the fitted transformers from the pipeline
    prev_transform_features = []
    new_transformers = []
    sql_transformers_to_merge = []
    for transformer in self.transformers:
        transformer_name = transformer["transform_name"]
        fitted_transformer = transformer["fitted_transform"]
        transformer_features = transformer["transform_features"]

        # retrieve the SQL wrapper related to the current transformer and extract its fitted params
        transformer_sql_wrapper = self.ml_manager.sql_transform_types[transformer_name]
        transformer_params = transformer_sql_wrapper.get_params(fitted_transformer, transformer_features,
                                                                features, prev_transform_features)
        features = transformer_params["out_all_features"]

        # transformers that have to be merged with the model are ignored in this phase
        if transformer_name not in transformers_to_merge:
            transformer_sql_wrapper.set_dbms(self.dbms)
            new_transformers.append(transformer_sql_wrapper)
        else:
            sql_transformers_to_merge.append((transformer_name, transformer_sql_wrapper, transformer_params))

        prev_transform_features = transformer_params["out_transform_features"][:]

    # get the fitted model from the pipeline and retrieve its SQL wrapper
    fitted_model = self.model["trained_model"]
    model_sql_wrapper = self.ml_manager.sql_model_types[self.model_name]
    model_sql_wrapper.set_dbms(self.dbms)

    # if the optimization is enabled then check if the sparse implementation can be applied
    sparse = False
    if self.optimization:
        # check if the number of features generated after the application of the transformers has reached dbms
        # limits; this is checked in particular for models based on linear combination of the features where
        # expression limits may be exceeded
        # for the moment only the OneHotEncoder transformer supports the sparse implementation
        linear_models = ['LogisticRegression', 'SGDRegressor', 'LinearRegression']
        if self.model_name in linear_models and any(['OneHotEncoder' in str(type(nt)) for nt in new_transformers]):
            dbms_limit = DBMSUtils.get_expression_limits(self.dbms)
            if dbms_limit is not None and len(features) > dbms_limit:
                sparse = True
                print("Sparse implementation enabled.")

    # if dbms limits are reached then set the sparse implementation for some transformers and for the model;
    # in the opposite case enable the dense modality for all the pipeline components
    for nt in new_transformers:
        if 'OneHotEncoder' in str(type(nt)):
            if sparse:
                nt.set_mode('sparse')
                features = nt.get_table_cols()
            else:
                nt.set_mode('dense')
    if sparse:
        model_sql_wrapper.set_mode('sparse')
    else:
        model_sql_wrapper.set_mode('dense')

    # if no operator can be merged or the sparse implementation is active then disable the operator fusion
    # optimization
    if len(sql_transformers_to_merge) == 0 or sparse:
        model_sql_wrapper.reset_optimization()

    # if there are operators to merge and the sparse implementation is disabled then merge the previously
    # selected transformers with the model
    if len(sql_transformers_to_merge) > 0 and not sparse:
        for sql_transf_to_merge in sql_transformers_to_merge:
            transf_name = sql_transf_to_merge[0]
            transf_sql = sql_transf_to_merge[1]
            transf_params = sql_transf_to_merge[2]

            if transf_name == 'OneHotEncoder':
                print("Operator fusion enabled.")
                model_sql_wrapper.merge_ohe_with_trees(transf_params['ohe_encoding'])

    if self.optimization:
        if self.dbms == 'sqlserver' and any([key in self.model_name for key in self.tree_based_model_keys]):
            # compute the maximum tree depth of the fitted model
            if 'estimators_' in dir(fitted_model):
                # gradient boosting models store their trees in a 2-D array of estimators, while forest
                # models store a flat list
                if isinstance(fitted_model.estimators_, np.ndarray):
                    depth = np.max([tree.tree_.max_depth for estimator in fitted_model.estimators_
                                    for tree in estimator])
                else:
                    depth = np.max([estimator.tree_.max_depth for estimator in fitted_model.estimators_])
            else:
                depth = fitted_model.tree_.max_depth

            # SQL Server limits CASE expression nesting to 10 levels, so deeper trees fall back to the
            # flat implementation
            if depth > 10:
                model_sql_wrapper.set_flat_implementation()

    new_model = {
        'trained_model': fitted_model,
        'model_sql_wrapper': model_sql_wrapper,
        'model_features': features,
    }

    out_pipeline['transforms'] = new_transformers
    out_pipeline['model'] = new_model

    return out_pipeline