Example #1
    def _sql_regression_part1(weights, columns, table_name, dbms: str):
        """
        This method creates the portion of the SQL query responsible for the application of the dot product between
        regression weights and features.

        :param weights: the regression weights
        :param columns: the feature names
        :param table_name: the name of the table or the subquery where to read the data
        :param dbms: the name of the dbms
        :return: the portion of the SQL query which implements the regression dot products
        """

        if not isinstance(weights, Iterable):
            raise TypeError(
                "Wrong data type for parameter weights. Only iterable data type is allowed."
            )

        if not isinstance(columns, Iterable):
            raise TypeError(
                "Wrong data type for parameter columns. Only iterable data type is allowed."
            )

        if not isinstance(table_name, str):
            raise TypeError(
                "Wrong data type for parameter table_name. Only string data type is allowed."
            )

        for weight in weights:
            if not isinstance(weight, float):
                raise TypeError(
                    "Wrong data type for weights elements. Only float data type is allowed."
                )

        for col in columns:
            if not isinstance(col, str):
                raise TypeError(
                    "Wrong data type for columns elements. Only string data type is allowed."
                )

        dbms_util = DBMSUtils()

        query = "SELECT "
        for i in range(len(columns)):
            col = dbms_util.get_delimited_col(dbms, columns[i])
            query += "({} * {}) AS {} ,".format(col, weights[i], col)

        query = query[:-1]  # remove the last ','

        query += " FROM {}".format(table_name)

        return query
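For illustration, a call with two weights would produce a query of the following shape (a sketch, assuming DBMSUtils.get_delimited_col wraps column names in the delimiter of the target DBMS, e.g. backticks for MySQL; the table and feature names are placeholders):

    q = _sql_regression_part1(weights=[0.5, -1.2], columns=['age', 'income'],
                              table_name='data_table', dbms='mysql')
    # q is roughly:
    # SELECT (`age` * 0.5) AS `age` ,(`income` * -1.2) AS `income`  FROM data_table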
Example #2
    def _sql_regression_part2(bias, columns, table_name, dbms: str):
        """
        This method creates the portion of the SQL query responsible for the application of the linear combination over
        the regression dot products.

        :param bias: the regression bias
        :param columns: the feature names
        :param table_name: the name of the table or the subquery where to read the data
        :param dbms: the name of the dbms
        :return: the portion of the SQL query which implements the regression dot product linear combination
        """

        if not isinstance(bias, float):
            raise TypeError(
                "Wrong data type for parameter bias. Only float data type is allowed."
            )

        if not isinstance(columns, Iterable):
            raise TypeError(
                "Wrong data type for parameter columns. Only iterable data type is allowed."
            )

        if table_name is not None:
            if not isinstance(table_name, str):
                raise TypeError(
                    "Wrong data type for parameter table_name. Only string data type is allowed."
                )

        for col in columns:
            if not isinstance(col, str):
                raise TypeError(
                    "Wrong data type for columns elements. Only string data type is allowed."
                )

        dbms_util = DBMSUtils()

        query = "SELECT "
        query += " ( "
        for col in columns:
            col = dbms_util.get_delimited_col(dbms, col)
            query += "{} +".format(col)

        query += "{}".format(bias)
        query += ") AS Score"

        query += " FROM {}".format(table_name)

        return query
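Since table_name may also be a subquery, the two parts can be nested so that part 2 sums the per-feature products produced by part 1 and adds the bias. A minimal composition sketch (the alias F and all names are illustrative):

    inner = _sql_regression_part1([0.5, -1.2], ['age', 'income'], 'data_table', dbms='mysql')
    outer = _sql_regression_part2(0.3, ['age', 'income'], table_name=f'({inner}) AS F', dbms='mysql')
    # outer is roughly:
    # SELECT  ( `age` +`income` +0.3) AS Score FROM (SELECT ... FROM data_table) AS F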
Example #3
    def create_temp_table(self, params: dict):
        """
        This method creates the table that will store the regression parameters (i.e., weights and bias)

        :param params: the sdca regressor params
        :return: the list of SQL statements that create and populate the regression parameters table
        """

        params = SDCARegressorSQL.check_params(params)

        create_stm = f'DROP TABLE IF EXISTS {self.temp_table_name};\n'
        create_stm += f'CREATE TABLE {self.temp_table_name} ({self.temp_table_pk} int, {self.temp_table_weight_col} float);\n'

        insert_stm = f'INSERT INTO {self.temp_table_name} VALUES\n'
        for widx, w in enumerate(params['weights']):
            if self.dbms == 'sqlserver':
                # SQL Server accepts at most 1000 row constructors per VALUES list, so the INSERT is re-issued
                if widx > 0 and widx % 1000 == 0:
                    insert_stm = insert_stm[:-2]  # remove ',\n'
                    insert_stm += ';\n\n'
                    insert_stm += f'INSERT INTO {self.temp_table_name} VALUES\n'
            insert_stm += f'({widx}, {w}),\n'
        insert_stm = insert_stm[:-2]  # remove ',\n'
        insert_stm += ';\n'

        index_name = f'{self.temp_table_name}_{self.temp_table_pk}'
        index_stm = DBMSUtils.create_index(self.dbms, index_name,
                                           self.temp_table_name,
                                           self.temp_table_pk)

        # query = f'{create_stm}{insert_stm}{index_stm}'

        return [create_stm, insert_stm, index_stm]
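With the statements above, a two-weight model would produce roughly the following SQL (a sketch; temp_table_name, temp_table_pk and temp_table_weight_col depend on how the instance was configured, the names below are illustrative):

    DROP TABLE IF EXISTS sdca_params;
    CREATE TABLE sdca_params (idx int, w float);

    INSERT INTO sdca_params VALUES
    (0, 0.5),
    (1, -1.2);

    -- plus the index statement returned by DBMSUtils.create_index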
Example #4
    def _get_linear_combination(self, weights, bias, columns):
        """
        This method generates the linear combination component of the LogisticRegression function.

        :param weights: the weights for a target class
        :param bias: the bias for a target class
        :param columns: the feature names
        :return: the portion of SQL query responsible for the application of the linear combination component of the
                 LogisticRegression function
        """

        dbms_util = DBMSUtils()

        query = ""
        for i in range(len(columns)):
            c = dbms_util.get_delimited_col(self.dbms, columns[i])
            query += "({}*{}) + ".format(c, weights[i])
        query = "{} {}".format(query, bias)

        return query
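For two weights and a bias the returned fragment is simply the expanded expression (a sketch; the column delimiter depends on self.dbms and all names are placeholders):

    # self._get_linear_combination([0.5, -1.2], 0.3, ['age', 'income']) returns roughly:
    # (`age`*0.5) + (`income`*-1.2) +  0.3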
    def _get_query_dense_ohe(ohe_params: dict, table_name: str, dbms: str):
        """
        This method creates the SQL query that implements a dense one-hot-encoding transformation.

        :param ohe_params: dictionary containing the parameters extracted from the fitted OneHotEncoder
        :param table_name: the table name or the previous subquery where to read the data
        :param dbms: the name of the dbms
        :return: the SQL query that implements a dense one-hot-encoding transformation
        """

        ohe_params = OneHotEncoderSQL.check_ohe_params(ohe_params)
        assert isinstance(table_name, str)
        assert isinstance(dbms, str)

        ohe_map = ohe_params["ohe_encoding"]
        remaining_features = ohe_params["other_features"]

        dbms_utils = DBMSUtils()

        ohe_query = "SELECT "

        # implement one-hot encoding in SQL
        for feature_after_ohe in ohe_map:
            # feature_after_ohe = ohe_feature_map["feature_after_ohe"]
            fao = dbms_utils.get_delimited_col(dbms, feature_after_ohe)
            ohe_feature_map = ohe_map[feature_after_ohe]
            feature_before_ohe = dbms_utils.get_delimited_col(
                dbms, ohe_feature_map["feature_before_ohe"])
            value = ohe_feature_map["value"]

            ohe_query += "CASE WHEN {} = '{}' THEN 1 ELSE 0 END AS {},\n".format(
                feature_before_ohe, value, fao)

        # add the remaining features to the selection
        for f in remaining_features:
            ohe_query += "{},".format(dbms_utils.get_delimited_col(dbms, f))
        ohe_query = ohe_query.rstrip(',\n')  # remove the trailing ',' (also when no remaining features are selected)

        ohe_query += " FROM {}".format(table_name)

        return ohe_query
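As an illustration, an ohe_encoding map covering two categories of a 'color' column plus one passthrough feature would yield a query of the following shape (a sketch, assuming check_ohe_params accepts this structure; all names are placeholders):

    # ohe_params = {
    #     'ohe_encoding': {
    #         'color_red':  {'feature_before_ohe': 'color', 'value': 'red'},
    #         'color_blue': {'feature_before_ohe': 'color', 'value': 'blue'},
    #     },
    #     'other_features': ['age'],
    # }
    #
    # SELECT CASE WHEN `color` = 'red' THEN 1 ELSE 0 END AS `color_red`,
    # CASE WHEN `color` = 'blue' THEN 1 ELSE 0 END AS `color_blue`,
    # `age` FROM data_table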
    def query(self, table_name):
        """
        This method creates the SQL query that implements the One Hot Encoding in SQL.

        :param table_name: the table name or the previous subquery where to read the data
        :return: the SQL query that implements the One Hot Encoding
        """

        assert isinstance(table_name,
                          str), "Wrong data type for param 'table_name'."
        assert self.params is not None, "No ohe params extracted."
        assert self.mode is not None, "No mode selected."

        dbms_util = DBMSUtils()
        auto_inc = dbms_util.get_auto_increment_col(self.dbms)
        # if the table provided in input is the result of a previous query
        if len(table_name) > 7 and table_name[-7:] == 'AS data':
            real_tab_name = 'data'
        else:  # only a table name is provided
            real_tab_name = table_name
        self.input_table_name = f'(select {auto_inc} AS {self.ohe_table_pk}, {real_tab_name}.* FROM {table_name}) AS T'

        # create the SQL query that performs the One Hot Encoding
        pre_ohe_queries = None
        if self.mode == 'dense':
            ohe_query = self._get_query_dense_ohe(self.params,
                                                  table_name,
                                                  dbms=self.dbms)

        elif self.mode == 'sparse':
            pre_ohe_queries, ohe_query = self._get_query_sparse_ohe(
                self.params)

        else:
            raise ValueError(f"Wrong mode ({self.mode}).")

        return pre_ohe_queries, ohe_query
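The 'AS data' check distinguishes a plain table name from the subquery produced by a previous pipeline step, so both of the following calls are valid (a sketch, assuming an OneHotEncoderSQL instance ohe whose params and mode have already been set; names are placeholders):

    pre_queries, ohe_query = ohe.query('patients')                               # plain table name
    pre_queries, ohe_query = ohe.query('(SELECT ... FROM patients) AS data')     # output of a previous step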
Example #7
def get_column(url_connection, table_name, column_name):
    url_connection = check_url_connection(url_connection)
    dbms = 'mysql'
    if 'mssql' in url_connection:
        dbms = 'sqlserver'
    try:
        engine = create_engine(url_connection)
        with engine.connect() as connection:
            res = connection.execute("select {} from {}".format(
                DBMSUtils.get_delimited_col(dbms, column_name), table_name))
            labels = [x[0] for x in res]
        return labels
    except SQLAlchemyError as e:
        logging.error(e)
        return None
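A usage sketch (the connection URL and names are placeholders):

    labels = get_column('mysql+pymysql://user:pwd@localhost/db', 'patients', 'diagnosis')
    # returns the values of the column as a list, or None if the query fails

Note that executing a raw string relies on the SQLAlchemy 1.x API; on SQLAlchemy 2.x the statement would have to be wrapped in sqlalchemy.text().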
Example #8
    def create_temp_table(self, params: dict):
        """
        This method creates the table that will store the logistic regression parameters (i.e., weights and bias)

        :param params: the logistic regression params
        :return: the list of SQL statements that create and populate the logistic regression parameter tables
        """

        params = LogisticRegressionSQL.check_params(params)

        query_list = []
        # query = ''
        for class_idx in range(len(params['weights'])):
            tab_name = f'{self.temp_table_name}_{class_idx}'
            class_weights = params['weights'][class_idx]

            create_stm = f'DROP TABLE IF EXISTS {tab_name};\n'
            create_stm += f'CREATE TABLE {tab_name} ({self.temp_table_pk} int, {self.temp_table_weight_col} float);\n'

            insert_stm = f'INSERT INTO {tab_name} VALUES\n'
            for widx, w in enumerate(class_weights):
                if self.dbms == 'sqlserver':
                    if widx > 0 and widx % 1000 == 0:
                        insert_stm = insert_stm[:-2]  # remove ',\n'
                        insert_stm += ';\n\n'
                        insert_stm += f'INSERT INTO {tab_name} VALUES\n'

                insert_stm += f'({widx}, {w}),\n'
            insert_stm = insert_stm[:-2]  # remove ',\n'
            insert_stm += ';\n'

            index_name = f'{tab_name}_{self.temp_table_pk}'
            index_stm = DBMSUtils.create_index(self.dbms, index_name, tab_name,
                                               self.temp_table_pk)

            # query += f'{create_stm}{insert_stm}{index_stm}'
            query_list += [create_stm, insert_stm, index_stm]

        return query_list
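The widx % 1000 re-issue of the INSERT header is there because SQL Server accepts at most 1000 row constructors in a single VALUES list; for the other DBMSs one INSERT per class table is emitted. For a two-class model with two weights per class the returned statements look roughly like this (a sketch; the table and column names depend on the instance configuration):

    DROP TABLE IF EXISTS lr_params_0;
    CREATE TABLE lr_params_0 (idx int, w float);
    INSERT INTO lr_params_0 VALUES
    (0, 0.5),
    (1, -1.2);
    -- plus an index statement, and the same pattern repeated for lr_params_1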
    def _get_query_sparse_ohe(self, ohe_params: dict):
        """
        This method creates an SQL query that implements a sparse ohe transformation.
        The query is built around a main CASE statement that replicates the OHE mapping.
        For high-dimensional data it is not possible to encode the whole mapping in a single CASE statement, because
        the DBMS limits the number of WHEN clauses that can appear in a query. In that case multiple queries are
        generated, each containing at most the maximum number of WHEN clauses allowed, and each query result is
        saved into a temporary table.

        :param ohe_params: dictionary containing the parameters extracted from the fitted OneHotEncoder
        :return: the SQL statements that implement the sparse One Hot Encoding transformation and the query that
                 reads its result
        """

        ohe_params = OneHotEncoderSQL.check_ohe_params(ohe_params)
        ohe_feature_map = ohe_params['ohe2idx_map']
        original_ohe_features = ohe_params['ohe_features']

        ohe_query = ""

        # each feature produced by the ohe becomes a WHEN clause, so it is needed to check whether their number is
        # greater than the maximum number of WHEN clauses that can be included in a single CASE statement.
        # For SQLSERVER the maximum number of WHEN statements is 9700.
        # https://www.sqlservercentral.com/forums/topic/maximum-number-of-when-then-lines-in-a-case-statement
        sql_max_when_statements = 9700
        # sql_max_when_statements = 100

        # if the OHE is applied directly on the original table then a temporary table is created to store OHE
        # results in a triplet data format
        warn_message = f"A temporary table '{self.ohe_table_name}' will be created."
        logging.warning(warn_message)

        # add to the ohe query the SQL statement for the creation of the intermediate ohe table
        create_ohe_table_query = f"DROP TABLE IF EXISTS {self.ohe_table_name};\n"
        create_ohe_table_query += f"CREATE TABLE {self.ohe_table_name}({self.ohe_table_pk} int, "
        create_ohe_table_query += f"{self.ohe_table_fval_col} float, {self.ohe_table_fidx_col} int);\n\n"
        # create_ohe_table_query += f" PRIMARY KEY({self.ohe_table_pk}, {self.ohe_table_fidx_col}));\n\n"
        # ohe_query += create_ohe_table_query

        # split, if needed, the OHEed features in batches smaller than the SQL limits

        ohe_feature_map_batches = []
        num_batch = 1

        # loop over OHEed columns
        for col in ohe_feature_map:
            feature_map = ohe_feature_map[col]
            num_ohe_features_per_col = len(feature_map)

            # check if the number of features derived from the current OHEed column is greater than the DBMS limits
            if num_ohe_features_per_col > sql_max_when_statements:

                # split the query in multiple batch queries
                batch_size = sql_max_when_statements
                if num_ohe_features_per_col % batch_size == 0:
                    num_batch = num_ohe_features_per_col // batch_size
                else:
                    num_batch = num_ohe_features_per_col // batch_size + 1

                feature_map_vals = list(feature_map.items())
                # loop over the number of batch
                for i in range(num_batch):
                    # select a partition of the features after ohe
                    batch_ohe_feature_map = dict(
                        feature_map_vals[i * batch_size:i * batch_size +
                                         batch_size])
                    ohe_feature_map_batches.append(
                        {col: batch_ohe_feature_map})

            else:
                ohe_feature_map_batches.append({col: feature_map})

        # loop over the batches
        ohe_sub_queries = []
        for ohe_feature_map_batch in ohe_feature_map_batches:

            # create the SQL query that applies the One Hot Encoding on the selected features
            batch_mode = False
            if num_batch > 1:
                batch_mode = True
            ohe_batch_query = self._create_ohe_query(ohe_feature_map_batch,
                                                     original_ohe_features,
                                                     batch_mode=batch_mode)
            ohe_sub_queries.append(ohe_batch_query)

        # optimization: combine multiple ohe batch queries to reduce the total number of INSERT statements
        cum_sum = 0
        current_combined_sub_queries = []
        for j in range(len(ohe_feature_map_batches)):
            sub_query = ohe_sub_queries[j]
            ohe_feature_map_batch = ohe_feature_map_batches[j]
            ohe_batch_query_size = len(list(ohe_feature_map_batch.values())[0])

            cum_sum += ohe_batch_query_size
            current_combined_sub_queries.append(sub_query)

            if cum_sum > sql_max_when_statements:
                cum_sum = ohe_batch_query_size
                list_joint_sub_queries = current_combined_sub_queries[:-1]
                current_combined_sub_queries = [current_combined_sub_queries[-1]]

                joint_sub_queries = ""
                for sub_query in list_joint_sub_queries:
                    joint_sub_queries += "{}\n\n UNION ALL \n\n".format(
                        sub_query[:-3])  # remove ';\n\n'
                joint_sub_queries = joint_sub_queries[:-15] + ";"  # remove '\n\n UNION ALL \n\n'

                # the batch queries have to be saved into the temporary ohe table with an INSERT statement
                insert_stm = f"INSERT INTO {self.ohe_table_name}\n"

                ohe_query += "{}{}\n\n".format(insert_stm, joint_sub_queries)

        # combine the last ohe sub queries
        joint_sub_queries = ""
        for sub_query in current_combined_sub_queries:
            joint_sub_queries += "{}\n\n UNION ALL \n\n".format(
                sub_query[:-3])  # remove ';\n\n'
        joint_sub_queries = joint_sub_queries[:-15] + ";"  # remove '\n\n UNION ALL \n\n'

        # the batch queries have to be saved into the temporary ohe table with an INSERT statement
        insert_stm = f"INSERT INTO {self.ohe_table_name}\n"

        ohe_query += "{}{}\n\n".format(insert_stm, joint_sub_queries)

        # create an index on the ohe table
        index_ohe = DBMSUtils.create_index(
            dbms=self.dbms,
            index_name=f'{self.ohe_table_name}_{self.ohe_table_pk}',
            target_table=self.ohe_table_name,
            target_col=self.ohe_table_pk)
        index_ohe += DBMSUtils.create_index(
            dbms=self.dbms,
            index_name=f'{self.ohe_table_name}_{self.ohe_table_fidx_col}',
            target_table=self.ohe_table_name,
            target_col=self.ohe_table_fidx_col)
        # ohe_query += index_ohe

        return [create_ohe_table_query, ohe_query,
                index_ohe], f'select * from {self.ohe_table_name}'
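The sparse path therefore stores the encoding as (row id, feature value, feature index) triplets instead of wide 0/1 columns, and glues the batch SELECTs together with UNION ALL inside the INSERT statements. The returned artifacts have roughly the following shape (a sketch; the SELECT bodies come from _create_ohe_query, which is not shown here, and all names depend on the instance configuration):

    DROP TABLE IF EXISTS ohe_table;
    CREATE TABLE ohe_table(pk int, fval float, fidx int);

    INSERT INTO ohe_table
    SELECT ...            -- batch 1
    UNION ALL
    SELECT ...;           -- batch 2

    -- plus two index statements, while the caller reads the result back with:
    select * from ohe_table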
Example #10
    def post(self, request):
        # Params
        scenario = create_simulation_scenario(request.data)

        try:
            check_simulation_consistency(scenario)
        except ValueError as e:
            return Response({
                'detail': str(e),
            },
                            status=status.HTTP_400_BAD_REQUEST)

        # Get Dataset
        t_load_start = datetime.now()
        ds = get_table(scenario['db_url'], scenario['table'])
        features = ds.columns.to_list()

        if scenario['labels_type'] == 'column':
            # Remove Label column if exists
            features.remove(scenario['labels'])

        t_load_end = datetime.now()

        # ML Manager
        manager = MLManager()

        t_ml = []
        t_db = []

        # Testing Phase
        for i in range(scenario['batch_number']):
            ds_batch = get_batch(ds, i, scenario['batch_size'])
            if ds_batch.empty:
                break

            # Execute predict using MLManager and ML Library
            t_start = datetime.now()
            _ = manager.predict(ds[features], scenario['pipeline'].file)
            t_end = datetime.now()

            t_ml.append(t_end - t_start)

            # Create Batch for DBMS
            ds_batch.to_sql('batch',
                            con=scenario["db_url"],
                            if_exists="replace",
                            index=False)

            # Generate query using MLManager
            dbms = DBMSUtils.get_dbms_from_str_connection(scenario['db_url'])
            queries, query = manager.generate_query(scenario['pipeline'].file,
                                                    scenario['table'],
                                                    features, dbms,
                                                    scenario['optimizer'])

            # Execute query
            t_start = datetime.now()
            _ = execute_multi_queries(scenario["db_url"], queries)
            t_end = datetime.now()

            t_db.append(t_end - t_start)

        # Finish Simulation
        return Response(
            {
                'detail': 'Successfully predicted result',
                'ml_results': {
                    'execution_time':
                    (np.mean(t_ml) + (t_load_end - t_load_start))
                },
                'dbms_results': {
                    'execution_time': np.mean(t_db)
                },
            },
            status=status.HTTP_200_OK)
Example #11
    def post(self, request):

        scenario = create_test_scenario(request.data)
        inference_time = 0

        try:
            check_test_consistency(scenario)
        except ValueError as e:
            return Response({
                'detail': str(e),
            },
                            status=status.HTTP_400_BAD_REQUEST)

        # Load pipeline model from pipeline file
        pipeline = load_model(scenario.pipeline.file)
        if not pipeline:
            return Response(
                {'detail': 'The selected file isn\'t a pipeline object'},
                status=status.HTTP_400_BAD_REQUEST)

        # Extract pipeline information from loaded model
        pipeline = MLManager.extract_pipeline_components(pipeline)
        if not pipeline:
            return Response(
                {'detail': 'The selected file isn\'t a pipeline object'},
                status=status.HTTP_400_BAD_REQUEST)

        # Model params
        scenario.model = pipeline.get('model')
        scenario.transforms = json.dumps(pipeline.get('transforms', []))

        # Dataset
        if scenario.run_db:
            # Get features from table
            features = get_columns(scenario.db_url, scenario.table)

        else:
            data_extractor_start = datetime.now()
            # Get Dataset
            ds = get_dataset(scenario)
            features = ds.columns.to_list()

            data_extractor_end = datetime.now()
            data_extractor_time = (data_extractor_end -
                                   data_extractor_start).total_seconds()
            inference_time += data_extractor_time

        if scenario.labels_type == 'column':
            # Remove Label column if exists
            features.remove(scenario.labels)

        # ML Manager
        manager = MLManager()

        # Testing Phase
        query = None
        if scenario.run_db:

            inference_start = datetime.now()

            # Generate query using MLManager
            dbms = DBMSUtils.get_dbms_from_str_connection(scenario.db_url)
            queries, query = manager.generate_query(scenario.pipeline.file,
                                                    scenario.table, features,
                                                    dbms, scenario.optimizer)

            # Execute query
            y_pred = execute_multi_queries(scenario.db_url, queries)
            y_pred = pd.Series(y_pred.iloc[:, 0], name='Label')

            inference_end = datetime.now()
            inference_time += (inference_end - inference_start).total_seconds()

        else:

            inference_start = datetime.now()

            # Execute predict using MLManager and ML Library
            y_pred = manager.predict(ds[features], scenario.pipeline.file)
            y_pred = pd.Series(y_pred, name='Label')

            inference_end = datetime.now()
            inference_time += (inference_end - inference_start).total_seconds()

        # Label
        labels = []
        # Compute evaluation
        if scenario.labels_type:
            if scenario.labels_type == 'file':
                # Get labels from file
                labels_document = get_document_object(scenario.labels)
                labels = get_dataframe(labels_document.file)

                if labels is None:
                    return Response(
                        {
                            'detail':
                            'The selected labels file {} isn\'t valid'.format(
                                labels_document.filename)
                        },
                        status=status.HTTP_400_BAD_REQUEST)

                # Get first column from file
                labels = labels.iloc[:, 0].to_list()

            elif scenario.labels_type == 'table':
                # Get labels from table
                labels = get_table(scenario.db_url, scenario.labels)
                if labels is None:
                    return Response(
                        {
                            'detail':
                            'The selected table {} isn\'t valid for labels'.
                            format(scenario.labels)
                        },
                        status=status.HTTP_400_BAD_REQUEST)

                # Get first column from table
                labels = labels.iloc[:, 0].to_list()

            elif scenario.labels_type == 'column' and not scenario.run_db:
                # Get labels from column
                labels = ds[scenario.labels].to_list()

            elif scenario.labels_type == 'column' and scenario.run_db:
                # Get labels from table
                labels = get_column(scenario.db_url, scenario.table,
                                    scenario.labels)
            else:
                return Response({'detail': 'Wrong labels type selected'},
                                status=status.HTTP_400_BAD_REQUEST)

        # Compute evaluation
        res_evaluation = None
        if labels and scenario.metric:
            res_evaluation = manager.evaluate(scenario.metric, labels, y_pred)

        # Create predictions file
        test_result_name = "test_{}_{}.csv".format(scenario.model,
                                                   datetime.now())
        test_result_name = test_result_name.replace(' ', '_')
        test_result_name = test_result_name.replace(':', '_')
        y_pred.to_csv(test_result_name, index=False, header=True)

        # Save predictions in Document model
        f = open(test_result_name, 'rb')
        document = Document(file=File(f), filename=test_result_name)
        document.save()
        f.close()

        # Remove temporally predictions file
        os.remove(test_result_name)

        # Save Scenario model
        scenario.save()

        # Save ResultScenario
        result_scenario = ResultScenario()
        result_scenario.scenario = scenario
        result_scenario.execution_time = inference_time
        result_scenario.throughput = result_scenario.execution_time / len(
            y_pred)
        result_scenario.score = res_evaluation
        result_scenario.file_result = document.filename
        result_scenario.query = query
        result_scenario.save()

        return Response(
            {
                'detail': 'Successfully predicted result',
                'filename': test_result_name,
                'scenario_id': scenario.id
            },
            status=status.HTTP_201_CREATED)
Example #12
    def get_sql_nested_rules(tree: BaseDecisionTree,
                             feature_names: list,
                             is_classification: bool,
                             dbms: str,
                             merge_ohe_features: dict = None):
        """
        This method extracts the rules from a BaseDecisionTree object and converts them into SQL.

        :param tree: BaseDecisionTree object
        :param feature_names: list of feature names
        :param is_classification: boolean flag that indicates whether the DTM is used in classification or regression
        :param dbms: the name of the dbms
        :param merge_ohe_features: (optional) ohe feature map to be merged in the decision rules
        :return: string containing the SQL query
        """

        assert isinstance(
            tree, BaseDecisionTree
        ), "Only BaseDecisionTree data type is allowed for param 'tree'."
        assert isinstance(
            feature_names,
            list), "Only list data type is allowed for param 'features_names'."
        for f in feature_names:
            assert isinstance(f, str)
        assert isinstance(
            is_classification, bool
        ), "Only bool data type is allowed for param 'is_classification'."
        if merge_ohe_features is not None:
            DTMSQL._check_merge_ohe_features(merge_ohe_features)

        dbms_util = DBMSUtils()

        # get for each node, left, right child nodes, thresholds and features
        left = tree.tree_.children_left  # left child for each node
        right = tree.tree_.children_right  # right child for each node
        thresholds = tree.tree_.threshold  # test threshold for each node
        features = [feature_names[i] for i in tree.tree_.feature]
        # features = tree.tree_.feature  # indexes of the features used by the tree
        if is_classification:
            classes = tree.classes_

        def visit_tree(node):

            # leaf node
            if left[node] == -1 and right[node] == -1:
                if is_classification:
                    return " {} ".format(classes[np.argmax(
                        tree.tree_.value[node][0])])
                else:
                    return " {} ".format(tree.tree_.value[node][0][0])

            # internal node
            op = '<='
            feature = features[node]
            thr = thresholds[node]

            if merge_ohe_features is not None:
                # if ohe features have to be merged in the decision tree, the tree conditions are changed
                #   feature_after_ohe > 0.5 becomes original_cat_feature = val
                #   feature_after_ohe <= 0.5 becomes original_cat_feature <> val
                if feature in merge_ohe_features:  # only categorical features should pass this test
                    mof = merge_ohe_features[feature]
                    feature = mof['feature_before_ohe']
                    thr = "'{}'".format(mof['value'])
                    op = '<>'

            sql_dtm_rule = f" CASE WHEN {dbms_util.get_delimited_col(dbms, feature)} {op} {thr} THEN"

            # check if current node has a left child
            if left[node] != -1:
                sql_dtm_rule += visit_tree(left[node])

            sql_dtm_rule += "ELSE"

            # check if current node has a right child
            if right[node] != -1:
                sql_dtm_rule += visit_tree(right[node])

            sql_dtm_rule += "END "

            return sql_dtm_rule

        # start tree visit from the root node
        root = 0
        sql_dtm_rules = visit_tree(root)

        return sql_dtm_rules
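For a depth-1 classification tree on one numeric feature the recursion yields a single nested CASE expression, and with merge_ohe_features enabled the threshold test on a one-hot column is rewritten as a test on the original categorical value (a sketch; delimiters depend on the DBMS and all names are placeholders):

    # without merging:
    #  CASE WHEN `age` <= 30.5 THEN 0 ELSE 1 END
    # with merge_ohe_features containing {'color_red': {'feature_before_ohe': 'color', 'value': 'red'}}:
    #  CASE WHEN `color` <> 'red' THEN 0 ELSE 1 END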
Example #13
    def convert_rules_to_sql(rules: list,
                             dbms: str,
                             merge_ohe_features: dict = None):
        """
        This method converts the rules extracted from a BaseDecisionTree object into SQL case statements.

        :param rules: rules extracted from a BaseDecisionTree object
        :param dbms: the name of the dbms
        :param merge_ohe_features: (optional) ohe feature map to be merged in the decision rules
        :return: string containing the SQL query
        """

        DTMSQL._check_rule_format(rules)
        if merge_ohe_features is not None:
            DTMSQL._check_merge_ohe_features(merge_ohe_features)

        dbms_util = DBMSUtils()

        sql_query = ""
        sql_string = " CASE WHEN "
        # loop over the rules
        for item in rules:

            if not isinstance(item, tuple):  # the item is a leaf score
                sql_string = sql_string[:-5]  # remove 'WHEN '
                if sql_string == ' CASE ':  # case a tree is composed of only a leaf
                    sql_query += str(item)
                else:
                    sql_string += " THEN {} ".format(item)
                    sql_query += sql_string
                sql_string = "WHEN "
            else:  # the item is a rule condition
                op = item[1]
                thr = item[2]
                if op == 'l':
                    op = '<='
                elif op == 'r':
                    op = '>'
                else:  # when op is equals to '=' or '<>' the thr is a string
                    thr = "'{}'".format(thr)
                feature_name = item[3]

                if merge_ohe_features is not None:
                    # if ohe features have to be merged in the decision tree, the tree conditions are changed
                    #   feature_after_ohe > 0.5 becomes original_cat_feature = val
                    #   feature_after_ohe <= 0.5 becomes original_cat_feature <> val
                    if feature_name in merge_ohe_features:  # only categorical features should pass this test
                        mof = merge_ohe_features[feature_name]
                        feature_name = mof['feature_before_ohe']
                        old_op = op[:]
                        op = '='
                        if old_op == '<=':
                            if 0 <= thr:
                                op = '<>'
                        elif old_op == '>':
                            if 0 > thr:
                                op = '<>'
                        else:
                            raise ValueError("Wrong op.")

                        thr = "'{}'".format(mof['value'])

                feature_name = dbms_util.get_delimited_col(dbms, feature_name)
                sql_string += "{} {} {} and ".format(feature_name, op, thr)

        if 'CASE' in sql_query:  # ignore the case where a tree is composed of only a leaf
            sql_query += "END "

        return sql_query
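The expected rule format can be inferred from the loop: a leaf is encoded as a bare score, while an internal condition is a tuple whose second, third and fourth elements are the operator code ('l' for <=, 'r' for >, otherwise a literal '='/'<>'), the threshold and the feature name; the first tuple element is not used by this method. A sketch for a one-split tree (all values are illustrative):

    rules = [(0, 'l', 30.5, 'age'), 0,     # age <= 30.5 -> leaf score 0
             (0, 'r', 30.5, 'age'), 1]     # age > 30.5  -> leaf score 1
    # DTMSQL.convert_rules_to_sql(rules, 'mysql') returns roughly:
    #  CASE WHEN `age` <= 30.5 THEN 0 WHEN `age` > 30.5 THEN 1 END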
Example #14
def main(data_conf, pipeline_conf, str_db_conn, task, optimizer, debug=False):
    data_conf['str_db_conn'] = str_db_conn
    data_conf = check_data_config(data_conf)
    train = data_conf['train']
    y_train = data_conf['y_train']
    test = data_conf['test']
    y_test = data_conf['y_test']
    test_table_name = data_conf['test_table_name']
    features = list(data_conf['train'].columns)
    conn = data_conf['db_conn']

    tasks = ['regression', 'binary_classification', 'multi_classification']
    if task not in tasks:
        raise ValueError(f"Wrong task {task}. Available tasks: {tasks}")

    if task == 'regression':
        eval_fn = evaluate_regression_results
    elif task == 'binary_classification':
        eval_fn = evaluate_binary_classification_results
    else:
        eval_fn = evaluate_multi_classification_results

    check_pipeline_config(pipeline_conf, features)
    model_name = pipeline_conf['model']['name']

    mlmanager = MLManager()
    pipeline = create_pipeline(pipeline_conf)

    # fit
    print("\nStarting training...")
    pipeline.fit(train, y_train)
    _check_fitted_pipeline(pipeline, model_name, train)

    print("Training completed.\n")

    fitted_model = pipeline.steps[1][1]

    # ML predict
    print("\nStarting the ML inference...")
    ml_preds = pipeline.predict(test)
    ml_preds = pd.Series(ml_preds)
    print(ml_preds[:10])
    eval_fn(model_name, y_test, ml_preds)
    print("ML inference completed.\n")

    # SQL conversion
    print("\nStarting the SQL conversion...")
    pipeline = extract_pipeline(pipeline)
    dbms = DBMSUtils.get_dbms_from_str_connection(data_conf['str_db_conn'])
    queries, all_query = create_query(pipeline, mlmanager, features,
                                      test_table_name, optimizer, dbms, debug)
    print("SQL Conversion completed.\n")

    # SQL predict
    print("\nStarting the SQL inference...")
    for q in queries[:-1]:
        try:
            for qq in q.split(';'):
                conn.execute(qq)
        except Exception:
            # ignore failures here (e.g. the empty fragment produced by splitting on the final ';')
            pass

    try:
        sql_preds = pd.read_sql(queries[-1], conn)
    except Exception as e:
        logging.error(e.args[0])
        return

    sql_preds = pd.Series(sql_preds.iloc[:, 0])
    print(sql_preds[:10])
    null_val = False
    if sql_preds.isnull().sum() == 0:
        eval_fn(f"{model_name} SQL", y_test, sql_preds)
    else:
        null_val = True
    print("SQL inference completed.\n")

    # Null value test
    if null_val:
        print("\nNull value test")
        null_val_cnt = 0
        for sample_id in sql_preds[sql_preds.isnull()].index:
            print(sample_id)
            for (attr, val) in zip(test.columns,
                                   test.iloc[sample_id, :].values):
                print("\t", attr, '=', val)
            null_val_cnt += 1
        print(f"Found {null_val_cnt} null predictions.")

    # Accuracy test
    print("\nAccuracy test")
    equals = False
    for prec in range(10, 0, -1):
        ml_preds = ml_preds.map(lambda x: round(x, prec))
        sql_preds = sql_preds.map(lambda x: round(x, prec))
        if ml_preds.equals(sql_preds):
            print(
                f"The prediction scores are equal with {prec} decimal precision."
            )
            print(":)")
            equals = True
            break
    if not equals:
        print("The prediction scores are not equal.")
        print(":(\n")

        ne_preds = 0
        for i in range(len(ml_preds)):
            ml_pred = ml_preds.iloc[i]
            sql_pred = sql_preds.iloc[i]

            if ml_pred != sql_pred:
                if debug:
                    print(i, ml_pred, sql_pred)
                    for (attr, val) in zip(test.columns,
                                           test.iloc[i, :].values):
                        print("\t", attr, '=', val)
                ne_preds += 1
        print(f"Found {ne_preds} incorrect predictions.")
Example #15
    def optimize(self):

        out_pipeline = self.pipeline.copy()
        features = self.features

        # if the optimization is enabled then check if the operator fusion can be applied
        transformers_to_merge = []
        if self.optimization:
            # if the pipeline includes an OneHotEncoder and a tree-based model then apply the one-hot encoding directly
            # in the decision tree rules
            if 'OneHotEncoder' in self.transformer_names and \
                    any(key in self.model_name for key in self.tree_based_model_keys):
                transformers_to_merge.append('OneHotEncoder')

        # get the fitted transformers from the pipeline
        prev_transform_features = []
        new_transformers = []
        sql_transformers_to_merge = []
        for transformer in self.transformers:
            transformer_name = transformer["transform_name"]
            fitted_transformer = transformer["fitted_transform"]
            transformer_features = transformer["transform_features"]

            # retrieve the SQL wrapper related to the current transformer and extract its fitted params
            transformer_sql_wrapper = self.ml_manager.sql_transform_types[
                transformer_name]
            transformer_params = transformer_sql_wrapper.get_params(
                fitted_transformer, transformer_features, features,
                prev_transform_features)
            features = transformer_params["out_all_features"]

            # transformers that have to be merged with the model are ignored in this phase
            if transformer_name not in transformers_to_merge:
                transformer_sql_wrapper.set_dbms(self.dbms)
                new_transformers.append(transformer_sql_wrapper)

            else:
                sql_transformers_to_merge.append(
                    (transformer_name, transformer_sql_wrapper,
                     transformer_params))

            prev_transform_features = transformer_params[
                "out_transform_features"][:]

        # get the fitted model from the pipeline and retrieve its SQL wrapper
        fitted_model = self.model["trained_model"]
        model_sql_wrapper = self.ml_manager.sql_model_types[self.model_name]
        model_sql_wrapper.set_dbms(self.dbms)

        # if the optimization is enabled then check if the sparse implementation can be applied
        sparse = False
        if self.optimization:
            # check if the number of features generated after the application of the transformers has reached dbms
            # limits; this is checked in particular for models based on linear combination of the features where
            # expression limits may be exceeded
            # for the moment only the OneHotEncoder transformer supports the sparse implementation
            linear_models = [
                'LogisticRegression', 'SGDRegressor', 'LinearRegression'
            ]
            if self.model_name in linear_models and any(
                ['OneHotEncoder' in str(type(nt)) for nt in new_transformers]):
                dbms_limit = DBMSUtils.get_expression_limits(self.dbms)
                if dbms_limit is not None and len(features) > dbms_limit:
                    sparse = True
                    print("Sparse implementation enabled.")

        # if dbms limits are reached then set the sparse implementation for some transformers and for the model
        # in the opposite case enable dense modality for all the pipeline components
        for nt in new_transformers:
            if 'OneHotEncoder' in str(type(nt)):
                if sparse:
                    nt.set_mode('sparse')
                    features = nt.get_table_cols()
                else:
                    nt.set_mode('dense')
        if sparse:
            model_sql_wrapper.set_mode('sparse')
        else:
            model_sql_wrapper.set_mode('dense')

        # if no operator can be merged or the sparse implementation is active then disable the operator fusion
        # optimization
        if len(sql_transformers_to_merge) == 0 or sparse:
            model_sql_wrapper.reset_optimization()

        # if there are operators to merge and the sparse implementation is disabled then merge the previously
        # selected transformers with the model
        if len(sql_transformers_to_merge) > 0 and not sparse:
            for sql_transf_to_merge in sql_transformers_to_merge:
                transf_name = sql_transf_to_merge[0]
                transf_sql = sql_transf_to_merge[1]
                transf_params = sql_transf_to_merge[2]

                if transf_name == 'OneHotEncoder':
                    print("Operator fusion enabled.")
                    model_sql_wrapper.merge_ohe_with_trees(
                        transf_params['ohe_encoding'])

        if self.optimization:
            if self.dbms == 'sqlserver' and any(
                [key in self.model_name
                 for key in self.tree_based_model_keys]):
                if 'estimators_' in dir(fitted_model):
                    if isinstance(fitted_model.estimators_, np.ndarray):
                        depth = np.max([
                            tree.tree_.max_depth
                            for estimator in fitted_model.estimators_
                            for tree in estimator
                        ])
                    else:
                        depth = np.max([
                            estimator.tree_.max_depth
                            for estimator in fitted_model.estimators_
                        ])
                else:
                    depth = fitted_model.tree_.max_depth
                if depth > 10:
                    model_sql_wrapper.set_flat_implementation()

        new_model = {
            'trained_model': fitted_model,
            'model_sql_wrapper': model_sql_wrapper,
            'model_features': features,
        }

        out_pipeline['transforms'] = new_transformers
        out_pipeline['model'] = new_model

        return out_pipeline