Ejemplo n.º 1
0
    def _sql_regression_part1(weights, columns, table_name, dbms: str):
        """
        This method creates the portion of the SQL query responsible for the application of the dot product between
        regression weights and features.

        :param weights: the regression weights
        :param columns: the feature names
        :param table_name: the name of the table or the subquery where to read the data
        :param dbms: the name of the dbms
        :return: the portion of the SQL query which implements the regression dot products
        """

        if not isinstance(weights, Iterable):
            raise TypeError(
                "Wrong data type for parameter weights. Only iterable data type is allowed."
            )

        if not isinstance(columns, Iterable):
            raise TypeError(
                "Wrong data type for parameter columns. Only iterable data type is allowed."
            )

        if not isinstance(table_name, str):
            raise TypeError(
                "Wrong data type for parameter table_name. Only string data type is allowed."
            )

        for weight in weights:
            if not isinstance(weight, float):
                raise TypeError(
                    "Wrong data type for weights elements. Only float data type is allowed."
                )

        for col in columns:
            if not isinstance(col, str):
                raise TypeError(
                    "Wrong data type for columns elements. Only string data type is allowed."
                )

        dbms_util = DBMSUtils()

        query = "SELECT "
        for i in range(len(columns)):
            col = dbms_util.get_delimited_col(dbms, columns[i])
            query += "({} * {}) AS {} ,".format(col, weights[i], col)

        query = query[:-1]  # remove the last ','

        query += " FROM {}".format(table_name)

        return query
Ejemplo n.º 2
0
    def _sql_regression_part2(bias, columns, table_name, dbms: str):
        """
        This method creates the portion of the SQL query responsible for the application of the linear combination over
        the regression dot products.

        :param bias: the regression bias
        :param columns: the feature names
        :param table_name: the name of the table or the subquery where to read the data
        :param dbms: the name of the dbms
        :return: the portion of the SQL query which implements the regression dot product linear combination
        """

        if not isinstance(bias, float):
            raise TypeError(
                "Wrong data type for parameter bias. Only float data type is allowed."
            )

        if not isinstance(columns, Iterable):
            raise TypeError(
                "Wrong data type for parameter columns. Only iterable data type is allowed."
            )

        if table_name is not None:
            if not isinstance(table_name, str):
                raise TypeError(
                    "Wrong data type for parameter table_name. Only string data type is allowed."
                )

        for col in columns:
            if not isinstance(col, str):
                raise TypeError(
                    "Wrong data type for columns elements. Only string data type is allowed."
                )

        dbms_util = DBMSUtils()

        query = "SELECT "
        query += " ( "
        for col in columns:
            col = dbms_util.get_delimited_col(dbms, col)
            query += "{} +".format(col)

        query += "{}".format(bias)
        query += ") AS Score"

        query += " FROM {}".format(table_name)

        return query
Ejemplo n.º 3
0
    def _get_linear_combination(self, weights, bias, columns):
        """
        This method generates the linear combination component of the LogisticRegression function.

        :param weights: the weights for a target class
        :param bias: the bias for a target class
        :param columns: the feature names
        :return: the portion of SQL query responsible for the application of the linear combination component of the
                 LogisticRegression function
        """

        dbms_util = DBMSUtils()

        query = ""
        for i in range(len(columns)):
            c = dbms_util.get_delimited_col(self.dbms, columns[i])
            query += "({}*{}) + ".format(c, weights[i])
        query = "{} {}".format(query, bias)

        return query
Ejemplo n.º 4
0
    def _get_query_dense_ohe(ohe_params: dict, table_name: str, dbms: str):
        """
        This method creates the SQL query that implements a dense one-hot-encoding transformation.

        :param ohe_params: dictionary containing the parameters extracted from the fitted OneHotEncoder
        :param table_name: the table name or the previous subquery where to read the data
        :param dbms: the name of the dbms
        :return: the SQL query that implements a dense one-hot-encoding transformation
        """

        ohe_params = OneHotEncoderSQL.check_ohe_params(ohe_params)
        assert isinstance(table_name, str)
        assert isinstance(dbms, str)

        ohe_map = ohe_params["ohe_encoding"]
        remaining_features = ohe_params["other_features"]

        dbms_utils = DBMSUtils()

        ohe_query = "SELECT "

        # implement one-hot encoding in SQL
        for feature_after_ohe in ohe_map:
            # feature_after_ohe = ohe_feature_map["feature_after_ohe"]
            fao = dbms_utils.get_delimited_col(dbms, feature_after_ohe)
            ohe_feature_map = ohe_map[feature_after_ohe]
            feature_before_ohe = dbms_utils.get_delimited_col(
                dbms, ohe_feature_map["feature_before_ohe"])
            value = ohe_feature_map["value"]

            ohe_query += "CASE WHEN {} = '{}' THEN 1 ELSE 0 END AS {},\n".format(
                feature_before_ohe, value, fao)

        # add the remaining features to the selection
        for f in remaining_features:
            ohe_query += "{},".format(dbms_utils.get_delimited_col(dbms, f))
        ohe_query = ohe_query[:-1]  # remove the last ','

        ohe_query += " FROM {}".format(table_name)

        return ohe_query
Ejemplo n.º 5
0
    def query(self, table_name):
        """
        This method creates the SQL query that implements into SQL an One Hot Encoding.

        :param table_name: the table name or the previous subquery where to read the data
        :return: the SQL query that implements the One Hot Encoding
        """

        assert isinstance(table_name,
                          str), "Wrong data type for param 'table_name'."
        assert self.params is not None, "No ohe params extracted."
        assert self.mode is not None, "No mode selected."

        dbms_util = DBMSUtils()
        auto_inc = dbms_util.get_auto_increment_col(self.dbms)
        # if the table provided in input is the result of a previous query
        if len(table_name) > 7 and table_name[-7:] == 'AS data':
            real_tab_name = 'data'
        else:  # only a table name is provided
            real_tab_name = table_name
        self.input_table_name = f'(select {auto_inc} AS {self.ohe_table_pk}, {real_tab_name}.* FROM {table_name}) AS T'

        # create the SQL query that performs the One Hot Encoding
        pre_ohe_queries = None
        if self.mode == 'dense':
            ohe_query = self._get_query_dense_ohe(self.params,
                                                  table_name,
                                                  dbms=self.dbms)

        elif self.mode == 'sparse':
            pre_ohe_queries, ohe_query = self._get_query_sparse_ohe(
                self.params)

        else:
            raise ValueError(f"Wrong mode ({self.mode}).")

        return pre_ohe_queries, ohe_query
Ejemplo n.º 6
0
    def get_sql_nested_rules(tree: BaseDecisionTree,
                             feature_names: list,
                             is_classification: bool,
                             dbms: str,
                             merge_ohe_features: dict = None):
        """
        This method extracts the rules from a BaseDecisionTree object and convert them in SQL.

        :param tree: BaseDecisionTree object
        :param feature_names: list of feature names
        :param is_classification: boolean flag that indicates whether the DTM is used in classification or regression
        :param dbms: the name of the dbms
        :param merge_ohe_features: (optional) ohe feature map to be merged in the decision rules
        :return: string containing the SQL query
        """

        assert isinstance(
            tree, BaseDecisionTree
        ), "Only BaseDecisionTree data type is allowed for param 'tree'."
        assert isinstance(
            feature_names,
            list), "Only list data type is allowed for param 'features_names'."
        for f in feature_names:
            assert isinstance(f, str)
        assert isinstance(
            is_classification, bool
        ), "Only bool data type is allowed for param 'is_classification'."
        if merge_ohe_features is not None:
            DTMSQL._check_merge_ohe_features(merge_ohe_features)

        dbms_util = DBMSUtils()

        # get for each node, left, right child nodes, thresholds and features
        left = tree.tree_.children_left  # left child for each node
        right = tree.tree_.children_right  # right child for each node
        thresholds = tree.tree_.threshold  # test threshold for each node
        features = [feature_names[i] for i in tree.tree_.feature]
        # features = tree.tree_.feature  # indexes of the features used by the tree
        if is_classification:
            classes = tree.classes_

        def visit_tree(node):

            # leaf node
            if left[node] == -1 and right[node] == -1:
                if is_classification:
                    return " {} ".format(classes[np.argmax(
                        tree.tree_.value[node][0])])
                else:
                    return " {} ".format(tree.tree_.value[node][0][0])

            # internal node
            op = '<='
            feature = features[node]
            thr = thresholds[node]

            if merge_ohe_features is not None:
                # if ohe features have to be merged in the decision tree, the tree conditions are changed
                #   feature_after_ohe > 0.5 becomes original_cat_feature = val
                #   feature_after_ohe <= 0.5 becomes original_cat_feature <> val
                if feature in merge_ohe_features:  # only categorical features should pass this test
                    mof = merge_ohe_features[feature]
                    feature = mof['feature_before_ohe']
                    thr = "'{}'".format(mof['value'])
                    op = '<>'

            sql_dtm_rule = f" CASE WHEN {dbms_util.get_delimited_col(dbms, feature)} {op} {thr} THEN"

            # check if current node has a left child
            if left[node] != -1:
                sql_dtm_rule += visit_tree(left[node])

            sql_dtm_rule += "ELSE"

            # check if current node has a right child
            if right[node] != -1:
                sql_dtm_rule += visit_tree(right[node])

            sql_dtm_rule += "END "

            return sql_dtm_rule

        # start tree visit from the root node
        root = 0
        sql_dtm_rules = visit_tree(root)

        return sql_dtm_rules
Ejemplo n.º 7
0
    def convert_rules_to_sql(rules: list,
                             dbms: str,
                             merge_ohe_features: dict = None):
        """
        This method converts the rules extracted from a BaseDecisionTree object into SQL case statements.

        :param rules: rules extracted from a BaseDecisionTree object
        :param dbms: the name of the dbms
        :param merge_ohe_features: (optional) ohe feature map to be merged in the decision rules
        :return: string containing the SQL query
        """

        DTMSQL._check_rule_format(rules)
        if merge_ohe_features is not None:
            DTMSQL._check_merge_ohe_features(merge_ohe_features)

        dbms_util = DBMSUtils()

        sql_query = ""
        sql_string = " CASE WHEN "
        # loop over the rules
        for item in rules:

            if not isinstance(item, tuple):  # the item is a leaf score
                sql_string = sql_string[:-5]  # remove 'WHEN '
                if sql_string == ' CASE ':  # case a tree is composed of only a leaf
                    sql_query += str(item)
                else:
                    sql_string += " THEN {} ".format(item)
                    sql_query += sql_string
                sql_string = "WHEN "
            else:  # the item is a rule condition
                op = item[1]
                thr = item[2]
                if op == 'l':
                    op = '<='
                elif op == 'r':
                    op = '>'
                else:  # when op is equals to '=' or '<>' the thr is a string
                    thr = "'{}'".format(thr)
                feature_name = item[3]

                if merge_ohe_features is not None:
                    # if ohe features have to be merged in the decision tree, the tree conditions are changed
                    #   feature_after_ohe > 0.5 becomes original_cat_feature = val
                    #   feature_after_ohe <= 0.5 becomes original_cat_feature <> val
                    if feature_name in merge_ohe_features:  # only categorical features should pass this test
                        mof = merge_ohe_features[feature_name]
                        feature_name = mof['feature_before_ohe']
                        old_op = op[:]
                        op = '='
                        if old_op == '<=':
                            if 0 <= thr:
                                op = '<>'
                        elif old_op == '>':
                            if 0 > thr:
                                op = '<>'
                        else:
                            raise ValueError("Wrong op.")

                        thr = "'{}'".format(mof['value'])

                feature_name = dbms_util.get_delimited_col(dbms, feature_name)
                sql_string += "{} {} {} and ".format(feature_name, op, thr)

        if 'CASE' in sql_query:  # ignore the case where a tree is composed of only a leaf
            sql_query += "END "

        return sql_query