Esempio n. 1
0
    def predict(self, idadf, column_id=None, outtable=None):
        """
        Apply the K-means clustering model to new data.

        Parameters
        ----------
        idadf : IdaDataFrame
            IdaDataFrame to be used as input.

        column_id : str
            The column of the input table that identifies a unique instance ID.
            Default: the same id column that is specified in the stored procedure to build the model.

        outtable : str
            The name of the output table where the assigned clusters are stored.
            If this parameter is not specified, it is generated automatically.
            If the parameter corresponds to an existing table in the database,
            it is replaced.

        Returns
        -------
        IdaDataFrame
            IdaDataFrame containing the closest cluster for each data point referenced by its ID.
        """
        if not type(idadf).__name__ == 'IdaDataFrame':
            raise TypeError("Argument should be an IdaDataFrame")

        # Check the ID
        if column_id is None:
            column_id = self._column_id
        if column_id not in idadf.columns:
            raise ValueError(
                "No id columns is available in IdaDataFrame:" + column_id +
                ". Either create a new ID column using add_column_id function"
                + " or give the name of a column that can be used as ID")

        if self._idadb is None:
            raise IdaKMeansError("No KMeans model was trained before")

        if outtable is None:
            outtable = idadf._idadb._get_valid_modelname('PREDICT_KMEANS_')
        else:
            if self.outtable:
                outtable = self.outtable
            outtable = ibmdbpy.utils.check_tablename(outtable)
            if idadf._idadb.exists_table(outtable):
                idadf._idadb.drop_table(outtable)

        self.outtable = outtable
        # Create a temporay view
        idadf.internal_state._create_view()
        tmp_view_name = idadf.internal_state.current_state

        if "." in tmp_view_name:
            tmp_view_name = tmp_view_name.split('.')[-1]

        try:
            idadf._idadb._call_stored_procedure("IDAX.PREDICT_KMEANS ",
                                                model=self.modelname,
                                                intable=tmp_view_name,
                                                id=column_id,
                                                outtable=self.outtable)
        except:
            raise
        finally:
            idadf.internal_state._delete_view()
            idadf._idadb.commit()

        self.labels_ = ibmdbpy.IdaDataFrame(idadf._idadb,
                                            outtable,
                                            indexer=column_id)
        return self.labels_
Esempio n. 2
0
    def _retrieve_KMeans_Model(self, modelname, verbose=False):
        """
        Retrieve information about the model to print the results. The KMEANS 
        IDAX function stores its result in 4 tables:
            * <MODELNAME>_MODEL
            * <MODELNAME>_COLUMNS
            * <MODELNAME>_COLUMN_STATISTICS
            * <MODELNAME>_CLUSTERS

        Parameters
        ----------
        modelname : str
            The name of the model that is retrieved.

        verbose : bol, default: False
            Verbosity mode.
        """
        modelname = ibmdbpy.utils.check_tablename(modelname)

        if self._idadb is None:
            raise IdaKMeansError("No KMeans model was trained before")

        model_main = self._idadb.ida_query('SELECT * FROM "' +
                                           self._idadb.current_schema + '"."' +
                                           modelname + '_MODEL"')
        # Woraround for specific version of ODBC
        model_main.columns = [
            'MODELCLASS', 'COMPARISONTYPE', 'COMPARISONMEASURE', 'NUMCLUSTERS'
        ]
        model_main.columns = [x.upper() for x in model_main.columns]

        col_info = self._idadb.ida_query(
            'SELECT * FROM "' + self._idadb.current_schema + '"."' +
            modelname + '_COLUMNS"', )
        col_info.columns = [
            'COLUMNNAME', 'DATATYPE', 'OPTYPE', 'USAGETYPE', 'COLUMNWEIGHT',
            'AUTOTRANSFORM', 'TRANSFORMEDCOLUMN', 'COMPAREFUNCTION',
            'IMPORTANCE', 'OUTLIERTREATMENT', 'LOWERLIMIT', 'UPPERLIMIT',
            'CLOSURE', 'STATISTICSTYPE'
        ]
        col_info.columns = [x.upper() for x in col_info.columns]

        col_stats = self._idadb.ida_query('SELECT * FROM "' +
                                          self._idadb.current_schema + '"."' +
                                          modelname + '_COLUMN_STATISTICS"')
        col_stats.columns = [
            'CLUSTERID', 'COLUMNNAME', 'CARDINALITY', 'MODE', 'MINIMUM',
            'MAXIMUM', 'MEAN', 'VARIANCE', 'VALIDFREQ', 'MISSINGFREQ',
            'INVALIDFREQ', 'IMPORTANCE'
        ]
        col_stats.columns = [x.upper() for x in col_stats.columns]

        km_out_stat = self._idadb.ida_query('SELECT * FROM "' +
                                            self._idadb.current_schema +
                                            '"."' + modelname + '_CLUSTERS"')
        km_out_stat.columns = [
            'CLUSTERID', 'NAME', 'DESCRIPTION', 'SIZE', 'RELSIZE', 'WITHINSS'
        ]
        km_out_stat.columns = [x.upper() for x in km_out_stat.columns]

        k = model_main.iloc[0][3]
        distance = model_main.iloc[0][2]
        cont_cols = col_info.loc[(col_info['USAGETYPE'] == 'active') &
                                 (col_info['OPTYPE'] == 'continuous'),
                                 ['COLUMNNAME']]
        cat_cols = col_info.loc[(col_info['USAGETYPE'] == 'active') &
                                (col_info['OPTYPE'] == 'categorical'),
                                ['COLUMNNAME']]

        columns = []
        for x in col_stats['COLUMNNAME'].values:
            if x not in columns:
                columns.append(x)

        clusters = km_out_stat['CLUSTERID'].values
        clusters.sort()

        cluster_centers = []
        for cluster in clusters:
            tmp = [cluster]
            for column in columns:
                if column in cont_cols.values:
                    tmp.append(
                        col_stats.loc[(col_stats['CLUSTERID'] == cluster)
                                      & (col_stats['COLUMNNAME'] == column)]
                        ['MEAN'].values[0])
                elif column in cat_cols.values:
                    tmp.append(
                        col_stats.loc[(col_stats['CLUSTERID'] == cluster)
                                      & (col_stats['COLUMNNAME'] == column)]
                        ['MODE'].values[0])
                else:
                    raise TypeError("Unexpected column category")
            cluster_centers.append(tmp)

        centers = pd.DataFrame([tuple(x) for x in cluster_centers])
        centers.columns = ['CLUSTERID'] + columns

        if verbose is True:
            print("MODEL")
            print(model_main)
            print("COLUMNS")
            print(col_info)
            print("COLUMNS_STATISTICS")
            print(col_stats)
            print("CLUSTERS")
            print(km_out_stat)

        result = dict()
        result['withinss'] = km_out_stat['WITHINSS'].values
        result['size'] = km_out_stat['SIZE'].values
        result['relsize'] = km_out_stat['RELSIZE'].values
        result['distance'] = distance

        result['k'] = k
        result['centers'] = centers

        return result
Esempio n. 3
0
    def _retrieve_KMeans_Model(self, modelname, verbose=False):
        """
        Retrieve information about the model to print the results. The KMEANS 
        IDAX function stores its result in 4 tables:
            * <MODELNAME>_MODEL
            * <MODELNAME>_COLUMNS
            * <MODELNAME>_COLUMN_STATISTICS
            * <MODELNAME>_CLUSTERS

        Parameters
        ----------
        modelname : str
            The name of the model that is retrieved.

        verbose : bol, default: False
            Verbosity mode.
        """
        modelname = ibmdbpy.utils.check_tablename(modelname)

        if self._idadb is None:
            raise IdaKMeansError("No KMeans model was trained before")

        model_main = self._idadb.ida_query('SELECT * FROM "' +
                                           self._idadb.current_schema + '"."' +
                                           modelname + '_MODEL"')
        model_main.columns = [x.upper() for x in model_main.columns]

        col_info = self._idadb.ida_query(
            'SELECT * FROM "' + self._idadb.current_schema + '"."' +
            modelname + '_COLUMNS"', )
        col_info.columns = [x.upper() for x in col_info.columns]

        col_stats = self._idadb.ida_query('SELECT * FROM "' +
                                          self._idadb.current_schema + '"."' +
                                          modelname + '_COLUMN_STATISTICS"')
        col_stats.columns = [x.upper() for x in col_stats.columns]

        km_out_stat = self._idadb.ida_query('SELECT * FROM "' +
                                            self._idadb.current_schema +
                                            '"."' + modelname + '_CLUSTERS"')
        km_out_stat.columns = [x.upper() for x in km_out_stat.columns]

        k = model_main.iloc[0][3]
        distance = model_main.iloc[0][2]
        cont_cols = col_info.loc[(col_info['USAGETYPE'] == 'active') &
                                 (col_info['OPTYPE'] == 'continuous'),
                                 ['COLUMNNAME']]
        cat_cols = col_info.loc[(col_info['USAGETYPE'] == 'active') &
                                (col_info['OPTYPE'] == 'categorical'),
                                ['COLUMNNAME']]

        columns = []
        for x in col_stats['COLUMNNAME'].values:
            if x not in columns:
                columns.append(x)

        clusters = km_out_stat['CLUSTERID'].values
        clusters.sort()

        cluster_centers = []
        for cluster in clusters:
            tmp = [cluster]
            for column in columns:
                if column in cont_cols.values:
                    tmp.append(
                        col_stats.loc[(col_stats['CLUSTERID'] == cluster)
                                      & (col_stats['COLUMNNAME'] == column)]
                        ['MEAN'].values[0])
                elif column in cat_cols.values:
                    tmp.append(
                        col_stats.loc[(col_stats['CLUSTERID'] == cluster)
                                      & (col_stats['COLUMNNAME'] == column)]
                        ['MODE'].values[0])
                else:
                    raise TypeError("Unexpected column category")
            cluster_centers.append(tmp)

        centers = pd.DataFrame([tuple(x) for x in cluster_centers])
        centers.columns = ['CLUSTERID'] + columns

        if verbose is True:
            print("MODEL")
            print(model_main)
            print("COLUMNS")
            print(col_info)
            print("COLUMNS_STATISTICS")
            print(col_stats)
            print("CLUSTERS")
            print(km_out_stat)

        result = dict()
        result['withinss'] = km_out_stat['WITHINSS'].values
        result['size'] = km_out_stat['SIZE'].values
        result['relsize'] = km_out_stat['RELSIZE'].values
        result['distance'] = distance

        result['k'] = k
        result['centers'] = centers

        return result