def predict(self, idadf, column_id=None, outtable=None): """ Apply the K-means clustering model to new data. Parameters ---------- idadf : IdaDataFrame IdaDataFrame to be used as input. column_id : str The column of the input table that identifies a unique instance ID. Default: the same id column that is specified in the stored procedure to build the model. outtable : str The name of the output table where the assigned clusters are stored. If this parameter is not specified, it is generated automatically. If the parameter corresponds to an existing table in the database, it is replaced. Returns ------- IdaDataFrame IdaDataFrame containing the closest cluster for each data point referenced by its ID. """ if not type(idadf).__name__ == 'IdaDataFrame': raise TypeError("Argument should be an IdaDataFrame") # Check the ID if column_id is None: column_id = self._column_id if column_id not in idadf.columns: raise ValueError( "No id columns is available in IdaDataFrame:" + column_id + ". Either create a new ID column using add_column_id function" + " or give the name of a column that can be used as ID") if self._idadb is None: raise IdaKMeansError("No KMeans model was trained before") if outtable is None: outtable = idadf._idadb._get_valid_modelname('PREDICT_KMEANS_') else: if self.outtable: outtable = self.outtable outtable = ibmdbpy.utils.check_tablename(outtable) if idadf._idadb.exists_table(outtable): idadf._idadb.drop_table(outtable) self.outtable = outtable # Create a temporay view idadf.internal_state._create_view() tmp_view_name = idadf.internal_state.current_state if "." in tmp_view_name: tmp_view_name = tmp_view_name.split('.')[-1] try: idadf._idadb._call_stored_procedure("IDAX.PREDICT_KMEANS ", model=self.modelname, intable=tmp_view_name, id=column_id, outtable=self.outtable) except: raise finally: idadf.internal_state._delete_view() idadf._idadb.commit() self.labels_ = ibmdbpy.IdaDataFrame(idadf._idadb, outtable, indexer=column_id) return self.labels_
def _retrieve_KMeans_Model(self, modelname, verbose=False): """ Retrieve information about the model to print the results. The KMEANS IDAX function stores its result in 4 tables: * <MODELNAME>_MODEL * <MODELNAME>_COLUMNS * <MODELNAME>_COLUMN_STATISTICS * <MODELNAME>_CLUSTERS Parameters ---------- modelname : str The name of the model that is retrieved. verbose : bol, default: False Verbosity mode. """ modelname = ibmdbpy.utils.check_tablename(modelname) if self._idadb is None: raise IdaKMeansError("No KMeans model was trained before") model_main = self._idadb.ida_query('SELECT * FROM "' + self._idadb.current_schema + '"."' + modelname + '_MODEL"') # Woraround for specific version of ODBC model_main.columns = [ 'MODELCLASS', 'COMPARISONTYPE', 'COMPARISONMEASURE', 'NUMCLUSTERS' ] model_main.columns = [x.upper() for x in model_main.columns] col_info = self._idadb.ida_query( 'SELECT * FROM "' + self._idadb.current_schema + '"."' + modelname + '_COLUMNS"', ) col_info.columns = [ 'COLUMNNAME', 'DATATYPE', 'OPTYPE', 'USAGETYPE', 'COLUMNWEIGHT', 'AUTOTRANSFORM', 'TRANSFORMEDCOLUMN', 'COMPAREFUNCTION', 'IMPORTANCE', 'OUTLIERTREATMENT', 'LOWERLIMIT', 'UPPERLIMIT', 'CLOSURE', 'STATISTICSTYPE' ] col_info.columns = [x.upper() for x in col_info.columns] col_stats = self._idadb.ida_query('SELECT * FROM "' + self._idadb.current_schema + '"."' + modelname + '_COLUMN_STATISTICS"') col_stats.columns = [ 'CLUSTERID', 'COLUMNNAME', 'CARDINALITY', 'MODE', 'MINIMUM', 'MAXIMUM', 'MEAN', 'VARIANCE', 'VALIDFREQ', 'MISSINGFREQ', 'INVALIDFREQ', 'IMPORTANCE' ] col_stats.columns = [x.upper() for x in col_stats.columns] km_out_stat = self._idadb.ida_query('SELECT * FROM "' + self._idadb.current_schema + '"."' + modelname + '_CLUSTERS"') km_out_stat.columns = [ 'CLUSTERID', 'NAME', 'DESCRIPTION', 'SIZE', 'RELSIZE', 'WITHINSS' ] km_out_stat.columns = [x.upper() for x in km_out_stat.columns] k = model_main.iloc[0][3] distance = model_main.iloc[0][2] cont_cols = col_info.loc[(col_info['USAGETYPE'] == 'active') & (col_info['OPTYPE'] == 'continuous'), ['COLUMNNAME']] cat_cols = col_info.loc[(col_info['USAGETYPE'] == 'active') & (col_info['OPTYPE'] == 'categorical'), ['COLUMNNAME']] columns = [] for x in col_stats['COLUMNNAME'].values: if x not in columns: columns.append(x) clusters = km_out_stat['CLUSTERID'].values clusters.sort() cluster_centers = [] for cluster in clusters: tmp = [cluster] for column in columns: if column in cont_cols.values: tmp.append( col_stats.loc[(col_stats['CLUSTERID'] == cluster) & (col_stats['COLUMNNAME'] == column)] ['MEAN'].values[0]) elif column in cat_cols.values: tmp.append( col_stats.loc[(col_stats['CLUSTERID'] == cluster) & (col_stats['COLUMNNAME'] == column)] ['MODE'].values[0]) else: raise TypeError("Unexpected column category") cluster_centers.append(tmp) centers = pd.DataFrame([tuple(x) for x in cluster_centers]) centers.columns = ['CLUSTERID'] + columns if verbose is True: print("MODEL") print(model_main) print("COLUMNS") print(col_info) print("COLUMNS_STATISTICS") print(col_stats) print("CLUSTERS") print(km_out_stat) result = dict() result['withinss'] = km_out_stat['WITHINSS'].values result['size'] = km_out_stat['SIZE'].values result['relsize'] = km_out_stat['RELSIZE'].values result['distance'] = distance result['k'] = k result['centers'] = centers return result
def _retrieve_KMeans_Model(self, modelname, verbose=False): """ Retrieve information about the model to print the results. The KMEANS IDAX function stores its result in 4 tables: * <MODELNAME>_MODEL * <MODELNAME>_COLUMNS * <MODELNAME>_COLUMN_STATISTICS * <MODELNAME>_CLUSTERS Parameters ---------- modelname : str The name of the model that is retrieved. verbose : bol, default: False Verbosity mode. """ modelname = ibmdbpy.utils.check_tablename(modelname) if self._idadb is None: raise IdaKMeansError("No KMeans model was trained before") model_main = self._idadb.ida_query('SELECT * FROM "' + self._idadb.current_schema + '"."' + modelname + '_MODEL"') model_main.columns = [x.upper() for x in model_main.columns] col_info = self._idadb.ida_query( 'SELECT * FROM "' + self._idadb.current_schema + '"."' + modelname + '_COLUMNS"', ) col_info.columns = [x.upper() for x in col_info.columns] col_stats = self._idadb.ida_query('SELECT * FROM "' + self._idadb.current_schema + '"."' + modelname + '_COLUMN_STATISTICS"') col_stats.columns = [x.upper() for x in col_stats.columns] km_out_stat = self._idadb.ida_query('SELECT * FROM "' + self._idadb.current_schema + '"."' + modelname + '_CLUSTERS"') km_out_stat.columns = [x.upper() for x in km_out_stat.columns] k = model_main.iloc[0][3] distance = model_main.iloc[0][2] cont_cols = col_info.loc[(col_info['USAGETYPE'] == 'active') & (col_info['OPTYPE'] == 'continuous'), ['COLUMNNAME']] cat_cols = col_info.loc[(col_info['USAGETYPE'] == 'active') & (col_info['OPTYPE'] == 'categorical'), ['COLUMNNAME']] columns = [] for x in col_stats['COLUMNNAME'].values: if x not in columns: columns.append(x) clusters = km_out_stat['CLUSTERID'].values clusters.sort() cluster_centers = [] for cluster in clusters: tmp = [cluster] for column in columns: if column in cont_cols.values: tmp.append( col_stats.loc[(col_stats['CLUSTERID'] == cluster) & (col_stats['COLUMNNAME'] == column)] ['MEAN'].values[0]) elif column in cat_cols.values: tmp.append( col_stats.loc[(col_stats['CLUSTERID'] == cluster) & (col_stats['COLUMNNAME'] == column)] ['MODE'].values[0]) else: raise TypeError("Unexpected column category") cluster_centers.append(tmp) centers = pd.DataFrame([tuple(x) for x in cluster_centers]) centers.columns = ['CLUSTERID'] + columns if verbose is True: print("MODEL") print(model_main) print("COLUMNS") print(col_info) print("COLUMNS_STATISTICS") print(col_stats) print("CLUSTERS") print(km_out_stat) result = dict() result['withinss'] = km_out_stat['WITHINSS'].values result['size'] = km_out_stat['SIZE'].values result['relsize'] = km_out_stat['RELSIZE'].values result['distance'] = distance result['k'] = k result['centers'] = centers return result