class T5_KMEANS:
    def __init__(self, ndims=5, nn=25, k=10000, verbose=False):
        self.logger = logging.getLogger("T5Clustering")
        logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
                            level=logging.DEBUG)
        self.ndims = ndims
        self.nn = nn
        self.k = k
        self.data = None
        self.train_data = None
        self.test_data = None
        self.data_embedded = None
        self.sampling = None
        self.reducer = DimReducer(n_components=ndims, n_neighbors=nn)
        self.kmeans = KMeans(n_clusters=k, verbose=verbose)

    def split(self, fname, sampling=False, train_size=1000000, test_size=None):
        self.data = read(fname, scale=True)
        self.sampling = sampling
        if sampling:
            # Hold out a test split; assumes a module-level split() helper that
            # takes the data plus train/test sizes (the original call passed no
            # data and dropped the test portion into a local variable).
            self.train_data, self.test_data = split(self.data,
                                                    train_size=train_size,
                                                    test_size=test_size)
        else:
            self.train_data = self.data
        return self

    def reduce(self):
        self.logger.info("Dimensionality reduction (UMAP): %s",
                         self.reducer.umap.get_params())
        self.reducer.fit(self.train_data)
        del self.train_data
        # Embed the full dataset in fixed-size batches to bound memory usage.
        step = min(500000, len(self.data))
        result = self.reducer.reduce(self.data[0:step], as_df=False)
        for i in range(step, len(self.data), step):
            result = np.append(result,
                               self.reducer.reduce(self.data[i:i + step], as_df=False),
                               axis=0)
            self.logger.debug("Dim reduce batch : %d", i)
        self.data = None
        self.data_embedded = result
        return self

    def cluster(self):
        self.logger.info("Clustering ... (KMeans)")
        self.kmeans.fit(self.data_embedded)
        self.logger.info("Clusters: %d", len(self.kmeans.cluster_centers_))
        return self
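# Hypothetical end-to-end usage of T5_KMEANS above; the file name and sizes are
# placeholders, and read/DimReducer/KMeans are assumed to be the project's own
# loader, UMAP wrapper, and (cuML or scikit-learn) KMeans respectively.
#
#   pipeline = T5_KMEANS(ndims=5, nn=25, k=10000)
#   pipeline.split("t5_embeddings.parquet", sampling=True, train_size=1000000)
#   pipeline.reduce().cluster()
#   labels = pipeline.kmeans.labels_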
def fit(self, X_train, y_train):
    """Fit the model selected by hpo_config.model_type (XGBoost, RandomForest, or KMeans)."""
    if 'XGBoost' in self.hpo_config.model_type:
        hpo_log.info('> fit xgboost model')
        dtrain = xgboost.DMatrix(data=X_train, label=y_train)
        num_boost_round = self.hpo_config.model_params['num_boost_round']
        trained_model = xgboost.train(dtrain=dtrain,
                                      params=self.hpo_config.model_params,
                                      num_boost_round=num_boost_round)
    elif 'RandomForest' in self.hpo_config.model_type:
        hpo_log.info('> fit randomforest model')
        trained_model = RandomForestClassifier(
            n_estimators=self.hpo_config.model_params['n_estimators'],
            max_depth=self.hpo_config.model_params['max_depth'],
            max_features=self.hpo_config.model_params['max_features'],
            n_bins=self.hpo_config.model_params['n_bins']).fit(
                X_train, y_train.astype('int32'))
    elif 'KMeans' in self.hpo_config.model_type:
        hpo_log.info('> fit kmeans model')
        trained_model = KMeans(
            n_clusters=self.hpo_config.model_params['n_clusters'],
            max_iter=self.hpo_config.model_params['max_iter'],
            random_state=self.hpo_config.model_params['random_state'],
            init=self.hpo_config.model_params['init']).fit(X_train)
    return trained_model
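# Sketch of the configuration the KMeans branch above reads; the attribute and
# key names mirror the lookups in fit(), but the concrete values are
# illustrative assumptions, not from the source.
#
#   hpo_config.model_type = 'KMeans'
#   hpo_config.model_params = {
#       'n_clusters': 8,
#       'max_iter': 300,
#       'random_state': 0,
#       'init': 'scalable-k-means++',  # cuML-style init; scikit-learn would use 'k-means++'
#   }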
import os

from sklearn import cluster


def perform_kmeans(x, category, gpu_kmeans=False):
    if gpu_kmeans:
        # Accelerate clustering on GPU 0 via cuML's KMeans.
        os.environ["CUDA_VISIBLE_DEVICES"] = '0'
        from cuml.cluster import KMeans
        clustering = KMeans(n_clusters=category, n_init=10, random_state=0,
                            output_type='numpy').fit(x)
        return clustering.labels_, clustering.cluster_centers_
    else:
        # CPU fallback: scikit-learn's KMeans.
        clustering = cluster.KMeans(n_clusters=category, random_state=0).fit(x)
        return clustering.labels_, clustering.cluster_centers_
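# Minimal usage sketch for perform_kmeans; the random input below is
# illustrative, and the CPU path is used so no GPU or cuML is required.
import numpy as np

x_demo = np.random.rand(500, 8).astype('float32')
labels, centers = perform_kmeans(x_demo, category=4, gpu_kmeans=False)
print(labels.shape, centers.shape)  # -> (500,), (4, 8)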
import warnings

import cv2
import numpy as np
from tqdm import tqdm


def kmeans_prop_cuda_batch(fname, sample_rate, slice_w, slice_h,
                           down_sample=0.25, num_clusters=8, batch_size=1):
    """Produce a "most frequent colors" visualization by iterating over the
    frames of the source video and clustering the pixels of sampled frames
    with cuML's CUDA k-means.
    """
    from cuml.cluster import KMeans as KMeansCuda

    cap = cv2.VideoCapture(fname)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    num_slices = total_frames // sample_rate
    vis_output = np.zeros((slice_h, slice_w * num_slices, 3), dtype='uint8')
    cluster_model = KMeansCuda(n_clusters=num_clusters,
                               init='scalable-k-means++', n_init=20)
    for i in tqdm(range(total_frames)):
        ret = cap.grab()
        if (i % sample_rate) == 0:
            ret, frame = cap.retrieve()
            temp_f = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
            reduc = cv2.resize(temp_f, None, fx=down_sample, fy=down_sample,
                               interpolation=cv2.INTER_CUBIC).astype('float32')
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                cluster_model.fit(reduc.reshape((-1, 3)))
            colors = np.around(cluster_model.cluster_centers_).astype('uint8')
            lbl, counts = np.unique(cluster_model.labels_, return_counts=True)
            cut = slice_h / np.sum(counts)
            ordering = np.argsort(counts)[::-1]
            tind = int(i // sample_rate)
            prev_ind = 0
            # Stack the cluster colors for this sampled frame, most frequent first.
            for val in ordering:
                height = int(round(cut * counts[val]))
                l_ind = (tind * slice_w)
                r_ind = (tind + 1) * slice_w
                vis_output[prev_ind:prev_ind + height, l_ind:r_ind] = colors[val]
                prev_ind += height
    cap.release()
    return vis_output
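# Usage sketch (the video path and output name are placeholders; requires a
# CUDA-capable GPU with cuML installed):
#
#   vis = kmeans_prop_cuda_batch("clip.mp4", sample_rate=30, slice_w=8, slice_h=256)
#   cv2.imwrite("clip_colors.png", vis)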
def create_content_class(df):
    dfs = []
    for i in range(2):
        tsne_df = pd.read_feather(f'../features/content_id_tsne_{i}_train.feather')
        tsne_df[f'content_id_tsne_{i}'].fillna(-100, inplace=True)
        # Standardize each t-SNE component before clustering.
        tsne_df[f'content_id_tsne_{i}'] = (
            (tsne_df[f'content_id_tsne_{i}'] - tsne_df[f'content_id_tsne_{i}'].mean())
            / tsne_df[f'content_id_tsne_{i}'].std())
        dfs.append(tsne_df)
    tsne_array = pd.concat(dfs, axis=1).values
    pred = KMeans(n_clusters=N_CLUSTERS).fit_predict(tsne_array)
    df[f'content_id_class{N_CLUSTERS}'] = pred
    return df
from random import sample

import numpy as np


def make_clusters_KMeans(dat_to_cluster, nb_clust, n=15000):
    estimator = KMeans(n_clusters=nb_clust)
    if n < len(dat_to_cluster):
        # Fit on a random sub-sample of n points, then label the full data in batches.
        s = np.asarray(sample(list(dat_to_cluster), k=n), dtype=np.float64)
        estimator = estimator.fit(s)
        res = []
        notinit = True
        for i in range(0, len(dat_to_cluster) // n + 1):
            d = np.asarray(list(dat_to_cluster)[n * i: n * (i + 1)])
            if len(d) == 0:
                # Skip the empty trailing batch when len(dat_to_cluster) is a multiple of n.
                continue
            if notinit:
                res = list(estimator.predict(d))
                notinit = False
            else:
                a = list(estimator.predict(d))
                for j in a:
                    res.append(j)
    else:
        res = estimator.fit_predict(dat_to_cluster)
    return res, estimator
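# Illustrative call for make_clusters_KMeans; the random data is a stand-in and
# KMeans is assumed here to be scikit-learn's (cuML's KMeans exposes the same
# fit/predict interface).
from sklearn.cluster import KMeans

demo = np.random.rand(20000, 3)
labels, km = make_clusters_KMeans(demo, nb_clust=6, n=15000)
print(len(labels), km.cluster_centers_.shape)  # -> 20000, (6, 3)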
def fit(self, X, y=None) -> "KBinsDiscretizer":
    """
    Fit the estimator.

    Parameters
    ----------
    X : numeric array-like, shape (n_samples, n_features)
        Data to be discretized.

    y : None
        Ignored. This parameter exists only for compatibility with
        :class:`sklearn.pipeline.Pipeline`.

    Returns
    -------
    self
    """
    X = self._validate_data(X, dtype='numeric')

    valid_encode = ('onehot', 'onehot-dense', 'ordinal')
    if self.encode not in valid_encode:
        raise ValueError("Valid options for 'encode' are {}. "
                         "Got encode={!r} instead."
                         .format(valid_encode, self.encode))
    valid_strategy = ('uniform', 'quantile', 'kmeans')
    if self.strategy not in valid_strategy:
        raise ValueError("Valid options for 'strategy' are {}. "
                         "Got strategy={!r} instead."
                         .format(valid_strategy, self.strategy))

    n_features = X.shape[1]
    n_bins = self._validate_n_bins(n_features)
    n_bins = np.asnumpy(n_bins)

    bin_edges = cpu_np.zeros(n_features, dtype=object)
    for jj in range(n_features):
        column = X[:, jj]
        col_min, col_max = column.min(), column.max()

        if col_min == col_max:
            warnings.warn("Feature %d is constant and will be "
                          "replaced with 0." % jj)
            n_bins[jj] = 1
            bin_edges[jj] = np.array([-np.inf, np.inf])
            continue

        if self.strategy == 'uniform':
            bin_edges[jj] = np.linspace(col_min, col_max, n_bins[jj] + 1)

        elif self.strategy == 'quantile':
            quantiles = np.linspace(0, 100, n_bins[jj] + 1)
            bin_edges[jj] = np.asarray(np.percentile(column, quantiles))
            # Workaround for https://github.com/cupy/cupy/issues/4451
            # This should be removed as soon as a fix is available in cupy
            # in order to limit alterations in the included sklearn code
            bin_edges[jj][-1] = col_max

        elif self.strategy == 'kmeans':
            # Deterministic initialization with uniform spacing
            uniform_edges = np.linspace(col_min, col_max, n_bins[jj] + 1)
            init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5

            # 1D k-means procedure
            km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1,
                        output_type='cupy')
            km = km.fit(column[:, None])
            with using_output_type('cupy'):
                centers = km.cluster_centers_[:, 0]

            # Must sort, centers may be unsorted even with sorted init
            centers.sort()
            bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5
            bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max]

        # Remove bins whose width are too small (i.e., <= 1e-8)
        if self.strategy in ('quantile', 'kmeans'):
            mask = np.diff(bin_edges[jj], prepend=-np.inf) > 1e-8
            bin_edges[jj] = bin_edges[jj][mask]
            if len(bin_edges[jj]) - 1 != n_bins[jj]:
                warnings.warn('Bins whose width are too small (i.e., <= '
                              '1e-8) in feature %d are removed. Consider '
                              'decreasing the number of bins.' % jj)
                n_bins[jj] = len(bin_edges[jj]) - 1

    self.bin_edges_ = bin_edges
    self.n_bins_ = n_bins

    if 'onehot' in self.encode:
        self._encoder = OneHotEncoder(
            categories=np.array([np.arange(i) for i in self.n_bins_]),
            sparse=self.encode == 'onehot',
            output_type='cupy')
        # Fit the OneHotEncoder with toy datasets
        # so that it's ready for use after the KBinsDiscretizer is fitted
        self._encoder.fit(np.zeros((1, len(self.n_bins_)), dtype=int))

    return self
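# The fit() above is a GPU port of scikit-learn's KBinsDiscretizer. A minimal
# CPU sketch of the equivalent public API with the 'kmeans' strategy is shown
# below; cuML ships a GPU version of this class in its preprocessing module.
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

X = np.random.rand(100, 2)
kbd = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='kmeans')
X_binned = kbd.fit_transform(X)  # each value replaced by its bin index (0..3)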
    warn_not_gpu_support(alg)
elif alg.name == 'IsolationForest':
    from sklearn.ensemble import IsolationForest
    model = IsolationForest(**alg.input_variables.__dict__)
    warn_not_gpu_support(alg)
# -------------------------------------------------------------
# Clustering algorithms
#
elif alg.name == 'MeanShift':
    from sklearn.cluster import MeanShift
    model = MeanShift(**alg.input_variables.__dict__)
    warn_not_gpu_support(alg)
elif alg.name == 'KMeans':
    if NVIDIA_RAPIDS_ENABLED:
        from cuml.cluster import KMeans
        model = KMeans(**alg.input_variables.__dict__)
    else:
        from sklearn.cluster import KMeans
        model = KMeans(**alg.input_variables.__dict__)
# -------------------------------------------------------------

dataframe_train = None
dataframe_label = None
model_explainer = None
loss = 0

if model is not None:
    print('-' * 30)
    print(model)
    print('-' * 30)
    if is_labeled_data:
        dataframe_train = dataframe.drop([LABEL_COLUMN], axis=1)
def make_clusters(dat_to_cluster, nb_clust=4):
    estimator = KMeans(n_clusters=nb_clust)
    res = estimator.fit_predict(dat_to_cluster)
    return res, estimator