Example 1
def __init__(self, ndims=5, nn=25, k=10000, verbose=False):
    self.logger = logging.getLogger("T5Clustering")
    logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
                        level=logging.DEBUG)
    self.ndims = ndims
    self.nn = nn
    self.k = k
    self.data = None
    self.train_data = None
    self.test_data = None
    self.data_embedded = None
    self.sampling = None
    self.reducer = DimReducer(n_components=ndims, n_neighbors=nn)
    self.kmeans = KMeans(n_clusters=k, verbose=verbose)
Example 2
class T5_KMEANS:
    def __init__(self, ndims=5, nn=25, k=10000, verbose=False):
        self.logger = logging.getLogger("T5Clustering")
        logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
                            level=logging.DEBUG)
        self.ndims = ndims
        self.nn = nn
        self.k = k
        self.data = None
        self.train_data = None
        self.test_data = None
        self.data_embedded = None
        self.sampling = None
        self.reducer = DimReducer(n_components=ndims, n_neighbors=nn)
        self.kmeans = KMeans(n_clusters=k, verbose=verbose)

    def split(self, fname, sampling=False, train_size=1000000, test_size=None):
        self.data = read(fname, scale=True)
        self.sampling = sampling
        if sampling:
            self.train_data, self.test_data = split(self.data,
                                                    train_size=train_size,
                                                    test_size=test_size)
        else:
            self.train_data = self.data
        return self

    def reduce(self):
        self.logger.info("Dimensionality reduction (UMAP): %s",
                         self.reducer.umap.get_params)
        self.reducer.fit(self.train_data)
        del self.train_data
        step = min(500000, len(self.data))
        result = self.reducer.reduce(self.data[0:step], as_df=False)
        for i in range(step, len(self.data), step):
            result = np.append(result,
                               self.reducer.reduce(self.data[i:i + step],
                                                   as_df=False),
                               axis=0)
            self.logger.debug("Dim reduce batch : %d", i)
        self.data = None
        self.data_embedded = result
        return self

    def cluster(self):
        self.logger.info("Clustering ... (KMeans)")
        self.kmeans.fit(self.data_embedded)
        self.logger.info("Clusters: %d", len(self.kmeans.cluster_centers_))
        return self
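
Since every step returns self, the class can be driven as a fluent pipeline. A minimal usage sketch, assuming the project's own DimReducer and read helpers are importable; the file name is a placeholder:

# Hypothetical driver for the T5_KMEANS pipeline above
pipeline = T5_KMEANS(ndims=5, nn=25, k=10000)
pipeline.split('embeddings.csv', sampling=True, train_size=1000000) \
        .reduce() \
        .cluster()
labels = pipeline.kmeans.labels_  # one cluster id per embedded row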
Example 3
    def fit(self, X_train, y_train):
        """ Fit decision tree model """
        if 'XGBoost' in self.hpo_config.model_type:
            hpo_log.info('> fit xgboost model')
            dtrain = xgboost.DMatrix(data=X_train, label=y_train)
            num_boost_round = self.hpo_config.model_params['num_boost_round']
            trained_model = xgboost.train(dtrain=dtrain,
                                          params=self.hpo_config.model_params,
                                          num_boost_round=num_boost_round)

        elif 'RandomForest' in self.hpo_config.model_type:
            hpo_log.info('> fit randomforest model')
            trained_model = RandomForestClassifier(
                n_estimators=self.hpo_config.model_params['n_estimators'],
                max_depth=self.hpo_config.model_params['max_depth'],
                max_features=self.hpo_config.model_params['max_features'],
                n_bins=self.hpo_config.model_params['n_bins']).fit(
                    X_train, y_train.astype('int32'))

        elif 'KMeans' in self.hpo_config.model_type:
            hpo_log.info('> fit kmeans model')
            trained_model = KMeans(
                n_clusters=self.hpo_config.model_params['n_clusters'],
                max_iter=self.hpo_config.model_params['max_iter'],
                random_state=self.hpo_config.model_params['random_state'],
                init=self.hpo_config.model_params['init']).fit(X_train)

        else:
            raise ValueError('unsupported model type: '
                             + str(self.hpo_config.model_type))

        return trained_model
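
A hedged sketch of how this dispatcher might be exercised; the config object, its fields, and the `trainer` instance are hypothetical stand-ins for whatever the surrounding HPO harness actually provides:

from types import SimpleNamespace

# Hypothetical config: only model_type and model_params are assumed
hpo_config = SimpleNamespace(
    model_type='KMeans',
    model_params={'n_clusters': 8, 'max_iter': 300,
                  'random_state': 0, 'init': 'scalable-k-means++'})

trainer.hpo_config = hpo_config        # `trainer` owns the fit() above
model = trainer.fit(X_train, y_train)  # y_train is ignored on the KMeans path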
Example 4
def perform_kmeans(x, category, gpu_kmeans=False):
    if gpu_kmeans:  # accelerate with the GPU via cuML
        os.environ["CUDA_VISIBLE_DEVICES"] = '0'
        from cuml.cluster import KMeans
        clustering = KMeans(n_clusters=category,
                            n_init=10,
                            random_state=0,
                            output_type='numpy').fit(x)
        return clustering.labels_, clustering.cluster_centers_
    else:
        clustering = cluster.KMeans(n_clusters=category, random_state=0).fit(x)
        return clustering.labels_, clustering.cluster_centers_
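
For reference, a quick sketch of calling the helper on random data (CPU path; the GPU path additionally needs a CUDA device and cuML installed):

import numpy as np
from sklearn import cluster  # the CPU fallback used above

X = np.random.rand(1000, 16).astype('float32')
labels, centers = perform_kmeans(X, category=8)            # CPU
# labels, centers = perform_kmeans(X, 8, gpu_kmeans=True)  # GPU via cuML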
Example 5
import warnings

import cv2
import numpy as np
from tqdm import tqdm


def kmeans_prop_cuda_batch(fname, sample_rate, slice_w, slice_h, down_sample=0.25, num_clusters=8, batch_size=1):
    """Produce a "most frequent color" visualization strip by iterating
    over the frames of the source video and running a k-means clustering
    algorithm on CUDA.
    """

    from cuml.cluster import KMeans as KMeansCuda

    cap = cv2.VideoCapture(fname)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    num_slices = total_frames // sample_rate

    vis_output = np.zeros((slice_h, slice_w * num_slices, 3), dtype='uint8')
    cluster_model = KMeansCuda(n_clusters=num_clusters, init='scalable-k-means++', n_init=20)

    for i in tqdm(range(total_frames)):
        ret = cap.grab()
        if (i % sample_rate) == 0:
            ret, frame = cap.retrieve()
            temp_f = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
            reduc = cv2.resize(temp_f, None, fx=down_sample, fy=down_sample, interpolation=cv2.INTER_CUBIC).astype('float32')
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                cluster_model.fit(reduc.reshape((-1,3)))
                colors = np.around(cluster_model.cluster_centers_).astype('uint8')
            lbl, counts = np.unique(cluster_model.labels_, return_counts=True)
            cut = slice_h / np.sum(counts)
            ordering = np.argsort(counts)[::-1]
            tind = int(i // sample_rate)
            prev_ind = 0
            for val in ordering:  # do not reuse `i`: it is the outer frame counter
                height = int(round(cut * counts[val]))
                l_ind = (tind * slice_w)
                r_ind = (tind + 1) * slice_w
                vis_output[prev_ind:prev_ind+height, l_ind:r_ind] = colors[val]
                prev_ind += height
            
    cap.release()
    return vis_output
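
A minimal driver sketch for the function above; the input path and parameters are placeholders:

# Hypothetical call: one color-proportion column per 30th frame
strip = kmeans_prop_cuda_batch('input.mp4', sample_rate=30,
                               slice_w=8, slice_h=256)
# frames were channel-swapped inside the loop, so swap back before writing
cv2.imwrite('strip.png', cv2.cvtColor(strip, cv2.COLOR_RGB2BGR))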
Example 6
def create_content_class(df):
    dfs = []
    for i in range(2):
        tsne_df = pd.read_feather(f'../features/content_id_tsne_{i}_train.feather')
        col = f'content_id_tsne_{i}'
        tsne_df[col].fillna(-100, inplace=True)
        # standardize the t-SNE component before clustering
        tsne_df[col] = (tsne_df[col] - tsne_df[col].mean()) / tsne_df[col].std()
        dfs.append(tsne_df)

    tsne_array = pd.concat(dfs, axis=1).values
    pred = KMeans(n_clusters=N_CLUSTERS).fit_predict(tsne_array)

    df[f'content_id_class{N_CLUSTERS}'] = pred

    return df
Example 7
def make_clusters_KMeans(dat_to_cluster, nb_clust, n=15000):
    estimator = KMeans(n_clusters=nb_clust)
    if n < len(dat_to_cluster):
        # fit on a random subsample of n points, then predict the rest in batches
        s = np.asarray(sample(list(dat_to_cluster), k=n), dtype=np.float_)
        estimator = estimator.fit(s)
        data = list(dat_to_cluster)
        res = []
        for i in range(0, len(data) // n + 1):
            d = np.asarray(data[n * i: n * (i + 1)])
            if len(d) > 0:  # the last slice can be empty when len(data) is a multiple of n
                res.extend(estimator.predict(d))
    else:
        res = estimator.fit_predict(dat_to_cluster)
    return res, estimator
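
A small usage sketch for the sampled variant (sample comes from the random module; the data shape is arbitrary):

import numpy as np
from random import sample
from sklearn.cluster import KMeans

X = np.random.rand(50000, 4)
labels, est = make_clusters_KMeans(X, nb_clust=6, n=15000)
print(len(labels), est.cluster_centers_.shape)  # 50000 (6, 4)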
Example 8
    def fit(self, X, y=None) -> "KBinsDiscretizer":
        """
        Fit the estimator.

        Parameters
        ----------
        X : numeric array-like, shape (n_samples, n_features)
            Data to be discretized.

        y : None
            Ignored. This parameter exists only for compatibility with
            :class:`sklearn.pipeline.Pipeline`.

        Returns
        -------
        self
        """
        X = self._validate_data(X, dtype='numeric')

        valid_encode = ('onehot', 'onehot-dense', 'ordinal')
        if self.encode not in valid_encode:
            raise ValueError("Valid options for 'encode' are {}. "
                             "Got encode={!r} instead.".format(
                                 valid_encode, self.encode))
        valid_strategy = ('uniform', 'quantile', 'kmeans')
        if self.strategy not in valid_strategy:
            raise ValueError("Valid options for 'strategy' are {}. "
                             "Got strategy={!r} instead.".format(
                                 valid_strategy, self.strategy))

        n_features = X.shape[1]
        n_bins = self._validate_n_bins(n_features)
        n_bins = np.asnumpy(n_bins)  # `np` is CuPy in this module; asnumpy() copies to host

        bin_edges = cpu_np.zeros(n_features, dtype=object)
        for jj in range(n_features):
            column = X[:, jj]
            col_min, col_max = column.min(), column.max()

            if col_min == col_max:
                warnings.warn("Feature %d is constant and will be "
                              "replaced with 0." % jj)
                n_bins[jj] = 1
                bin_edges[jj] = np.array([-np.inf, np.inf])
                continue

            if self.strategy == 'uniform':
                bin_edges[jj] = np.linspace(col_min, col_max, n_bins[jj] + 1)

            elif self.strategy == 'quantile':
                quantiles = np.linspace(0, 100, n_bins[jj] + 1)
                bin_edges[jj] = np.asarray(np.percentile(column, quantiles))
                # Workaround for https://github.com/cupy/cupy/issues/4451
                # This should be removed as soon as a fix is available in cupy
                # in order to limit alterations in the included sklearn code
                bin_edges[jj][-1] = col_max

            elif self.strategy == 'kmeans':
                # Deterministic initialization with uniform spacing
                uniform_edges = np.linspace(col_min, col_max, n_bins[jj] + 1)
                init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5

                # 1D k-means procedure
                km = KMeans(n_clusters=n_bins[jj],
                            init=init,
                            n_init=1,
                            output_type='cupy')
                km = km.fit(column[:, None])
                with using_output_type('cupy'):
                    centers = km.cluster_centers_[:, 0]
                # Must sort, centers may be unsorted even with sorted init
                centers.sort()
                bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5
                bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max]

            # Remove bins whose width is too small (i.e., <= 1e-8)
            if self.strategy in ('quantile', 'kmeans'):
                mask = np.diff(bin_edges[jj], prepend=-np.inf) > 1e-8
                bin_edges[jj] = bin_edges[jj][mask]
                if len(bin_edges[jj]) - 1 != n_bins[jj]:
                    warnings.warn('Bins whose width are too small (i.e., <= '
                                  '1e-8) in feature %d are removed. Consider '
                                  'decreasing the number of bins.' % jj)
                    n_bins[jj] = len(bin_edges[jj]) - 1

        self.bin_edges_ = bin_edges
        self.n_bins_ = n_bins

        if 'onehot' in self.encode:
            self._encoder = OneHotEncoder(categories=np.array(
                [np.arange(i) for i in self.n_bins_]),
                                          sparse=self.encode == 'onehot',
                                          output_type='cupy')
            # Fit the OneHotEncoder with toy datasets
            # so that it's ready for use after the KBinsDiscretizer is fitted
            self._encoder.fit(np.zeros((1, len(self.n_bins_)), dtype=int))

        return self
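
This fit() mirrors cuML's GPU port of scikit-learn's KBinsDiscretizer. A minimal sketch of exercising the kmeans strategy, assuming a CUDA-capable environment (the import path varies across cuML versions, e.g. cuml.experimental.preprocessing in older releases):

import cupy as cp
from cuml.preprocessing import KBinsDiscretizer

X = cp.random.rand(1000, 3)
est = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='kmeans')
Xt = est.fit_transform(X)  # each value replaced by its bin index
print(est.bin_edges_[0])   # learned edges for the first feature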
Example 9
        warn_not_gpu_support(alg)
    elif alg.name == 'IsolationForest':
        from sklearn.ensemble import IsolationForest
        model = IsolationForest(**alg.input_variables.__dict__)
        warn_not_gpu_support(alg)
    # -------------------------------------------------------------
    # Clustering algorithms
    #
    elif alg.name == 'MeanShift':
        from sklearn.cluster import MeanShift
        model = MeanShift(**alg.input_variables.__dict__)
        warn_not_gpu_support(alg)
    elif alg.name == 'KMeans':
        if NVIDIA_RAPIDS_ENABLED:
            from cuml.cluster import KMeans
            model = KMeans(**alg.input_variables.__dict__)
        else:
            from sklearn.cluster import KMeans
            model = KMeans(**alg.input_variables.__dict__)
# -------------------------------------------------------------
dataframe_train = None
dataframe_label = None
model_explainer = None
loss = 0
if model is not None:
    print('-' * 30)
    print(model)
    print('-' * 30)

    if is_labeled_data:
        dataframe_train = dataframe.drop([LABEL_COLUMN], axis=1)
def make_clusters(dat_to_cluster, nb_clust=4):
    estimator = KMeans(n_clusters=nb_clust)
    res = estimator.fit_predict(dat_to_cluster)
    return res, estimator
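
And a one-line usage sketch for this unsampled variant (assumes numpy and sklearn's KMeans, as above):

import numpy as np

labels, est = make_clusters(np.random.rand(200, 2), nb_clust=4)
print(np.bincount(labels))  # cluster sizes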