Example no. 1
0
    def run_gaussian_initial_starting_points(self, nb_clusters, src_file, data_without_target, dataset_name,
                                             initial_clusters_file, initial_clusters, run_number, run_info=None):
        """Fit a TensorFlow GMM from caller-supplied starting clusters and save the results.

        The clustering assignment and the final centroids are written to the
        files produced by ``_prepare_files``; returns ``(output_file,
        {"centroids": centroids_file})``.
        """
        import tensorflow as tf

        output_file, centroids_file = self._prepare_files(dataset_name, run_info, True)

        # Make the run reproducible when a seed was configured on the toolkit.
        if self.seed is not None:
            tf.set_random_seed(self.seed)

        feature_matrix = data_without_target.values

        # Feed the whole dataset as one constant float32 tensor, no labels.
        def input_fn():
            return constant_op.constant(feature_matrix.astype(np.float32)), None

        model = tf.contrib.factorization.GMM(num_clusters=nb_clusters, initial_clusters=initial_clusters)
        model.fit(input_fn=input_fn, steps=1)

        assignments = list(model.predict_assignments())
        ClusteringToolkit._save_clustering(TensorFlow._clustering_to_list(feature_matrix, assignments), output_file)
        ClusteringToolkit._save_centroids(TensorFlow._centroids_to_list(model), centroids_file)

        return output_file, {"centroids": centroids_file}
Example no. 2
0
    def run_kmeans_base(self,
                        nb_clusters,
                        src_file,
                        data_without_target,
                        dataset_name,
                        run_number,
                        config_function,
                        run_info=None,
                        nb_iterations=None):
        """Run Shogun k-means over the feature matrix and persist the results.

        ``config_function``, when given, receives the KMeans object before
        training so the caller can tweak it; ``nb_iterations`` caps the
        iteration count. Returns ``(output_file, {"centroids": centroids_file})``.
        """
        self._init()
        output_file, centroids_file = self._prepare_files(dataset_name, run_info, True)

        # Shogun expects features column-wise, hence the transpose.
        features = shogun.RealFeatures(data_without_target.values.astype("float64").transpose())
        # Euclidean distance metric over the feature matrix.
        metric = shogun.EuclideanDistance(features, features)

        kmeans = shogun.KMeans(nb_clusters, metric)

        # Let the caller customize the model before training.
        if config_function is not None:
            config_function(kmeans)
        if nb_iterations is not None:
            kmeans.set_max_iter(nb_iterations)

        centers, assignments = Shogun._kmeans_process(kmeans)
        ClusteringToolkit._save_clustering(Shogun._clustering_to_list(data_without_target, assignments), output_file)
        ClusteringToolkit._save_centroids(Shogun._centroids_to_list(centers), centroids_file)

        return output_file, {"centroids": centroids_file}
Example no. 3
0
    def run_meanshift(self, nb_clusters, src_file, data_without_target, dataset_name, run_number, run_info=None):
        """Run scikit-learn MeanShift clustering and persist labels and centroids.

        Note: ``nb_clusters`` is not used here — MeanShift determines the
        number of clusters itself; the parameter is kept for a uniform
        runner signature. Returns ``(output_file, {"centroids": centroids_file})``.
        """
        self._init()
        output_file, centroids_file = self._prepare_files(dataset_name, run_info, True)

        # Fit the model on the raw feature data.
        mean_shift = sklearn.cluster.MeanShift()
        mean_shift.fit(data_without_target)

        self._save_clustering(self._clustering_to_list(data_without_target, mean_shift.labels_), output_file)
        ClusteringToolkit._save_centroids(self._centroids_to_list(mean_shift), centroids_file)

        return output_file, {"centroids": centroids_file}
Example no. 4
0
    def run_kmeans(self, nb_clusters, src_file, data_without_target, dataset_name, initial_clusters_file,
                   initial_clusters, run_number, run_info=None, nb_iterations=None):
        """Run full-batch TensorFlow k-means from given initial clusters and save results.

        Returns ``(output_file, {"centroids": centroids_file})``.
        """
        import tensorflow as tf

        output_file, centroids_file = self._prepare_files(dataset_name, run_info, True)

        # Make the run reproducible when a seed was configured on the toolkit.
        if self.seed is not None:
            tf.set_random_seed(self.seed)

        estimator = tf.contrib.factorization.KMeansClustering(
            num_clusters=nb_clusters, initial_clusters=initial_clusters, use_mini_batch=False)

        points, input_fn = TensorFlow._build_points_and_input_fn(data_without_target)
        # Default to 10 training steps when the caller did not specify any.
        steps = nb_iterations if nb_iterations is not None else 10
        TensorFlow._train_kpp(input_fn, estimator, steps)

        assignments = list(estimator.predict_cluster_index(input_fn))
        ClusteringToolkit._save_clustering(TensorFlow._clustering_to_list(points, assignments), output_file)
        ClusteringToolkit._save_centroids(TensorFlow._centroids_to_list(estimator), centroids_file)

        return output_file, {"centroids": centroids_file}
Example no. 5
0
    def base_kmeans_specified_init(self, nb_clusters, src_file, data_without_target, dataset_name, run_number,
                                   init, run_info=None, nb_iterations=None):
        """Run scikit-learn KMeans with a caller-specified initialization and save results.

        ``init`` is forwarded to ``sklearn.cluster.KMeans``; a single
        initialization run is used (``n_init=1``) since the starting point is
        fixed. Optional ``self.tolerance`` and ``nb_iterations`` map to the
        ``tol`` and ``max_iter`` parameters. Returns
        ``(output_file, {"centroids": centroids_file})``.
        """
        self._init()
        output_file, centroids_file = self._prepare_files(dataset_name, run_info, True)

        # Single initialization: the starting centroids come from `init`.
        kmeans_kwargs = {"n_clusters": nb_clusters, "init": init, "n_init": 1}
        if self.tolerance is not None:
            kmeans_kwargs['tol'] = self.tolerance
        if nb_iterations is not None:
            kmeans_kwargs['max_iter'] = nb_iterations

        model = sklearn.cluster.KMeans(**kmeans_kwargs)
        model.fit(data_without_target)

        ClusteringToolkit._save_clustering(self._clustering_to_list(data_without_target, model.labels_),
                                           output_file)
        ClusteringToolkit._save_centroids(self._centroids_to_list(model), centroids_file)

        return output_file, {"centroids": centroids_file}
Example no. 6
0
def read_or_draw_centroids(dataset_name,
                           run_info,
                           nb_clusters,
                           data,
                           redirect_output=None):
    """Return ``(path, initial_clusters)``, drawing new centroids only when needed.

    If the centroid file for this run already exists it is re-read (to get the
    float32 values TensorFlow requires); otherwise a fresh random set of
    ``nb_clusters`` centroids is drawn from ``data`` and written to the file,
    so every algorithm shares the same starting point.
    """
    clusters_path = ClusteringToolkit.dataset_out_file_name_static(
        dataset_name, "{}.init_set_clusters".format(run_info))

    # Optionally relocate the file into a different output directory.
    if redirect_output is not None:
        clusters_path = os.path.join(redirect_output, os.path.basename(clusters_path))

    if os.path.exists(clusters_path):
        # Reread to get float32 type (required by TF).
        initial_clusters = read_centroids_file(clusters_path)
    else:
        # Draw one random centroid set shared by *all* algorithms.
        initial_clusters = draw_centroids(nb_clusters, data, clusters_path)

    return clusters_path, initial_clusters
Example no. 7
0
 def _save_run(ret, data_without_target, output_file, centroids_file):
     """Persist labels and centers from an OpenCV k-means result tuple.

     ``ret[1]`` holds the per-point labels and ``ret[2]`` the cluster
     centers — presumably the ``cv2.kmeans`` return tuple; confirm at the
     call site.
     """
     labels = ret[1]
     centers = ret[2]
     ClusteringToolkit._save_clustering(OpenCV._clustering_to_list(data_without_target, labels), output_file)
     ClusteringToolkit._save_centroids(OpenCV._centroids_to_list(centers), centroids_file)