Beispiel #1
0
    def _init_estimate(self):
        """Reset the estimator state and prepare the selection of initial centers.

        Effects visible in this method:

        * ``_cluster_centers_iter``, ``_init_centers_indices`` and ``_t_total``
          are reset.
        * If ``n_clusters`` is falsy it is set to ``min(int(sqrt(N)), 5000)``,
          where ``N`` is the sum of the strided trajectory lengths, and the
          choice is logged.
        * Progress stages are registered: stage 0 only for the ``'kmeans++'``
          strategy, stage 1 always (with ``max_iter`` steps).
        * ``_init_in_memory_chunks`` is called with the total length.
        * For the ``'uniform'`` strategy, random frame indices are drawn per
          trajectory and stored in ``_init_centers_indices`` keyed by the
          trajectory index.

        Returns
        -------
        stride
            ``self.stride``, or ``1`` when ``self.stride`` is falsy.
        """
        # mini-batch sets stride to None
        stride = self.stride if self.stride else 1

        # reset per-estimation state
        self._cluster_centers_iter = []
        self._init_centers_indices = {}
        self._t_total = 0

        traj_lengths = self.trajectory_lengths(stride=stride)
        total_length = sum(traj_lengths)

        if not self.n_clusters:
            self.n_clusters = min(int(math.sqrt(total_length)), 5000)
            # The value is handed over as a logging argument, so the message is
            # only formatted when the record is actually emitted.
            self._logger.info("The number of cluster centers was not specified, "
                              "using min(sqrt(N), 5000)=%s as n_clusters.", self.n_clusters)

        if self.init_strategy == 'kmeans++':
            self._progress_register(self.n_clusters,
                                    description="initialize kmeans++ centers", stage=0)
        self._progress_register(self.max_iter, description="kmeans iterations", stage=1)
        self._init_in_memory_chunks(total_length)

        if self.init_strategy == 'uniform':
            # Draw random frames from each trajectory so that the number of
            # samples per trajectory is proportional to its share of the total
            # length (rounded up per trajectory).
            with conditional(self.fixed_seed, random_seed(42)):
                for idx, traj_len in enumerate(traj_lengths):
                    n_samples = int(math.ceil((traj_len / float(total_length)) * self.n_clusters))
                    # Sample from the range object directly: no need to build a
                    # list holding every frame index of the trajectory.
                    self._init_centers_indices[idx] = random.sample(range(traj_len), n_samples)

        return stride
Beispiel #2
0
    def _init_estimate(self):
        """Reset the estimator state and prepare the selection of initial centers.

        Effects visible in this method:

        * ``_init_centers_indices`` and ``_t_total`` are reset.
        * If ``n_clusters`` is falsy it is set to ``min(int(sqrt(N)), 5000)``,
          where ``N`` is the sum of the trajectory lengths (after ``stride``
          and ``skip``), and the choice is logged.
        * Progress stages are registered: a ``'data'`` stage when this is not a
          ``MiniBatchKmeansClustering`` and the data producer is not a
          ``DataInMemory``; stage 0 only for the ``'kmeans++'`` strategy;
          stage 1 always (with ``max_iter`` steps).
        * ``_init_in_memory_chunks`` is called with the total length.
        * For the ``'uniform'`` strategy, random frame indices are drawn per
          trajectory and stored in ``_init_centers_indices`` keyed by the
          trajectory index.
        * ``_inst`` is set to a new ``Kmeans_f`` object from the extension
          module.

        Returns
        -------
        stride
            ``self.stride``, or ``1`` when ``self.stride`` is falsy.
        """
        # mini-batch sets stride to None
        stride = self.stride if self.stride else 1

        # reset per-estimation state
        self._init_centers_indices = {}
        self._t_total = 0

        traj_lengths = self.trajectory_lengths(stride=stride, skip=self.skip)
        total_length = sum(traj_lengths)

        if not self.n_clusters:
            self.n_clusters = min(int(math.sqrt(total_length)), 5000)
            # The value is handed over as a logging argument, so the message is
            # only formatted when the record is actually emitted.
            self.logger.info("The number of cluster centers was not specified, "
                              "using min(sqrt(N), 5000)=%s as n_clusters.", self.n_clusters)

        from pyemma.coordinates.data import DataInMemory
        if not isinstance(self, MiniBatchKmeansClustering) and not isinstance(self.data_producer, DataInMemory):
            # Use the normalised stride (never None/0) so the chunk count is
            # computed with the same stride as the trajectory lengths above.
            n_chunks = self.data_producer.n_chunks(chunksize=self.chunksize, skip=self.skip, stride=stride)
            self._progress_register(n_chunks, description="creating data array", stage='data')

        if self.init_strategy == 'kmeans++':
            self._progress_register(self.n_clusters,
                                    description="initialize kmeans++ centers", stage=0)
        self._progress_register(self.max_iter, description="kmeans iterations", stage=1)
        self._init_in_memory_chunks(total_length)

        if self.init_strategy == 'uniform':
            # Draw random frames from each trajectory so that the number of
            # samples per trajectory is proportional to its share of the total
            # length (rounded up per trajectory).
            with random_seed(self.fixed_seed):
                for idx, traj_len in enumerate(traj_lengths):
                    n_samples = int(math.ceil((traj_len / float(total_length)) * self.n_clusters))
                    # Sample from the range object directly: no need to build a
                    # list holding every frame index of the trajectory.
                    self._init_centers_indices[idx] = random.sample(range(traj_len), n_samples)

        from ._ext import kmeans as kmeans_mod
        self._inst = kmeans_mod.Kmeans_f(self.n_clusters, self.metric, self.data_producer.ndim)

        return stride