def save_model_topics(
            self,
            name: str,
            model: TopicModel,
            topic_scores: List[Dict[str, float]] = None,
            phi: pd.DataFrame = None,
            dataset: Dataset = None,
            theta: bool = False) -> None:
        """Serialize a model's phi, topic scores and (optionally) theta with dill.

        Files are written into ``self._path`` in this order:
        ``{name}__phi.bin``, ``{name}__theta.bin`` (only when requested and
        obtainable) and ``{name}__topic_scores.bin``.

        Parameters
        ----------
        name
            Prefix of the produced file names.
        model
            Source of phi (when `phi` is not given) and of theta.
        topic_scores
            Scores to store. When omitted, an empty dict is stored.
        phi
            Object to store as phi. Defaults to ``model.get_phi()``.
        dataset
            Forwarded to ``model.get_theta`` when `theta` is true.
        theta
            Whether to also store ``model.get_theta(dataset=dataset)``.
            If that call raises ``ValueError``, no theta file is written
            and a warning is emitted; the remaining files are still saved.
        """
        import warnings

        def dump(suffix, obj):
            # All artifacts share the '<name>__<suffix>.bin' naming scheme.
            file_path = os.path.join(self._path, f'{name}__{suffix}.bin')

            with open(file_path, 'wb') as f:
                dill.dump(obj, f)

        if phi is None:
            phi = model.get_phi()

        dump('phi', phi)

        if theta:
            try:
                theta_matrix = model.get_theta(dataset=dataset)
            except ValueError as error:
                # Saving theta is best-effort: keep going, but tell the
                # caller that the requested file was not produced.
                warnings.warn(
                    f'Theta of "{name}" was not saved'
                    f' because it could not be obtained: {error}',
                    stacklevel=2)
            else:
                dump('theta', theta_matrix)

        if topic_scores is None:
            # NOTE(review): the annotation says List[Dict[str, float]], yet the
            # stored default is a dict. Kept as is, because readers of the file
            # may rely on it -- confirm against the loading code.
            topic_scores = dict()

        dump('topic_scores', topic_scores)
Esempio n. 2
0
    def call(self, model: TopicModel):
        """Compute the Calinski-Harabasz score of a clustering derived from theta.

        Every column of theta is treated as one object; its cluster label is
        the index of the row holding the column's largest value (theta is
        presumably topics x documents -- confirm against ``get_theta``).

        Parameters
        ----------
        model
            Model whose theta is requested for ``self._dataset``.

        Returns
        -------
        float
            Result of ``calinski_harabasz_score``, or NaN (with a logged
            warning) when all objects end up in the same cluster.
        """
        theta = model.get_theta(dataset=self._dataset)

        # Only positional values are read, so the frame returned by the model
        # (which may be shared with it) is never relabelled or modified here.
        objects_clusters = theta.values.argmax(axis=0)

        # TODO: or return some numeric?
        if len(set(objects_clusters)) == 1:
            _Logger.warning(
                'Only one unique cluster! Returning NaN as score value')

            return float('nan')

        return calinski_harabasz_score(theta.T.values, objects_clusters)
Esempio n. 3
0
    def call(self, model: TopicModel):
        """Compute a sampled silhouette score of a clustering derived from theta.

        Every column of theta is treated as one object; its cluster label is
        the index of the row holding the column's largest value (theta is
        presumably topics x documents -- confirm against ``get_theta``).

        Parameters
        ----------
        model
            Model whose theta is requested for ``self._dataset``.

        Returns
        -------
        float
            Result of ``_silhouette_score_by_sampling`` called with
            ``self.sample_size`` and ``self.batches_number``, or NaN (with a
            logged warning) when all objects end up in the same cluster.
        """
        theta = model.get_theta(dataset=self._dataset)

        # Only positional values are read, so the frame returned by the model
        # (which may be shared with it) is never relabelled or modified here.
        objects_clusters = theta.values.argmax(axis=0)

        # TODO: or return some numeric?
        if len(set(objects_clusters)) == 1:
            _Logger.warning(
                'Only one unique cluster! Returning NaN as score value')

            return float('nan')

        return _silhouette_score_by_sampling(
            theta.T.values,
            objects_clusters,
            sample_size=self.sample_size,
            batches_number=self.batches_number)
    def call(self, model: TopicModel):
        """Compare phi's singular values with a length-weighted theta vector.

        Two vectors are built: the singular values of phi (restricted to
        ``self.modalities``) and the product of ``self.document_lengths``
        with the transposed theta. Their ``_symmetric_kl`` value is the score.

        Parameters
        ----------
        model
            Model providing theta (for ``self._dataset``) and phi.

        Returns
        -------
        Value returned by ``_symmetric_kl``, or ``1.0`` (with a warning) when
        the number of singular values differs from ``phi.shape[1]``.
        """
        theta = model.get_theta(dataset=self._dataset)
        phi = model.get_phi(class_ids=self.modalities)

        singular_values = np.linalg.svd(phi, compute_uv=False)

        weighted_theta = self.document_lengths.dot(theta.T)
        # we need this to prevent components equal to zero
        weighted_theta += 0.0001

        expected_size = phi.shape[1]

        if len(singular_values) == expected_size:
            # we do not need to normalize these vectors
            return _symmetric_kl(singular_values, weighted_theta)

        # The two vectors are only comparable when the SVD yields exactly one
        # singular value per column of phi; otherwise fall back to a constant.
        warnings.warn(
            f'Phi has {phi.shape[1]} topics'
            f' but its SVD resulted in a vector of size {len(singular_values)}!'
            f' To work correctly, SpectralDivergenceScore expects to get a vector'
            f' of exactly {phi.shape[1]} singular values.')

        return 1.0
Esempio n. 5
0
    def call(self, model: TopicModel):
        """Delegate scoring of the model's theta to ``_compute_kl``.

        Parameters
        ----------
        model
            Model whose theta is requested for ``self._dataset``.

        Returns
        -------
        Whatever ``_compute_kl`` returns for the number of theta rows, theta
        itself and ``self.document_lengths``.
        """
        theta_matrix = model.get_theta(dataset=self._dataset)

        # First dimension of theta; presumably the number of topics.
        rows_count = theta_matrix.shape[0]

        return _compute_kl(rows_count, theta_matrix, self.document_lengths)