    def call(self, model: TopicModel):

        phi = model.get_phi(class_ids=[self.modality])
        V, T = phi.shape
        D = self.num_docs

        # TODO: consider the case of having vector of taus instead
        hyperparams = len(model.regularizers)

        # than2012 (https://link.springer.com/content/pdf/10.1007/978-3-642-33460-3_37.pdf)
        # argues that the number of free parameters in LDA and in sparse models (such as PLSA)
        # should be calculated differently
        if self.consider_sparsity:
            N_p = phi.astype(bool).sum().sum() + hyperparams
        else:
            N_p = (V - 1) * T + hyperparams

        ll = get_log_likelihood(model._model, self.modality)

        if self.mode == "MDL":
            return 0.5 * N_p * np.log(T * D) - ll
        if self.mode == "AIC":
            return 2 * N_p - 2 * ll
        if self.mode == "BIC":
            return N_p * np.log(D) - 2 * ll

        raise ValueError(
            f"Unsupported score type {self.mode}; Supported ones are: AIC/BIC/MDL"
        )
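
For reference, the three criteria computed above, with N_p free parameters, log-likelihood ll, T topics and D documents:

    AIC = 2 * N_p - 2 * ll
    BIC = N_p * ln(D) - 2 * ll
    MDL = 0.5 * N_p * ln(T * D) - ll
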
Example #2
    def save_model_topics(
            self,
            name: str,
            model: TopicModel,
            topic_scores: List[Dict[str, float]] = None,
            phi: pd.DataFrame = None,
            dataset: Dataset = None,
            theta: bool = False) -> None:

        if phi is None:
            phi = model.get_phi()

        with open(os.path.join(self._path, f'{name}__phi.bin'), 'wb') as f:
            f.write(dill.dumps(phi))

        if theta:
            try:
                theta = model.get_theta(dataset=dataset)
            except ValueError:
                pass
            else:
                with open(os.path.join(self._path, f'{name}__theta.bin'), 'wb') as f:
                    f.write(dill.dumps(theta))

        if topic_scores is None:
            topic_scores = dict()

        with open(os.path.join(self._path, f'{name}__topic_scores.bin'), 'wb') as f:
            f.write(dill.dumps(topic_scores))
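
A hedged round-trip sketch: the matrices saved above can be restored with dill (`experiment_path` and `model_name` are illustrative placeholders, not names from this code):

    import os
    import dill

    # hypothetical values, for illustration only
    experiment_path = '/tmp/experiment'
    model_name = 'model_0'

    with open(os.path.join(experiment_path, f'{model_name}__phi.bin'), 'rb') as f:
        phi = dill.loads(f.read())
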
Example #3
def initialize_with_copying_topics(
        dataset: Dataset,
        model_number: int,
        num_topics: int,
        phi: pd.DataFrame,
        num_topics_to_copy: int = None,
        topic_indices_to_copy: List[int] = None) -> pd.DataFrame:

    random = np.random.RandomState(seed=model_number)

    if num_topics_to_copy is None and topic_indices_to_copy is None:
        raise ValueError(
            'Either `num_topics_to_copy` or `topic_indices_to_copy` should be specified!'
        )
    elif topic_indices_to_copy is None:
        topic_indices_to_copy = list(range(len(phi.columns)))
    elif num_topics_to_copy is None:
        num_topics_to_copy = len(topic_indices_to_copy)
    elif num_topics_to_copy != len(topic_indices_to_copy):
        raise ValueError(
            'If both `num_topics_to_copy` and `topic_indices_to_copy` are specified,'
            ' they shouldn\'t contradict each other!'
        )

    topics_to_copy = random.choice(topic_indices_to_copy,
                                   size=num_topics_to_copy,
                                   replace=False)
    artm_model_template = _get_artm_model_template(dataset, num_topics)

    init_phi_utils._copy_phi(artm_model_template, phi.iloc[:, topics_to_copy])
    model_template = TopicModel(artm_model=artm_model_template)

    return model_template.get_phi()
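
A hypothetical usage sketch (`dataset` is assumed to be a topicnet Dataset and `trained_phi` a Phi DataFrame from an earlier model): copy 5 randomly chosen topics from `trained_phi` into a fresh 20-topic initialization.

    new_phi = initialize_with_copying_topics(
        dataset=dataset,          # assumed: an existing Dataset
        model_number=0,           # also seeds the random topic choice
        num_topics=20,
        phi=trained_phi,          # assumed: Phi of a previously trained model
        num_topics_to_copy=5,
    )
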
Example #4
def _get_phi_template(dataset: Dataset, num_topics: int) -> pd.DataFrame:
    artm_model = _get_artm_model_template(dataset, num_topics)
    model = TopicModel(artm_model=artm_model)
    phi_template = model.get_phi()

    del model
    del artm_model

    return phi_template
Example #5
def get_phi_index(dataset: Dataset) -> Index:
    artm_model_template = artm.ARTM(num_topics=1, num_processors=1)
    artm_model_template.initialize(dictionary=dataset.get_dictionary())
    model_template = TopicModel(artm_model=artm_model_template)
    phi_template = model_template.get_phi()
    phi_index = phi_template.index

    del model_template
    del artm_model_template

    return phi_index
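
A hedged usage sketch (`dataset` is again assumed to be a topicnet Dataset): the returned index is the vocabulary of a Phi matrix for that dataset, so it can be used to pre-allocate Phi-shaped DataFrames without training anything.

    import pandas as pd

    phi_index = get_phi_index(dataset)
    empty_phi = pd.DataFrame(
        data=0.0,
        index=phi_index,
        columns=[f'topic_{i}' for i in range(10)],
    )
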
Example #6
    def call(self, model: TopicModel):
        phi = model.get_phi(class_ids=self.modalities)

        relevant_words = self._select_topwords(phi)

        loglift = self._compute_lift(phi, relevant_words)

        if self.topic_names is not None:
            topic_names = self.topic_names
        else:
            topic_names = model.topic_names

        total_loglift = loglift[topic_names]

        return float(total_loglift.mean())
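
A hedged reading (the helpers `_select_topwords` and `_compute_lift` are not shown in this snippet): judging by the names, `loglift` presumably holds, per topic, the average log-lift of its top words, with

    lift(w, t) = p(w | t) / p(w)

so the score returns the mean log-lift over the selected topics.
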
Example #7
def _copy_phi(model: artm.ARTM,
              phi: pd.DataFrame,
              phi_ref: np.ndarray = None) -> np.ndarray:
    model_wrapper = TopicModel(artm_model=model)
    base_phi_index = model_wrapper.get_phi().index

    # TODO: faster?
    source_indices = list(phi.index)
    target_indices = list()
    found_indices = list()
    not_found_indices = list()
    not_found_indices_fraction_threshold = 0.5

    for index in source_indices:
        try:
            target_index = base_phi_index.get_loc(index)
        except KeyError:
            not_found_indices.append(index)
        else:
            target_indices.append(target_index)
            found_indices.append(index)

    if len(not_found_indices) == 0:
        pass
    elif len(not_found_indices) < not_found_indices_fraction_threshold * len(source_indices):
        warnings.warn(
            f'{100 * len(not_found_indices) / (1e-7 + len(source_indices)):.1f}% of the words'
            f' (i.e. {len(not_found_indices)} words)'
            f' in the given Phi matrix'
            f' were not found in the model\'s Phi matrix'
        )
    else:
        raise RuntimeError(
            f'At least {not_found_indices_fraction_threshold * 100}% of the words'
            f' in the given Phi matrix ({len(source_indices)} words) were not found'
            f' in the model\'s Phi matrix ({len(base_phi_index)} words)!'
            f' Initialization under such circumstances seems inadvisable'
        )

    _logger.debug('Attaching pwt and copying')

    if phi_ref is None:
        (_, phi_ref) = model.master.attach_model(model=model.model_pwt)

    phi_ref[target_indices, :phi.shape[1]] = phi.loc[found_indices, :].values

    return phi_ref
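
A hedged usage sketch (assuming `artm_model` is an artm.ARTM instance already initialized with a dictionary compatible with `pretrained_phi`):

    phi_ref = _copy_phi(artm_model, pretrained_phi)

The returned array is the matrix attached to the model's pwt, so the copied values take effect inside the ARTM model itself.
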
Example #8
    def call(self, model: TopicModel):
        theta = model.get_theta(dataset=self._dataset)
        phi = model.get_phi(class_ids=self.modalities)

        c_m1 = np.linalg.svd(phi, compute_uv=False)
        c_m2 = self.document_lengths.dot(theta.T)
        c_m2 += 0.0001  # small constant to avoid zero components (KL is undefined at zero)

        if len(c_m1) != phi.shape[1]:
            warnings.warn(
                f'Phi has {phi.shape[1]} topics'
                f' but its SVD resulted in a vector of size {len(c_m1)}!'
                f' To work correctly, SpectralDivergenceScore expects to get a vector'
                f' of exactly {phi.shape[1]} singular values.')

            return 1.0

        # we do not need to normalize these vectors
        return _symmetric_kl(c_m1, c_m2)
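
Note: this appears to be the Arun et al. (2010) style spectral divergence: `c_m1` holds the singular values of Phi, `c_m2` the topic mass induced by document lengths and Theta, and a smaller symmetric KL between the two is taken as evidence of a better-fitting number of topics.
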
Example #9
    def call(self, model: TopicModel):
        phi = model.get_phi(class_ids=self._class_ids)

        if self._metric == "hellinger":
            matrix = np.sqrt(phi.T)
            condensed_distances = pdist(matrix,
                                        metric='euclidean') / np.sqrt(2)
        else:
            condensed_distances = pdist(phi.T, metric=self._metric)

        if self.closest:
            df = pd.DataFrame(index=phi.columns,
                              columns=phi.columns,
                              data=squareform(condensed_distances))
            # the diagonal is all zeros (self-distances): exclude it from min()
            np.fill_diagonal(df.values, float("inf"))

            return df.min(axis=0).mean()

        return condensed_distances.mean()
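
The Hellinger branch relies on the identity H(p, q) = ||sqrt(p) - sqrt(q)||_2 / sqrt(2), which is why it can be delegated to a plain Euclidean pdist over the element-wise square roots of the topic columns.
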
Example #10
    def _get_matrices(model: TopicModel) -> Tuple[np.ndarray, np.ndarray]:
        pwt = model.get_phi().values
        nwt = model._model.get_phi(model_name=model._model.model_nwt).values

        return pwt, nwt
Example #11
    def compute(self,
                model: TopicModel,
                topics: List[str] = None,
                documents: List[str] = None) -> Dict[str, float]:

        phi = model.get_phi()

        if topics is not None:
            pass
        elif self._topics is not None:
            topics = self._topics
        else:
            topics = list(phi.columns)

        if self._modalities is not None:
            # since self._modalities is a list, the result is always
            # a DataFrame with a MultiIndex
            subphi = phi.loc[self._modalities, topics]
        else:
            subphi = phi.loc[:, topics]

        vocabulary_size = subphi.shape[0]

        topic_coherences = dict()

        if self._active_topic_threshold is not None:
            # TODO: can't do without transform here; caching theta didn't help
            theta = model._model.transform(
                self._dataset.get_batch_vectorizer())
            subtheta_values = theta.loc[topics, :].values
            max_probs = np.max(subtheta_values, axis=1)
            active_topic_indices = np.where(
                max_probs > self._active_topic_threshold)[0]
            topics = [
                t for i, t in enumerate(topics) if i in active_topic_indices
            ]

        for topic in topics:
            topic_column = subphi.loc[:, topic]

            if not self._kernel:
                tokens = topic_column\
                    .sort_values(ascending=False)[:self._num_top_tokens]\
                    .index\
                    .get_level_values(1)\
                    .to_list()
            else:
                # if self._num_top_tokens is None, the slice keeps all kernel tokens
                tokens = topic_column[topic_column > 1.0 / vocabulary_size][:self._num_top_tokens]\
                    .index\
                    .get_level_values(1)\
                    .to_list()

            current_cooc_values = list()

            for token_a, token_b in combinations(tokens, 2):
                if (token_a, token_b) in self._cooc_values:
                    current_cooc_values.append(self._cooc_values[(token_a,
                                                                  token_b)])
                elif (token_b, token_a) in self._cooc_values:
                    current_cooc_values.append(self._cooc_values[(token_b,
                                                                  token_a)])
                else:
                    _logger.warning(
                        f'Cooc pair "{token_a}, {token_b}" not found in the provided data!'
                        f' Using 0 as the cooc value for this pair')

                    current_cooc_values.append(0)

            if len(current_cooc_values) > 0:
                topic_coherences[topic] = float(np.mean(current_cooc_values))
            else:
                # TODO: warn?
                topic_coherences[topic] = 0.0

        return topic_coherences
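
In short: each topic's coherence is the mean pairwise co-occurrence value (looked up in `self._cooc_values` in either token order) over its top or kernel tokens, with missing pairs counted as 0.
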
Example #12
    def _get_matrices(self, model: TopicModel) -> Tuple[np.ndarray, np.ndarray]:
        pwt = model.get_phi(class_ids=self._class_ids).values
        nwt = model._model.get_phi(model_name=model._model.model_nwt).values

        return pwt, nwt
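
For reference: `pwt` is the model's normalized p(w | t) matrix, while `nwt` (taken from `model_nwt`) holds the raw n_wt counters that BigARTM accumulates during fitting.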