Example #1
def initialize_with_copying_topics(
        dataset: Dataset,
        model_number: int,
        num_topics: int,
        phi: pd.DataFrame,
        num_topics_to_copy: int = None,
        topic_indices_to_copy: List[int] = None) -> pd.DataFrame:

    random = np.random.RandomState(seed=model_number)

    if num_topics_to_copy is None and topic_indices_to_copy is None:
        raise ValueError(
            'Either `num_topics_to_copy` or `topic_indices_to_copy` should be specified!'
        )
    elif topic_indices_to_copy is None:
        topic_indices_to_copy = list(range(len(phi.columns)))
    elif num_topics_to_copy is None:
        num_topics_to_copy = len(topic_indices_to_copy)
    elif num_topics_to_copy != len(topic_indices_to_copy):
        raise ValueError(
            'If both `num_topics_to_copy` and `topic_indices_to_copy` are specified,'
            ' they shouldn\'t contradict each other!')
    # otherwise, both are specified and consistent: nothing to adjust

    topics_to_copy = random.choice(topic_indices_to_copy,
                                   size=num_topics_to_copy,
                                   replace=False)
    artm_model_template = _get_artm_model_template(dataset, num_topics)

    init_phi_utils._copy_phi(artm_model_template, phi.iloc[:, topics_to_copy])
    model_template = TopicModel(artm_model=artm_model_template)

    return model_template.get_phi()
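A possible call, for illustration (here `dataset` and `phi` are assumed to already exist; the argument values are made up):

new_phi = initialize_with_copying_topics(
    dataset=dataset,
    model_number=0,         # seeds the internal RandomState
    num_topics=20,          # number of topics in the new Phi template
    phi=phi,                # Phi matrix of some previously trained model
    num_topics_to_copy=5,   # 5 topic columns are sampled from `phi` at random
)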
Example #2
def _get_phi_template(dataset: Dataset, num_topics: int) -> pd.DataFrame:
    artm_model = _get_artm_model_template(dataset, num_topics)
    model = TopicModel(artm_model=artm_model)
    phi_template = model.get_phi()

    del model
    del artm_model

    return phi_template
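For example, one might grab a fresh, randomly initialized Phi of a given size like this (assuming a Dataset instance named `dataset`):

phi_template = _get_phi_template(dataset, num_topics=10)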
Example #3
def _fit_model_with_scores(topic_model: TopicModel,
                           dataset: Dataset,
                           scores: List[BaseScore] = None,
                           num_fit_iterations: int = 1):

    if scores is not None:
        for score in scores:
            score._attach(topic_model)

    topic_model._fit(dataset.get_batch_vectorizer(),
                     num_iterations=num_fit_iterations)
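A hypothetical call, assuming `topic_model`, `dataset`, and a list of score objects already exist (`some_score` is a placeholder name):

_fit_model_with_scores(
    topic_model,
    dataset,
    scores=[some_score],    # each score gets attached via its `_attach` method
    num_fit_iterations=10,
)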
Example #4
    def call(self, model: TopicModel):

        phi = model.get_phi(class_ids=[self.modality])
        V, T = phi.shape
        D = self.num_docs

        # TODO: consider the case of having vector of taus instead
        hyperparams = len(model.regularizers)

        # than2012 (https://link.springer.com/content/pdf/10.1007/978-3-642-33460-3_37.pdf)
        # argues that number of free parameters in LDA and sparse models (such as PLSA)
        # should be calculated differently
        if self.consider_sparsity:
            N_p = phi.astype(bool).sum().sum() + hyperparams
        else:
            N_p = (V - 1) * T + hyperparams

        ll = get_log_likelihood(model._model, self.modality)

        if self.mode == "MDL":
            return 0.5 * N_p * np.log(T * D) - ll
        if self.mode == "AIC":
            return 2 * N_p - 2 * ll
        if self.mode == "BIC":
            return N_p * np.log(D) - 2 * ll

        raise ValueError(
            f"Unsupported score type {self.mode}; Supported ones are: AIC/BIC/MDL"
        )
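Spelled out, with N_p free parameters, log-likelihood ll, T topics, and D documents, these are the standard model selection criteria: AIC = 2 * N_p - 2 * ll, BIC = N_p * ln(D) - 2 * ll, and MDL = 0.5 * N_p * ln(T * D) - ll; lower values are better in all three modes.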
Example #5
    def _load_phi(self, num_topics: int, subsample_number: int) -> pd.DataFrame:
        topic_model = TopicModel.load(
            self._folder_path_model(
                num_topics, subsample_number=subsample_number
            )
        )
        return topic_model.get_phi()
Example #6
    def call(self, model: TopicModel):
        theta = model.get_theta(dataset=self._dataset)
        phi = model.get_phi(class_ids=self.modalities)

        c_m1 = np.linalg.svd(phi, compute_uv=False)
        c_m2 = self.document_lengths.dot(theta.T)
        c_m2 += 0.0001  # we need this to prevent components equal to zero

        if len(c_m1) != phi.shape[1]:
            warnings.warn(
                f'Phi has {phi.shape[1]} topics'
                f' but its SVD resulted in a vector of size {len(c_m1)}!'
                f' To work correctly, SpectralDivergenceScore expects to get a vector'
                f' of exactly {phi.shape[1]} singular values.')

            return 1.0

        # we do not need to normalize these vectors
        return _symmetric_kl(c_m1, c_m2)
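The `_symmetric_kl` helper itself is defined elsewhere in the package; a minimal sketch of what a symmetrized KL divergence typically looks like (an illustrative assumption, not the package's actual implementation):

import numpy as np
from scipy.stats import entropy

def symmetric_kl_sketch(p, q):
    # normalize both vectors into valid probability distributions
    p = np.asarray(p, dtype=float) / np.sum(p)
    q = np.asarray(q, dtype=float) / np.sum(q)
    # scipy's entropy(p, q) computes KL(p || q)
    return 0.5 * entropy(p, q) + 0.5 * entropy(q, p)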
Example #7
    def call(self, model: TopicModel):
        phi = model.get_phi(class_ids=self.modalities)

        relevant_words = self._select_topwords(phi)

        loglift = self._compute_lift(phi, relevant_words)

        if self.topic_names is not None:
            topic_names = self.topic_names
        else:
            topic_names = model.topic_names

        total_loglift = loglift[topic_names]

        return float(total_loglift.mean())
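`_compute_lift` is not shown above; "lift" in topic modeling usually means p(w | t) / p(w), so a rough sketch of a log-lift computation could look like this (an illustrative assumption, not the actual helper):

import numpy as np

def compute_loglift_sketch(phi, relevant_words):
    # p(w): marginal token probability under uniform topic weights
    p_w = phi.mean(axis=1)
    # log-lift per topic: mean over words of log p(w | t) - log p(w)
    return np.log(
        phi.loc[relevant_words].div(p_w.loc[relevant_words], axis=0)
    ).mean(axis=0)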
Example #8
def _get_topic_model(dataset: Dataset,
                     phi: pd.DataFrame = None,
                     num_topics: int = None,
                     seed: int = None,
                     scores: List[BaseScore] = None,
                     num_safe_fit_iterations: int = 3,
                     num_processors: int = 3,
                     cache_theta: bool = False) -> TopicModel:

    dictionary = dataset.get_dictionary()

    if num_topics is not None and phi is not None:
        assert num_topics >= phi.shape[1]
    elif num_topics is None and phi is not None:
        num_topics = phi.shape[1]
    elif num_topics is None and phi is None:
        raise ValueError('Either `num_topics` or `phi` should be specified!')

    topic_names = [f'topic_{i}' for i in range(num_topics)]

    if seed is None:
        artm_model = artm.ARTM(topic_names=topic_names)
    else:
        artm_model = artm.ARTM(topic_names=topic_names, seed=seed)

    artm_model.num_processors = num_processors
    artm_model.initialize(dictionary)

    if phi is None:
        pass
    elif num_safe_fit_iterations is not None and num_safe_fit_iterations > 0:
        init_phi_utils._safe_copy_phi(artm_model, phi, dataset,
                                      num_safe_fit_iterations)
    else:
        init_phi_utils._copy_phi(artm_model, phi)

    topic_model = TopicModel(artm_model=artm_model,
                             model_id='0',
                             cache_theta=cache_theta,
                             theta_columns_naming='title')

    if scores is not None:
        for score in scores:
            score._attach(topic_model)

    return topic_model
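A sketch of how this constructor might be used (assuming `dataset` and an optional pre-trained `phi`; values are illustrative):

topic_model = _get_topic_model(
    dataset,
    phi=phi,         # topics from `phi` are copied into the new ARTM model
    num_topics=20,   # must be >= phi.shape[1] when both are specified
    seed=42,
)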
Example #9
    def _get_matrices(model: TopicModel) -> Tuple[np.ndarray, np.ndarray]:
        pwt = model.get_phi().values
        nwt = model._model.get_phi(model_name=model._model.model_nwt).values

        return pwt, nwt
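For context: in BigARTM, the default Phi returned by get_phi() is the normalized probability matrix p(w | t), while `model_nwt` refers to the internal matrix of raw token-topic counters n_wt, so this helper returns one matrix of each kind.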
Example #10
    def test_optimize_for_model(self, keep_in_memory, model_family):
        # Thetaless currently fails
        # see https://github.com/machine-intelligence-laboratory/TopicNet/issues/79

        artm_score_name = 'perplexity_score'
        artm_score = PerplexityScore(
            name=artm_score_name,
            class_ids=[self.main_modality, self.other_modality])

        custom_score_name = 'diversity_score'
        custom_score = DiversityScore(custom_score_name,
                                      class_ids=self.main_modality)

        self.text_collection._set_dataset_kwargs(keep_in_memory=keep_in_memory)

        min_num_topics = 1
        max_num_topics = 2
        num_topics_interval = 1
        num_fit_iterations = 3
        num_search_points = len(
            list(range(min_num_topics, max_num_topics + 1,
                       num_topics_interval)))
        num_restarts = 3
        experiment_name = model_family.value
        experiment_folder = self.working_folder_path

        optimizer = OptimizeScoresMethod(
            scores=[artm_score, custom_score],
            model_family=model_family,
            min_num_topics=min_num_topics,
            max_num_topics=max_num_topics,
            num_topics_interval=num_topics_interval,
            num_fit_iterations=num_fit_iterations,
            num_restarts=num_restarts,
            one_model_num_processors=1,
            separate_thread=False,
            experiment_name=experiment_name,
            experiment_directory=experiment_folder,
        )

        optimizer.search_for_optimum(text_collection=self.text_collection)
        restart_folder_names = os.listdir(experiment_folder)

        assert len(restart_folder_names) == num_restarts

        for restart_folder_name in restart_folder_names:
            assert restart_folder_name.startswith(experiment_name)

            restart_folder_path = os.path.join(experiment_folder,
                                               restart_folder_name)
            model_folder_names = os.listdir(restart_folder_path)

            assert len(model_folder_names) == num_search_points

            for model_folder_name in model_folder_names:
                topic_model = TopicModel.load(
                    os.path.join(restart_folder_path, model_folder_name))

                assert artm_score_name in topic_model.scores
                assert custom_score_name in topic_model.scores

                assert len(
                    topic_model.scores[artm_score_name]) == num_fit_iterations
                assert len(topic_model.scores[custom_score_name]) == 1

                assert all(
                    isinstance(v, Number)
                    for v in topic_model.scores[artm_score_name])
                assert all(
                    isinstance(v, Number)
                    for v in topic_model.scores[custom_score_name])
Example #11
    def call(self, model: TopicModel):
        theta = model.get_theta(dataset=self._dataset)
        T = theta.shape[0]

        return _compute_kl(T, theta, self.document_lengths)
Example #12
    def _train_models(
            self,
            text_collection: VowpalWabbitTextCollection,
            min_df_rate: float,
            max_df_rate: float,
            num_topics_to_skip: List[int] = None) -> None:

        modalities_to_use = list(text_collection._modalities.keys())
        main_modality = text_collection._main_modality

        numbers_of_topics = list(range(
            self._min_num_topics,
            self._max_num_topics + 1,
            self._num_topics_interval))

        if num_topics_to_skip is not None:
            numbers_of_topics = [
                n for n in numbers_of_topics if n not in num_topics_to_skip
            ]

        num_topics_for_message = ', '.join(
            [str(n) for n in numbers_of_topics[:10]]
        )

        print(f'\nFolder for saving models: "{self._models_folder_path}"')
        print(
            f'Training models for {len(numbers_of_topics)}'
            f' numbers of topics: {num_topics_for_message}...'
        )

        for num_topics in tqdm(
                numbers_of_topics,
                total=len(numbers_of_topics),
                file=sys.stdout):

            os.makedirs(
                self._folder_path_num_topics(num_topics)
            )

            subsample_data_paths = self._get_dataset_subsample_file_paths()

            for subsample_number, data_path in tqdm(
                    enumerate(subsample_data_paths),
                    total=len(subsample_data_paths),
                    file=sys.stdout):

                dataset = Dataset(data_path=data_path)

                dictionary = dataset.get_dictionary()
                dictionary.filter(
                    min_df_rate=min_df_rate,
                    max_df_rate=max_df_rate,
                )

                artm_model = init_model_from_family(
                    family=self._model_family,
                    dataset=dataset,
                    modalities_to_use=modalities_to_use,
                    main_modality=main_modality,
                    num_topics=num_topics,
                    seed=self._model_seed,
                    num_processors=self._model_num_processors,
                    model_params=self._model_params,
                )
                topic_model = TopicModel(artm_model)

                topic_model._fit(
                    dataset_trainable=dataset.get_batch_vectorizer(),
                    num_iterations=self._num_fit_iterations,
                )

                model_save_path = self._folder_path_model(num_topics, subsample_number)
                topic_model.save(
                    model_save_path=model_save_path,
                    phi=True,
                    theta=False,
                )
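To sum up the loop above: for every requested number of topics, one model is trained per dataset subsample, and each trained model is saved (Phi only, Theta is skipped) under a folder derived from the (num_topics, subsample_number) pair, which is exactly what `_load_phi` from Example #5 later reads back.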
Example #13
    def compute(self,
                model: TopicModel,
                topics: List[str] = None,
                documents: List[str] = None) -> Dict[str, float]:

        phi = model.get_phi()

        if topics is not None:
            pass
        elif self._topics is not None:
            topics = self._topics
        else:
            topics = list(phi.columns)

        if self._modalities is not None:
            # since self._modalities is a list, the result is always a DataFrame with a MultiIndex
            subphi = phi.loc[self._modalities, topics]
        else:
            subphi = phi.loc[:, topics]

        vocabulary_size = subphi.shape[0]

        topic_coherences = dict()

        if self._active_topic_threshold is None:
            pass
        else:
            # TODO: can't do without transform here, cache theta didn't help
            theta = model._model.transform(
                self._dataset.get_batch_vectorizer())
            subtheta_values = theta.loc[topics, :].values
            max_probs = np.max(subtheta_values, axis=1)
            active_topic_indices = np.where(
                max_probs > self._active_topic_threshold)[0]
            topics = [
                t for i, t in enumerate(topics) if i in active_topic_indices
            ]

        for topic in topics:
            topic_column = subphi.loc[:, topic]

            if not self._kernel:
                tokens = topic_column\
                    .sort_values(ascending=False)[:self._num_top_tokens]\
                    .index\
                    .get_level_values(1)\
                    .to_list()
            else:
                # if self._num_top_tokens is None, slicing still works and takes all tokens
                tokens = topic_column[topic_column > 1.0 / vocabulary_size][:self._num_top_tokens]\
                    .index\
                    .get_level_values(1)\
                    .to_list()

            current_cooc_values = list()

            for token_a, token_b in combinations(tokens, 2):
                if (token_a, token_b) in self._cooc_values:
                    current_cooc_values.append(self._cooc_values[(token_a,
                                                                  token_b)])
                elif (token_b, token_a) in self._cooc_values:
                    current_cooc_values.append(self._cooc_values[(token_b,
                                                                  token_a)])
                else:
                    _logger.warning(
                        f'Cooc pair "{token_a}, {token_b}" not found in the provided data!'
                        f' Using 0 for this pair as the cooc value')

                    current_cooc_values.append(0)

            if len(current_cooc_values) > 0:
                topic_coherences[topic] = float(np.mean(current_cooc_values))
            else:
                # TODO: warn?
                topic_coherences[topic] = 0.0

        return topic_coherences
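A note on the data structure: `self._cooc_values` is expected to be a dict keyed by token pairs, and since a pair may be stored in either order, the loop above looks up both `(token_a, token_b)` and `(token_b, token_a)` before falling back to zero.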
Example #14
    def _get_matrices(self, model: TopicModel) -> Tuple[np.ndarray, np.ndarray]:
        pwt = model.get_phi(class_ids=self._class_ids).values
        nwt = model._model.get_phi(model_name=model._model.model_nwt).values

        return pwt, nwt