def get_theta(phi: np.ndarray, dataset: Dataset) -> pd.DataFrame:
    """
    Compute a theta (document-topic) matrix for `dataset` given a fixed phi.

    A throwaway ARTM model is created and fitted offline for one iteration
    (so that its internal phi storage exists), then that storage is
    overwritten in place with the provided `phi`, and theta is obtained
    via `transform`.

    Parameters
    ----------
    phi
        Topic-word matrix with one topic per column; its number of columns
        defines the number of topics of the helper model.
        (Fixed annotation: `np.array` is a factory function, not a type —
        the correct type is `np.ndarray`.)
    dataset
        Source of the dictionary and batch vectorizer.

    Returns
    -------
    pd.DataFrame
        Theta matrix as returned by `artm_model.transform`.
    """
    artm_model = artm.ARTM(num_topics=phi.shape[1])
    artm_model.initialize(dataset.get_dictionary())

    # One offline pass is required before phi can be attached and modified
    artm_model.fit_offline(dataset.get_batch_vectorizer(), 1)

    # Attach a writable view of the model's phi matrix and overwrite it
    (_, phi_ref) = artm_model.master.attach_model(model=artm_model.model_pwt)
    np.copyto(phi_ref, phi)

    return artm_model.transform(dataset.get_batch_vectorizer())
def regularization_train_func(dataset: Dataset,
                              model_number: int,
                              num_topics: int,
                              num_fit_iterations: int,
                              scores: List[BaseScore] = None,
                              decorrelating_tau: float = 10**5,
                              smoothing_tau: float = 1e-5,
                              sparsing_tau: float = -0.01,
                              **kwargs) -> TopicModel:
    """
    Train a topic model with a two-phase regularization schedule.

    Phase 1 (~75% of iterations): decorrelation (strong `decorrelating_tau`)
    plus weak per-topic smoothing (`smoothing_tau`).
    Phase 2 (the remainder): all phase-1 regularizer taus are reset to zero
    and a sparsing regularizer (`sparsing_tau`, negative tau) takes over.
    The final iteration is run with `scores` attached.

    Parameters
    ----------
    dataset
        Data to train on.
    model_number
        Used as the random seed of the model.
    num_topics
        Number of topics.
    num_fit_iterations
        Total number of fit iterations across both phases.
    scores
        Scores attached only for the last fit iteration.
    decorrelating_tau, smoothing_tau, sparsing_tau
        Regularizer coefficients for the respective phases.
    kwargs
        Forwarded to `_get_topic_model` (e.g. `cache_theta`, `num_processors`).
    """
    topic_model = _get_topic_model(
        dataset,
        num_topics=num_topics,
        seed=model_number,
        **kwargs,
    )

    # Phase-1 regularizers: one decorrelator over all topics,
    # plus a separate weak smoother per topic
    topic_model._model.regularizers.add(
        artm.regularizers.DecorrelatorPhiRegularizer(tau=decorrelating_tau))

    for topic_name in list(topic_model.get_phi().columns):
        topic_model._model.regularizers.add(
            artm.regularizers.SmoothSparsePhiRegularizer(
                tau=smoothing_tau, topic_names=topic_name))

    # The last iteration is reserved for fitting with scores attached
    num_fit_iterations_with_scores = 1
    first_num_fit_iterations = int(
        0.75 * (num_fit_iterations - num_fit_iterations_with_scores))
    second_num_fit_iterations = (num_fit_iterations
                                 - num_fit_iterations_with_scores
                                 - first_num_fit_iterations)

    topic_model._fit(dataset.get_batch_vectorizer(),
                     num_iterations=first_num_fit_iterations)

    # Switch off every phase-1 regularizer before the sparsing phase
    for regularizer_name in topic_model._model.regularizers.data:
        topic_model._model.regularizers[regularizer_name].tau = 0

    topic_model._model.regularizers.add(
        artm.regularizers.SmoothSparsePhiRegularizer(tau=sparsing_tau))

    # NOTE(review): `num_fit_iterations_with_scores` is subtracted here a
    # second time (it is already excluded from `second_num_fit_iterations`),
    # so the total number of iterations is `num_fit_iterations - 1`.
    # Possibly intentional, possibly an off-by-one — TODO confirm.
    topic_model._fit(dataset.get_batch_vectorizer(),
                     num_iterations=max(
                         0,
                         second_num_fit_iterations - num_fit_iterations_with_scores))

    _fit_model_with_scores(topic_model,
                           dataset,
                           scores,
                           num_fit_iterations=num_fit_iterations_with_scores)

    return topic_model
def magic_clutch():
    """
    Workaround: instantiate every score class once on a throwaway dataset.

    If a score class is not initialized at least once in the notebook/session,
    it cannot be loaded later — so this touches each class and then removes
    the temporary dataset internals folder.
    """
    test_dataset = None

    try:
        # Just some dataset, whatever
        test_dataset = Dataset(
            data_path=os.path.join(os.path.dirname(topicnet.__file__),
                                   'tests', 'test_data', 'test_dataset.csv'),
            internals_folder_path=tempfile.mkdtemp(prefix='magic_clutch__'))

        # If a new score is not initialized at least once in the notebook,
        # it won't be possible to load it
        _ = HoldoutPerplexityScore(
            '',
            test_dataset,
        )
        _ = MeanLiftScore('', test_dataset, [])
        _ = UniformThetaDivergenceScore('', test_dataset, [])
        _ = build_every_score(test_dataset, test_dataset, {"word": "@word"})
        _ = IntratextCoherenceScore("jbi", test_dataset)
        _ = SophisticatedTopTokensCoherenceScore("sds", test_dataset)
    finally:
        # Best-effort cleanup of the temporary internals folder
        if test_dataset is not None and os.path.isdir(
                test_dataset._internals_folder_path):
            shutil.rmtree(test_dataset._internals_folder_path)
def specific_initial_phi_train_func(dataset: Dataset,
                                    model_number: int,
                                    num_topics: int,
                                    num_fit_iterations: int,
                                    scores: List[BaseScore] = None,
                                    initialize_phi_func: Callable[
                                        [Dataset, int, int],
                                        pd.DataFrame] = None,
                                    **kwargs) -> TopicModel:
    """
    Train a topic model whose phi matrix starts from a custom initialization.

    `initialize_phi_func(dataset, model_number, num_topics)` produces the
    initial phi (random initialization by default); the model is then fitted
    for all but the last iteration, and the final iteration runs with
    `scores` attached.
    """
    if initialize_phi_func is None:
        initialize_phi_func = initialize_phi_funcs.initialize_randomly

    model = _get_topic_model(
        dataset,
        num_topics=num_topics,
        seed=model_number,
        **kwargs,
    )

    starting_phi = initialize_phi_func(dataset, model_number, num_topics)
    init_phi_utils._copy_phi(model._model, starting_phi)

    num_score_iterations = 1
    num_plain_iterations = max(0, num_fit_iterations - num_score_iterations)

    model._fit(dataset.get_batch_vectorizer(),
               num_iterations=num_plain_iterations)
    _fit_model_with_scores(model,
                           dataset,
                           scores,
                           num_fit_iterations=num_score_iterations)

    return model
def default_train_func(dataset: Dataset,
                       model_number: int,
                       num_topics: int,
                       num_fit_iterations: int,
                       scores: List[BaseScore] = None,
                       **kwargs) -> TopicModel:
    """
    Train a plain topic model: no regularizers, default random initialization.

    All but the last fit iteration run without scores; the final iteration
    runs with `scores` attached.

    Additional Parameters
    ---------------------
    kwargs
        Some params for `_get_topic_model`,
        such as `cache_theta` and `num_processors`
    """
    model = _get_topic_model(
        dataset,
        num_topics=num_topics,
        seed=model_number,
        **kwargs,
    )

    num_score_iterations = 1
    num_plain_iterations = max(0, num_fit_iterations - num_score_iterations)

    model._fit(dataset.get_batch_vectorizer(),
               num_iterations=num_plain_iterations)
    _fit_model_with_scores(model,
                           dataset,
                           scores,
                           num_fit_iterations=num_score_iterations)

    return model
def load(cls, path: str):
    """
    Restore a pickled score from `path`.

    The dataset itself is not pickled; it is re-created from the file path,
    internals folder, and memory flag stored on the score.
    """
    with open(path, 'rb') as f:
        loaded_score = dill.load(f)

    loaded_score._dataset = Dataset(
        loaded_score._dataset_file_path,
        internals_folder_path=loaded_score._dataset_internals_folder_path,
        keep_in_memory=loaded_score._keep_dataset_in_memory,
    )

    return loaded_score
def _fit_model_with_scores(topic_model: TopicModel,
                           dataset: Dataset,
                           scores: List[BaseScore] = None,
                           num_fit_iterations: int = 1):
    """
    Attach `scores` (if any) to `topic_model`, then fit for a few iterations.
    """
    for score in (scores or []):
        score._attach(topic_model)

    topic_model._fit(
        dataset.get_batch_vectorizer(),
        num_iterations=num_fit_iterations,
    )
def setup_class(cls):
    """
    Build the shared test fixtures: a mock model and an on-disk dataset.
    """
    cls.model = _MockModel(cls.create_phi())

    dataset_table = cls.create_dataset_table(cls.create_documents())

    cls.data_folder_path = tempfile.mkdtemp()
    cls.dataset_file_path = os.path.join(
        cls.data_folder_path,
        DATASET_FILE_NAME,
    )
    dataset_table.to_csv(cls.dataset_file_path, index=False)

    cls.dataset = Dataset(cls.dataset_file_path)
def split_into_train_test(dataset: Dataset, config: dict, save_folder: str = None):
    """
    Split `dataset` into train (80%) and test (20%) parts and save both as
    new Dataset objects.

    Documents are shuffled with a fixed seed (123) for reproducibility.
    The resulting CSV files and internals folders are named after
    `config['batches_prefix']`.

    Parameters
    ----------
    dataset
        Dataset to split.
    config
        Only `config['batches_prefix']` is used.
    save_folder
        Folder for the output CSVs; current directory if None
        (created if missing).

    Returns
    -------
    (train_dataset, test_dataset)
    """
    # TODO: no need for `config` here, just `batches_prefix`
    documents = list(dataset._data.index)
    dn = config['batches_prefix']

    random = np.random.RandomState(seed=123)
    random.shuffle(documents)

    test_size = 0.2

    # BUG FIX: the fraction must be applied as (1 - test_size) * len(...).
    # The original `int(1.0 - test_size * len(documents))` computed a
    # negative (or zero) slice bound due to operator precedence.
    train_documents = documents[:int((1.0 - test_size) * len(documents))]
    test_documents = documents[len(train_documents):]

    assert len(train_documents) + len(test_documents) == len(documents)

    # TODO: test with keep_in_memory = False just in case
    train_data = dataset._data.loc[train_documents]
    test_data = dataset._data.loc[test_documents]

    train_data['id'] = train_data.index
    test_data['id'] = test_data.index

    to_csv_kwargs = dict()

    # For big (dask-backed) data, write a single CSV file
    if not dataset._small_data:
        to_csv_kwargs['single_file'] = True

    if save_folder is None:
        save_folder = '.'
    elif not os.path.isdir(save_folder):
        os.mkdir(save_folder)

    train_dataset_path = os.path.join(save_folder, f'{dn}_train.csv')
    test_dataset_path = os.path.join(save_folder, f'{dn}_test.csv')

    train_data.to_csv(train_dataset_path, index=False, **to_csv_kwargs)
    test_data.to_csv(test_dataset_path, index=False, **to_csv_kwargs)

    train_dataset = Dataset(
        train_dataset_path,
        batch_vectorizer_path=f'{dn}_train_internals',
        keep_in_memory=dataset._small_data,
    )
    test_dataset = Dataset(
        test_dataset_path,
        batch_vectorizer_path=f'{dn}_test_internals',
        keep_in_memory=dataset._small_data,
    )

    # TODO: quick hack, i'm not sure what for
    test_dataset._to_dataset = lambda: test_dataset
    train_dataset._to_dataset = lambda: train_dataset

    return train_dataset, test_dataset
def _get_topic_model(dataset: Dataset,
                     phi: pd.DataFrame = None,
                     num_topics: int = None,
                     seed: int = None,
                     scores: List[BaseScore] = None,
                     num_safe_fit_iterations: int = 3,
                     num_processors: int = 3,
                     cache_theta: bool = False) -> TopicModel:
    """
    Create a TopicModel, optionally initialized from a given phi matrix.

    Parameters
    ----------
    dataset
        Source of the dictionary used for initialization.
    phi
        Optional initial topic-word matrix; its columns define the topics.
    num_topics
        Number of topics; may be omitted if `phi` is given (then inferred
        from `phi.shape[1]`); if both are given, must be >= `phi.shape[1]`.
    seed
        Random seed for the underlying ARTM model.
    scores
        Scores to attach to the created model.
    num_safe_fit_iterations
        If > 0 and `phi` is given, copy phi "safely" via several fit
        iterations; otherwise copy directly.
    num_processors
        Number of ARTM worker processes.
    cache_theta
        Whether the model caches its theta matrix.

    Raises
    ------
    ValueError
        If neither `num_topics` nor `phi` is specified.
    """
    dictionary = dataset.get_dictionary()

    if num_topics is not None and phi is not None:
        assert num_topics >= phi.shape[1]
    elif num_topics is None and phi is not None:
        num_topics = phi.shape[1]
    elif num_topics is None and phi is None:
        # FIX: the original raised a bare ValueError() with no message
        raise ValueError('Either `num_topics` or `phi` should be specified!')

    topic_names = [f'topic_{i}' for i in range(num_topics)]

    if seed is None:
        artm_model = artm.ARTM(topic_names=topic_names)
    else:
        artm_model = artm.ARTM(topic_names=topic_names, seed=seed)

    artm_model.num_processors = num_processors
    artm_model.initialize(dictionary)

    if phi is None:
        pass
    elif num_safe_fit_iterations is not None and num_safe_fit_iterations > 0:
        init_phi_utils._safe_copy_phi(artm_model, phi, dataset,
                                      num_safe_fit_iterations)
    else:
        init_phi_utils._copy_phi(artm_model, phi)

    topic_model = TopicModel(artm_model=artm_model,
                             model_id='0',
                             cache_theta=cache_theta,
                             theta_columns_naming='title')

    if scores is not None:
        for score in scores:
            score._attach(topic_model)

    return topic_model
def test_call_toptokens_small_big_data(self, keep_in_memory) -> None:
    """Top-tokens coherence score works for in-memory and on-disk datasets."""
    dataset = Dataset(
        self.dataset_file_path,
        keep_in_memory=keep_in_memory,
    )
    self._check_call(_TopTokensCoherenceScore(dataset))
def test_call_intratext_small_big_data(self, keep_in_memory) -> None:
    """Intratext coherence score works for in-memory and on-disk datasets."""
    dataset = Dataset(
        self.dataset_file_path,
        keep_in_memory=keep_in_memory,
    )
    self._check_call(_IntratextCoherenceScore(dataset))
def background_topics_train_func(dataset: Dataset,
                                 model_number: int,
                                 num_topics: int,
                                 num_fit_iterations: int,
                                 scores: List[BaseScore] = None,
                                 num_background_topics: int = 2,
                                 smoothing_tau: float = 0.01,
                                 **kwargs) -> TopicModel:
    """
    Train with extra background topics, then retrain on the specific topics.

    Stage 1: a model with `num_topics + num_background_topics` topics is
    trained, the background topics being smoothed by `smoothing_tau`.
    Stage 2: a fresh model with `num_topics` topics is trained, but before
    each iteration its phi is overwritten with the specific-topic columns
    extracted from stage 1 (so only the theta/other internals evolve).
    The last iteration runs with `scores` attached.

    Parameters
    ----------
    dataset
        Data to train on.
    model_number
        Used as the random seed.
    num_topics
        Number of specific (non-background) topics.
    num_fit_iterations
        Number of fit iterations in stage 2.
    scores
        Scores attached only for the last fit iteration.
    num_background_topics
        Number of extra smoothed topics in stage 1.
    smoothing_tau
        Smoothing coefficient for each background topic.
    kwargs
        Forwarded to `_get_topic_model` in stage 1 only.
    """
    topic_model = _get_topic_model(
        dataset,
        num_topics=num_topics + num_background_topics,
        seed=model_number,
        **kwargs,
    )

    # Smooth only the trailing `num_background_topics` topics
    for background_topic_name in list(
            topic_model.get_phi().columns)[-num_background_topics:]:
        topic_model._model.regularizers.add(
            artm.regularizers.SmoothSparsePhiRegularizer(
                tau=smoothing_tau,
                topic_names=background_topic_name  # TODO: why not list?
            ))

    topic_model._fit(dataset.get_batch_vectorizer(),
                     num_iterations=num_fit_iterations)

    # Keep only the specific-topic columns of the stage-1 phi
    specific_topics_phi = topic_model.get_phi(
    ).iloc[:, :-num_background_topics]

    del topic_model

    # NOTE(review): `**kwargs` is not forwarded here, unlike the stage-1
    # call above — possibly an oversight; TODO confirm
    topic_model = _get_topic_model(
        dataset,
        num_topics=num_topics,
        seed=model_number,
    )

    num_fit_iterations_with_scores = 1
    num_fit_iterations_without_scores = num_fit_iterations - num_fit_iterations_with_scores

    # Re-copy the fixed phi before every single-iteration fit,
    # reusing the attached phi_ref after the first copy
    phi_ref = None

    for fit_iteration in range(num_fit_iterations_without_scores):
        phi_ref = init_phi_utils._copy_phi(topic_model._model,
                                           specific_topics_phi,
                                           phi_ref=phi_ref)
        topic_model._fit(dataset.get_batch_vectorizer(),
                         num_iterations=1)

    phi_ref = init_phi_utils._copy_phi(topic_model._model,
                                       specific_topics_phi,
                                       phi_ref=phi_ref)

    _fit_model_with_scores(topic_model,
                           dataset,
                           scores,
                           num_fit_iterations=num_fit_iterations_with_scores)

    # TODO: not very safe here? (if cache_theta us True, Theta not updated here)
    init_phi_utils._copy_phi(topic_model._model,
                             specific_topics_phi,
                             phi_ref=phi_ref)

    return topic_model