def load(cls, path: str):
    """Restore a score object previously serialized with dill.

    Parameters
    ----------
    path
        File path of the serialized score.

    Returns
    -------
    an instance of this class
    """
    # NOTE(review): dill.load executes arbitrary code on load — only open trusted files
    with open(path, 'rb') as score_file:
        score = dill.load(score_file)

    # The dataset is not serialized together with the score: either skip it
    # entirely (global switch) or re-open it from its recorded paths.
    if __NO_LOADING_DATASET__[0]:
        score._dataset = None
    else:
        score._dataset = Dataset(
            score._dataset_file_path,
            internals_folder_path=score._dataset_internals_folder_path,
            keep_in_memory=score._keep_dataset_in_memory,
        )

    return score
def _to_dataset(self) -> Dataset:
    """Materialize the vowpal-wabbit text file as a Dataset, caching the result."""
    if self._dataset is not None:
        return self._dataset

    # Pick (or validate) the folder that will hold the generated dataset table
    if self._dataset_folder is None:
        self._dataset_folder = tempfile.mkdtemp(
            prefix='_dataset_',
            dir=os.path.dirname(self._file_path),
        )
    else:
        assert os.path.isdir(self._dataset_folder)

    dataset_table_path = os.path.join(self._dataset_folder, 'dataset.csv')

    # Convert each non-empty vowpal-wabbit line into a CSV row of
    # (doc_id, vw_text, raw_text); raw text is unknown here, hence None.
    with open(self._file_path, 'r') as vw_file, open(dataset_table_path, 'w') as table_file:
        table_writer = csv.writer(table_file)
        table_writer.writerow([DOC_ID_COL, VW_TEXT_COL, RAW_TEXT_COL])

        for line in vw_file:
            vw_text = line.strip()

            if not vw_text:
                continue

            # The first whitespace-separated token of a VW line is the document id
            doc_id = vw_text.split()[0]
            raw_text = None  # TODO: check if this OK

            table_writer.writerow([doc_id, vw_text, raw_text])

    self._dataset = Dataset(dataset_table_path, **self._dataset_kwargs)

    return self._dataset
def get_phi_index(dataset: Dataset) -> Index:
    """Return the phi-matrix row index (vocabulary) for the given dataset.

    Builds a throwaway single-topic model just to read off the index of its
    phi matrix, then discards the model.
    """
    throwaway_artm = artm.ARTM(num_topics=1, num_processors=1)
    throwaway_artm.initialize(dictionary=dataset.get_dictionary())

    throwaway_model = TopicModel(artm_model=throwaway_artm)
    phi_index = throwaway_model.get_phi().index

    # Release the temporary models promptly: ARTM objects hold native resources
    del throwaway_model
    del throwaway_artm

    return phi_index
def _safe_copy_phi(
        model: artm.ARTM,
        phi: pd.DataFrame,
        dataset: Dataset,
        small_num_fit_iterations: int = 3) -> np.ndarray:
    """Copy `phi` into `model`, interleaving short fits so the copy takes effect.

    With zero fit iterations, the phi matrix is copied once, directly.
    Otherwise the copy is repeated, each time followed by one offline fit pass.
    """
    if small_num_fit_iterations == 0:
        return _copy_phi(model, phi)

    phi_ref = None

    # TODO: small_num_fit_iterations bigger than 1 seems not working for big matrices
    for _ in range(small_num_fit_iterations):
        phi_ref = _copy_phi(model, phi, phi_ref=phi_ref)
        model.fit_offline(dataset.get_batch_vectorizer(), 1)

    return phi_ref
def init_model_from_family(
        # NOTE: the annotation was `str or KnownModel`, which evaluates to just `str`;
        # a lazy string annotation keeps the intent without requiring a runtime import
        family: 'Union[str, KnownModel]',
        dataset: Dataset,
        main_modality: str,
        num_topics: int,
        seed: int,
        modalities_to_use: List[str] = None,
        num_processors: int = 3,
        model_params: dict = None):
    """Create, configure and initialize a topic model of the requested family.

    Parameters
    ----------
    family
        Family name (or a KnownModel member), one of:
        "LDA", "PLSA", "TARTM", "sparse", "decorrelation", "ARTM".
    dataset
        Dataset providing the dictionary and modalities.
    main_modality
        Modality used for smoothing regularizers and standard scores.
    num_topics
        Number of topics.
    seed
        Random seed for the underlying ARTM model; ignored if None.
    modalities_to_use
        Modalities to keep in the dictionary; defaults to [main_modality].
    num_processors
        Number of processor threads for the ARTM model.
    model_params
        Family-specific hyperparameters, forwarded to the family initializer.

    Returns
    -------
    model: TopicModel() instance

    Raises
    ------
    ValueError
        If `family` is not one of the supported family names.
    """
    if isinstance(family, KnownModel):
        family = family.value

    if modalities_to_use is None:
        modalities_to_use = [main_modality]

    custom_regs = {}

    if family == "LDA":
        model = init_lda(dataset, modalities_to_use, main_modality, num_topics, model_params)
    elif family == "PLSA":
        model = init_plsa(dataset, modalities_to_use, main_modality, num_topics)
    elif family == "TARTM":
        # The thetaless model additionally defines custom regularizers
        model, custom_regs = init_thetaless(dataset, modalities_to_use, main_modality,
                                            num_topics, model_params)
    elif family == "sparse":
        model = init_bcg_sparse_model(dataset, modalities_to_use, main_modality,
                                      num_topics, 1, model_params)
    elif family == "decorrelation":
        model = init_decorrelated_plsa(dataset, modalities_to_use, main_modality,
                                       num_topics, model_params)
    elif family == "ARTM":
        model = init_baseline_artm(dataset, modalities_to_use, main_modality,
                                   num_topics, 1, model_params)
    else:
        raise ValueError(f'Unknown model family: {family!r}')

    model.num_processors = num_processors

    if seed is not None:
        model.seed = seed

    dictionary = dataset.get_dictionary()

    # Remove every modality the model should not use from the dictionary
    # TODO: maybe this cycle is not necessary
    for modality in dataset.get_possible_modalities():
        if modality not in modalities_to_use:
            dictionary.filter(class_id=modality, max_df=0, inplace=True)

    model.initialize(dictionary)
    add_standard_scores(model, dictionary,
                        main_modality=main_modality,
                        all_modalities=modalities_to_use)

    model = TopicModel(artm_model=model, custom_regularizers=custom_regs)

    return model
def init_lda(
        dataset: Dataset,
        modalities_to_use: List[str],
        main_modality: str,
        num_topics: int,
        model_params: dict = None,
):
    """
    Creates simple artm model with standard scores.

    Builds a PLSA model first, then turns it into LDA by attaching
    smoothing regularizers over Phi (tau=eta) and Theta (tau=alpha),
    where eta/alpha depend on `model_params['prior']`.

    Parameters
    ----------
    dataset
    modalities_to_use
    main_modality
    num_topics
    model_params
        May contain 'prior': one of 'symmetric' (default), 'asymmetric',
        'double_asymmetric' (not implemented), 'heuristic'.

    Returns
    -------
    model: artm.ARTM() instance
    """
    if model_params is None:
        model_params = dict()

    # LDA = PLSA + Dirichlet smoothing regularizers
    model = init_plsa(dataset, modalities_to_use, main_modality, num_topics)

    prior = model_params.get('prior', 'symmetric')

    # What GenSim returns by default (everything is 'symmetric')
    # see https://github.com/RaRe-Technologies/gensim/blob/master/gensim/models/ldamodel.py#L521
    # Note that you can specify prior shape for alpha and beta separately,
    # but we do not do that here
    if prior == "symmetric":
        alpha = 1.0 / num_topics
        eta = 1.0 / num_topics
    elif prior == "asymmetric":
        # following the recommendation from
        # http://papers.nips.cc/paper/3854-rethinking-lda-why-priors-matter
        # we will use symmetric prior over Phi and asymmetric over Theta
        eta = 1.0 / num_topics
        num_terms = 0  # isn't used, so let's not compute it
        alpha = _init_dirichlet_prior("alpha", num_topics, num_terms=num_terms)
    elif prior == "double_asymmetric":
        # this stuff is needed for asymmetric Phi initialization:
        artm_dict = dataset.get_dictionary()
        # NOTE(review): artm_dict2df is not defined in this module (hence the noqa);
        # this branch raises NotImplementedError below anyway
        temp_df = artm_dict2df(artm_dict)  # noqa: F821
        num_terms = temp_df.query("class_id in @modalities_to_use").shape[0]

        eta = _init_dirichlet_prior("eta", num_topics, num_terms)
        alpha = _init_dirichlet_prior("alpha", num_topics, num_terms)

        # TODO: turns out, BigARTM does not support tau as a list of floats (or dictionary)
        # so we need to use custom regularizer instead
        # (TopicPrior doesn't work because it provides $beta_t$ instead of $beta_w$)
        raise NotImplementedError
    elif prior == "heuristic":
        # Found in doi.org/10.1007/s10664-015-9379-3 (2016)
        # "We use the defacto standard heuristics of α=50/K and β=0.01
        # (Biggers et al. 2014) for our hyperparameter values"
        alpha = 50.0 / num_topics
        eta = 0.01
    else:
        raise TypeError(f"prior type '{prior}' is not supported")

    # Smooth Phi only over the main modality
    model.regularizers.add(
        artm.SmoothSparsePhiRegularizer(
            name='smooth_phi',
            tau=eta,
            class_ids=[main_modality],
        ),
    )

    if isinstance(alpha, (list, np.ndarray)):
        alpha = [float(a) for a in alpha]
        assert (len(alpha) == len(model.topic_names))
        # Asymmetric alpha: one Theta regularizer per topic with its own tau
        for i, topic in enumerate(model.topic_names):
            model.regularizers.add(
                artm.SmoothSparseThetaRegularizer(name=f'smooth_theta_{i}',
                                                  tau=alpha[i],
                                                  topic_names=topic))
    else:
        # Symmetric alpha: a single Theta regularizer for all topics
        model.regularizers.add(
            artm.SmoothSparseThetaRegularizer(
                name='smooth_theta',
                tau=alpha,
            ),
        )

    return model
def _get_artm_model_template(dataset: Dataset, num_topics: int) -> artm.ARTM:
    """Build a bare single-process ARTM model initialized on the dataset's dictionary."""
    template = artm.ARTM(num_topics=num_topics, num_processors=1)
    template.initialize(dictionary=dataset.get_dictionary())

    return template
def _train_models(
        self,
        text_collection: VowpalWabbitTextCollection,
        min_df_rate: float,
        max_df_rate: float,
        num_topics_to_skip: List[int] = None) -> None:
    """Train and save one model per (number of topics, dataset subsample) pair.

    For each topic count in [self._min_num_topics, self._max_num_topics]
    (stepping by self._num_topics_interval, minus `num_topics_to_skip`),
    trains a model on every dataset subsample file and saves its Phi matrix
    under self._models_folder_path.

    Parameters
    ----------
    text_collection
        Source collection; supplies the modalities and the main modality.
    min_df_rate, max_df_rate
        Document-frequency bounds used to filter each subsample's dictionary.
    num_topics_to_skip
        Topic counts to exclude from the grid (optional).
    """
    modalities_to_use = list(text_collection._modalities.keys())
    main_modality = text_collection._main_modality

    numbers_of_topics = list(range(
        self._min_num_topics,
        self._max_num_topics + 1,
        self._num_topics_interval))

    if num_topics_to_skip is not None:
        numbers_of_topics = [
            n for n in numbers_of_topics
            if n not in num_topics_to_skip
        ]

    # Only preview the first 10 topic counts in the log message
    num_topics_for_message = ', '.join(
        [str(n) for n in numbers_of_topics[:10]]
    )

    print(f'\n Folder for models saving: "{self._models_folder_path}"')
    print(
        f'Training models for {len(numbers_of_topics)}'
        f' numbers of topics: {num_topics_for_message}...'
    )

    for num_topics in tqdm(
            numbers_of_topics,
            total=len(numbers_of_topics),
            file=sys.stdout):

        # NOTE(review): os.makedirs without exist_ok — fails if the folder
        # already exists, i.e. a rerun over the same models folder will raise
        os.makedirs(
            self._folder_path_num_topics(num_topics)
        )

        subsample_data_paths = self._get_dataset_subsample_file_paths()

        for subsample_number, data_path in tqdm(
                enumerate(subsample_data_paths),
                total=len(subsample_data_paths),
                file=sys.stdout):

            dataset = Dataset(data_path=data_path)

            # Filter rare and overly common tokens before initialization
            dictionary = dataset.get_dictionary()
            dictionary.filter(
                min_df_rate=min_df_rate,
                max_df_rate=max_df_rate,
            )

            artm_model = init_model_from_family(
                family=self._model_family,
                dataset=dataset,
                modalities_to_use=modalities_to_use,
                main_modality=main_modality,
                num_topics=num_topics,
                seed=self._model_seed,
                num_processors=self._model_num_processors,
                model_params=self._model_params,
            )
            topic_model = TopicModel(artm_model)

            topic_model._fit(
                dataset_trainable=dataset.get_batch_vectorizer(),
                num_iterations=self._num_fit_iterations,
            )

            model_save_path = self._folder_path_model(num_topics, subsample_number)

            # Only Phi is kept on disk; Theta is not needed downstream
            topic_model.save(
                model_save_path=model_save_path,
                phi=True,
                theta=False,
            )