def train_model(self, config, reporter):
    """Fit the model described by a single configuration and evaluate it.

    Parameters:
        config: `Configuration` object within the specified `ConfigSpace`
        reporter: Reporter callback for model selection

    Returns:
        (tuple): Tuple containing:
            - model (:class:`tmnt.estimator.BowEstimator`): VAE model estimator with trained parameters
            - obj (float): scaled objective
            - results_details (dict): Dictionary of estimator metrics on validation data
    """
    logging.debug("Evaluating with Config: {}".format(config))

    # Only the first device in the list is handed to the estimator.
    if self.use_gpu:
        devices = self._get_mxnet_visible_gpus()
    else:
        devices = [mx.cpu()]
    estimator = self._get_estimator(config, reporter, devices[0])

    vocab_size = len(self.vocabulary)
    train_X, train_y, _, _ = file_to_data(self.train_data_path, vocab_size)

    # Validation data is optional; without it the estimator receives None for both.
    val_X, val_y = None, None
    if self.test_data_path is not None:
        val_X, val_y, _, _ = file_to_data(self.test_data_path, vocab_size)

    objective, val_results = estimator.fit_with_validation(train_X, train_y, val_X, val_y)
    return estimator, objective, val_results
def set_heldout_data_as_test(self):
    """Load in the heldout test data for final model evaluation.

    Reads the file named by ``self.c_args.tst_vec_file`` with ``file_to_data``
    and stores the results on the instance:

    - ``self.data_test_csr``: first returned value (the test matrix)
    - ``self.test_labels``: second returned value (the test labels)
    - ``self.total_tst_words``: fourth returned value
    """
    # file_to_data is given the vocabulary *size* (not the vocabulary object),
    # matching the way it is called everywhere else.
    tst_mat, tst_labels, _, total_tst_words = file_to_data(
        self.c_args.tst_vec_file, len(self.vocabulary))
    self.data_test_csr = tst_mat
    self.test_labels = tst_labels
    self.total_tst_words = total_tst_words
def _get_x_y_data(self, data_source): if isinstance(data_source, str): X, y, _, _ = file_to_data(data_source, len(self.vocabulary)) elif isinstance(data_source, tuple): X, y = data_source else: X, y = data_source, None return X, y
def from_arguments(cls, c_args, val_each_epoch=True):
    """Constructor method to build BowVAETrainer from command-line arguments directly.

    Parameters:
        c_args (`argparse.Namespace`): Command-line arguments.
        val_each_epoch (bool): Flag for performing validation each epoch. optional (default = True)

    Returns:
        Instance of ``cls`` built from the loaded vocabulary and training data.

    Raises:
        Exception: if both a vocab file and a training vector file are given
            but at least one of them is not an existing file.
    """
    i_dt = datetime.datetime.now()
    # Each run logs into its own timestamped directory under save_dir.
    log_out_dir = os.path.join(
        c_args.save_dir,
        "train_{}_{}_{}_{}_{}_{}_{}".format(
            i_dt.year, i_dt.month, i_dt.day, i_dt.hour,
            i_dt.minute, i_dt.second, i_dt.microsecond))
    # exist_ok=True makes a separate existence check unnecessary.
    Path(log_out_dir).mkdir(parents=True, exist_ok=True)

    if not log_utils.CONFIGURED:
        logging_config(folder=log_out_dir, name='tmnt',
                       level=c_args.log_level, console_level=c_args.log_level)
    logging.info(c_args)
    seed_rng(c_args.seed)

    # The existence check only runs when both paths were supplied.
    if c_args.vocab_file and c_args.tr_vec_file:
        vpath = Path(c_args.vocab_file)
        tpath = Path(c_args.tr_vec_file)
        if not (vpath.is_file() and tpath.is_file()):
            raise Exception(
                "Vocab file {} and/or training vector file {} do not exist"
                .format(c_args.vocab_file, c_args.tr_vec_file))

    logging.info(
        "Loading data via pre-computed vocabulary and sparse vector format document representation"
    )
    vocab = load_vocab(c_args.vocab_file, encoding=c_args.str_encoding)
    voc_size = len(vocab)
    # Only the labels and word frequencies are needed here; the trainer is
    # handed the file paths rather than the loaded matrix.
    _, y, wd_freqs, _ = file_to_data(c_args.tr_vec_file, voc_size)

    model_out_dir = c_args.model_dir if c_args.model_dir else os.path.join(
        log_out_dir, 'MODEL')
    # Largest label value plus one.
    # NOTE(review): presumably labels are non-negative integer codes — confirm.
    n_covars = int(float(np.max(y)) + 1)
    # makedirs also creates missing parents of a user-supplied model_dir and
    # tolerates the directory already existing.
    os.makedirs(model_out_dir, exist_ok=True)

    return cls(log_out_dir, model_out_dir, vocab, wd_freqs,
               c_args.tr_vec_file, c_args.val_vec_file,
               coherence_via_encoder=c_args.encoder_coherence,
               pretrained_param_file=c_args.pretrained_param_file,
               topic_seed_file=c_args.topic_seed_file,
               use_labels_as_covars=c_args.use_labels_as_covars,
               use_gpu=c_args.use_gpu,
               n_covars=n_covars,
               val_each_epoch=val_each_epoch)
def get_model_details(self, sp_vec_file):
    """Collect model and corpus statistics for the documents in ``sp_vec_file``.

    Returns:
        (tuple): Tuple containing:
            - w_pr: softmax over axis 1 of the transposed decoder weights
            - dt_matrix: result of ``encode_data`` on the loaded data with ``use_probs=True``
            - doc_lengths: sum of the data matrix along axis 1
            - term_cnts: sum of the data matrix along axis 0
    """
    raw_counts, labels, _, _ = file_to_data(sp_vec_file, len(self.vocab))
    data_csr = mx.nd.sparse.csr_matrix(raw_counts, dtype='float32')

    ## 1) K x W matrix of P(term|topic) probabilities
    decoder_weight = self.model.decoder.collect_params().get('weight').data()
    topic_term_weights = decoder_weight.transpose()  ## (K x W)
    w_pr = mx.nd.softmax(topic_term_weights, axis=1)

    ## 2) D x K matrix over the test data of topic probabilities
    # Labels are only passed as covariates when the model is a covariate model.
    if self.covar_model:
        covars = labels
    else:
        covars = None
    dt_matrix = self.encode_data(data_csr, covars, use_probs=True)

    ## 3) D-length vector of document sizes
    doc_lengths = data_csr.sum(axis=1)

    ## 4) vocab (in same order as W columns)
    ## 5) frequency of each word w_i \in W over the test corpus
    term_cnts = data_csr.sum(axis=0)

    return w_pr, dt_matrix, doc_lengths, term_cnts
top_k_terms.append(ts) return top_k_terms os.environ["MXNET_STORAGE_FALLBACK_LOG_VERBOSE"] = "0" if __name__ == "__main__": parser = setup_parser() args = parser.parse_args() verbose = False ### XXX - add as argument vocab = load_vocab(args.vocab_file) if args.override_top_k_terms: top_k_words_per_topic = get_top_k_terms_from_file( args.override_top_k_terms) tst_csr, _, _, _ = file_to_data(args.test_file, len(vocab)) top_k_words_per_topic_ids = [[vocab[t] for t in t_set] for t_set in top_k_words_per_topic] npmi_eval = EvaluateNPMI(top_k_words_per_topic_ids) test_npmi = npmi_eval.evaluate_csr_mat(tst_csr) print("**** Test NPMI = {} *******".format(test_npmi)) exit(0) inference_model = BowVAEInferencer.from_saved( model_dir=args.model_dir, ctx=mx.cpu() if args.gpu < 0 else mx.gpu(args.gpu)) if args.plot_file: # get UMAP embedding visualization import matplotlib.pyplot as plt encoded, labels = inference_model.encode_vec_file(args.test_file) encodings = np.array([doc.asnumpy() for doc in encoded])
return top_k_terms os.environ["MXNET_STORAGE_FALLBACK_LOG_VERBOSE"] = "0" if __name__ == "__main__": parser = setup_parser() args = parser.parse_args() verbose = False ### XXX - add as argument inference_model = BowVAEInferencer.from_saved(model_dir=args.model_dir, ctx=mx.cpu() if args.gpu < 0 else mx.gpu(args.gpu)) if args.override_top_k_terms: top_k_words_per_topic = get_top_k_terms_from_file(args.override_top_k_terms) tst_csr, _, _, _ = file_to_data(args.test_file, len(inference_model.vocab)) top_k_words_per_topic_ids = [ [ inference_model.vocab[t] for t in t_set ] for t_set in top_k_words_per_topic ] npmi_eval = EvaluateNPMI(top_k_words_per_topic_ids) test_npmi = npmi_eval.evaluate_csr_mat(tst_csr) print("**** Test NPMI = {} *******".format(test_npmi)) exit(0) if args.plot_file: # get UMAP embedding visualization import matplotlib.pyplot as plt encoded, labels = inference_model.encode_vec_file(args.test_file) encodings = np.array([doc.asnumpy() for doc in encoded]) print("There are {0} labels and {1} encodings".format(len(labels), len(encodings))) umap_model = umap.UMAP(n_neighbors=4, min_dist=0.5, metric='euclidean') embeddings = umap_model.fit_transform(encodings) plt.scatter(*embeddings.T, c=labels, s=0.2, alpha=0.7, cmap='coolwarm')
def from_arguments(cls, c_args, val_each_epoch=True):
    """Constructor method to build BowVAETrainer from command-line arguments directly.

    Parameters:
        c_args (`argparse.Namespace`): Command-line arguments.
        val_each_epoch (bool): Flag for performing validation each epoch. optional (default = True)

    Returns:
        Instance of ``cls`` built from the loaded vocabulary, training data and
        (when ``c_args.val_vec_file`` is set) validation data.

    Raises:
        Exception: if both a vocab file and a training vector file are given
            but at least one of them is not an existing file.
    """
    i_dt = datetime.datetime.now()
    # Each run logs into its own timestamped directory under save_dir.
    log_out_dir = os.path.join(
        c_args.save_dir,
        "train_{}_{}_{}_{}_{}_{}_{}".format(
            i_dt.year, i_dt.month, i_dt.day, i_dt.hour,
            i_dt.minute, i_dt.second, i_dt.microsecond))

    # Map the textual level (case-insensitive) to a logging constant;
    # unrecognised names fall back to INFO.
    level_by_name = {
        'info': logging.INFO,
        'debug': logging.DEBUG,
        'error': logging.ERROR,
        'warning': logging.WARNING,
    }
    log_level = level_by_name.get(c_args.log_level.lower(), logging.INFO)
    logging_config(folder=log_out_dir, name='tmnt',
                   level=log_level, console_level=log_level)
    logging.info(c_args)
    seed_rng(c_args.seed)

    # The existence check only runs when both paths were supplied.
    if c_args.vocab_file and c_args.tr_vec_file:
        vpath = Path(c_args.vocab_file)
        tpath = Path(c_args.tr_vec_file)
        if not (vpath.is_file() and tpath.is_file()):
            raise Exception(
                "Vocab file {} and/or training vector file {} do not exist"
                .format(c_args.vocab_file, c_args.tr_vec_file))

    logging.info(
        "Loading data via pre-computed vocabulary and sparse vector format document representation"
    )
    vocab = load_vocab(c_args.vocab_file, encoding=c_args.str_encoding)
    voc_size = len(vocab)
    X, y, wd_freqs, _ = file_to_data(c_args.tr_vec_file, voc_size)

    # Validation data is optional; without it the trainer gets None / 0.
    if c_args.val_vec_file:
        val_X, val_y, _, total_test_wds = file_to_data(
            c_args.val_vec_file, voc_size)
    else:
        val_X, val_y, total_test_wds = None, None, 0

    model_out_dir = c_args.model_dir if c_args.model_dir else os.path.join(
        log_out_dir, 'MODEL')
    # makedirs creates any missing parent directories (including log_out_dir
    # for the default location) and tolerates the directory already existing.
    os.makedirs(model_out_dir, exist_ok=True)

    return cls(log_out_dir, model_out_dir, c_args, vocab, wd_freqs,
               X, val_X, total_test_wds,
               train_labels=y,
               test_labels=val_y,
               label_map=None,
               use_gpu=c_args.use_gpu,
               val_each_epoch=val_each_epoch)
def encode_vec_file(self, sp_vec_file):
    """Load ``sp_vec_file`` and encode its documents.

    Returns:
        (tuple): ``(encodings, labels)`` where ``encodings`` is the result of
        ``self.encode_data`` on the loaded data and labels, and ``labels`` are
        the labels read from the file.
    """
    doc_matrix, doc_labels, _, _ = file_to_data(sp_vec_file, len(self.vocab))
    encodings = self.encode_data(doc_matrix, doc_labels)
    return encodings, doc_labels