Exemple #1
0
    def train_model(self, config, reporter):
        """Fit a single estimator for one model configuration.

        The estimator is obtained from ``self._get_estimator`` and placed on
        the first device returned by ``self._get_mxnet_visible_gpus()`` when
        ``self.use_gpu`` is set, otherwise on the CPU. Training vectors are
        read from ``self.train_data_path``; validation vectors are read from
        ``self.test_data_path`` only when that path is not ``None``.

        Parameters:
            config: `Configuration` object within the specified `ConfigSpace`
            reporter: Reporter callback for model selection

        Returns:
            (tuple): Tuple containing:
                - model (:class:`tmnt.estimator.BowEstimator`) VAE model estimator with trained parameters
                - obj (float): scaled objective
                - results_details (dict): Dictionary of estimator metrics on validation data
        """
        logging.debug("Evaluating with Config: {}".format(config))

        # Select the compute device for this run.
        if self.use_gpu:
            ctx = self._get_mxnet_visible_gpus()[0]
        else:
            ctx = mx.cpu()

        vae_estimator = self._get_estimator(config, reporter, ctx)

        vocab_size = len(self.vocabulary)
        train_X, train_y, _, _ = file_to_data(self.train_data_path, vocab_size)

        # Validation data is optional; the estimator receives None when absent.
        val_X = val_y = None
        if self.test_data_path is not None:
            val_X, val_y, _, _ = file_to_data(self.test_data_path, vocab_size)

        objective, val_details = vae_estimator.fit_with_validation(
            train_X, train_y, val_X, val_y)
        return vae_estimator, objective, val_details
Exemple #2
0
 def set_heldout_data_as_test(self):
     """Load in the heldout test data for final model evaluation.

     Reads the vector file at ``self.c_args.tst_vec_file`` and stores the
     results on the instance:

         - ``data_test_csr``: first value returned by ``file_to_data``
         - ``test_labels``: second value returned by ``file_to_data``
         - ``total_tst_words``: fourth value returned by ``file_to_data``
     """
     # file_to_data is called with the vocabulary *size* everywhere else;
     # passing the vocabulary object itself here was inconsistent with that.
     tst_mat, tst_labels, _, total_tst_words = file_to_data(
         self.c_args.tst_vec_file, len(self.vocabulary))
     self.data_test_csr = tst_mat
     self.test_labels = tst_labels
     self.total_tst_words = total_tst_words
Exemple #3
0
 def _get_x_y_data(self, data_source):
     if isinstance(data_source, str):
         X, y, _, _ = file_to_data(data_source, len(self.vocabulary))
     elif isinstance(data_source, tuple):
         X, y = data_source
     else:
         X, y = data_source, None
     return X, y
Exemple #4
0
 def from_arguments(cls, c_args, val_each_epoch=True):
     """Constructor method to build BowVAETrainer from command-line arguments directly.

     Creates a timestamped log directory under ``c_args.save_dir``, configures
     logging if ``log_utils.CONFIGURED`` is not yet set, calls ``seed_rng``
     with ``c_args.seed``, loads the vocabulary and the training vectors, and
     makes sure the model output directory exists before instantiating ``cls``.

     Parameters:
         c_args (`argparse.Namespace`): Command-line arguments.
         val_each_epoch (bool): Flag for performing validation each epoch. optional (default = True)

     Returns:
         Instance of ``cls`` built from the arguments.

     Raises:
         Exception: If both ``c_args.vocab_file`` and ``c_args.tr_vec_file``
             are given but at least one of them is not an existing file.
     """
     i_dt = datetime.datetime.now()
     log_out_dir = os.path.join(
         c_args.save_dir,
         "train_{}_{}_{}_{}_{}_{}_{}".format(
             i_dt.year, i_dt.month, i_dt.day, i_dt.hour, i_dt.minute,
             i_dt.second, i_dt.microsecond))
     # exist_ok=True already makes this idempotent, so no separate
     # (race-prone) existence check is needed.
     Path(log_out_dir).mkdir(parents=True, exist_ok=True)
     if not log_utils.CONFIGURED:
         logging_config(folder=log_out_dir,
                        name='tmnt',
                        level=c_args.log_level,
                        console_level=c_args.log_level)
     logging.info(c_args)
     seed_rng(c_args.seed)
     if c_args.vocab_file and c_args.tr_vec_file:
         vpath = Path(c_args.vocab_file)
         tpath = Path(c_args.tr_vec_file)
         if not (vpath.is_file() and tpath.is_file()):
             raise Exception(
                 "Vocab file {} and/or training vector file {} do not exist"
                 .format(c_args.vocab_file, c_args.tr_vec_file))
     logging.info(
         "Loading data via pre-computed vocabulary and sparse vector format document representation"
     )
     vocab = load_vocab(c_args.vocab_file, encoding=c_args.str_encoding)
     voc_size = len(vocab)
     # X is loaded only as part of the tuple; the trainer receives the file
     # paths and re-reads the data itself.
     X, y, wd_freqs, _ = file_to_data(c_args.tr_vec_file, voc_size)
     model_out_dir = c_args.model_dir if c_args.model_dir else os.path.join(
         log_out_dir, 'MODEL')
     # Number of covariates is one more than the largest label value.
     n_covars = int(float(np.max(y)) + 1)
     # makedirs (rather than mkdir) so a user-supplied nested model_dir whose
     # parents do not exist yet is created instead of raising.
     os.makedirs(model_out_dir, exist_ok=True)
     return cls(log_out_dir,
                model_out_dir,
                vocab,
                wd_freqs,
                c_args.tr_vec_file,
                c_args.val_vec_file,
                coherence_via_encoder=c_args.encoder_coherence,
                pretrained_param_file=c_args.pretrained_param_file,
                topic_seed_file=c_args.topic_seed_file,
                use_labels_as_covars=c_args.use_labels_as_covars,
                use_gpu=c_args.use_gpu,
                n_covars=n_covars,
                val_each_epoch=val_each_epoch)
Exemple #5
0
 def get_model_details(self, sp_vec_file):
     """Collect model and corpus statistics for the documents in ``sp_vec_file``.

     Parameters:
         sp_vec_file: Path to a vector file readable by ``file_to_data``.

     Returns:
         (tuple): Tuple containing:
             - softmax (over axis 1) of the transposed decoder weight,
               i.e. the K x W matrix of P(term|topic) probabilities
             - D x K matrix of topic probabilities for the loaded documents,
               as returned by ``self.encode_data(..., use_probs=True)``
             - row sums of the document matrix (document sizes)
             - column sums of the document matrix (term counts, in the same
               order as the vocabulary columns)
     """
     sparse_docs, doc_labels, _, _ = file_to_data(sp_vec_file, len(self.vocab))
     sparse_docs = mx.nd.sparse.csr_matrix(sparse_docs, dtype='float32')

     # Topic-by-term probabilities from the decoder weights (K x W).
     decoder_weight = self.model.decoder.collect_params().get('weight')
     topic_term_logits = decoder_weight.data().transpose()
     topic_term_probs = mx.nd.softmax(topic_term_logits, axis=1)

     # Labels are only passed to the encoder when the model uses covariates.
     if self.covar_model:
         covariates = doc_labels
     else:
         covariates = None
     doc_topic_probs = self.encode_data(sparse_docs, covariates, use_probs=True)

     # Per-document sizes, then per-term frequencies over the loaded corpus.
     doc_sizes = sparse_docs.sum(axis=1)
     term_freqs = sparse_docs.sum(axis=0)
     return topic_term_probs, doc_topic_probs, doc_sizes, term_freqs
Exemple #6
0
            top_k_terms.append(ts)
    return top_k_terms


# Set at import time; presumably silences MXNet's verbose storage-fallback
# log messages -- TODO confirm against the MXNet environment-variable docs.
os.environ["MXNET_STORAGE_FALLBACK_LOG_VERBOSE"] = "0"

# Script entry point: evaluate NPMI for externally supplied topic terms, or
# load a saved model for further evaluation/visualization.
if __name__ == "__main__":
    parser = setup_parser()
    args = parser.parse_args()

    verbose = False  ### XXX - add as argument
    vocab = load_vocab(args.vocab_file)
    if args.override_top_k_terms:
        # Topic terms come from a file instead of a model: map them to
        # vocabulary ids, score NPMI on the test matrix, print and exit.
        top_k_words_per_topic = get_top_k_terms_from_file(
            args.override_top_k_terms)
        tst_csr, _, _, _ = file_to_data(args.test_file, len(vocab))
        top_k_words_per_topic_ids = [[vocab[t] for t in t_set]
                                     for t_set in top_k_words_per_topic]
        npmi_eval = EvaluateNPMI(top_k_words_per_topic_ids)
        test_npmi = npmi_eval.evaluate_csr_mat(tst_csr)
        print("**** Test NPMI = {} *******".format(test_npmi))
        exit(0)

    # A negative --gpu value selects the CPU; otherwise the given GPU index.
    inference_model = BowVAEInferencer.from_saved(
        model_dir=args.model_dir,
        ctx=mx.cpu() if args.gpu < 0 else mx.gpu(args.gpu))

    if args.plot_file:  # get UMAP embedding visualization
        # matplotlib is imported lazily, only when a plot is requested.
        import matplotlib.pyplot as plt
        encoded, labels = inference_model.encode_vec_file(args.test_file)
        # Convert each encoded document to numpy and stack into one array.
        encodings = np.array([doc.asnumpy() for doc in encoded])
Exemple #7
0
    return top_k_terms


# Set at import time; presumably silences MXNet's verbose storage-fallback
# log messages -- TODO confirm against the MXNet environment-variable docs.
os.environ["MXNET_STORAGE_FALLBACK_LOG_VERBOSE"] = "0"

# Script entry point: load a saved model, then either evaluate NPMI for
# externally supplied topic terms or plot a UMAP projection of the encodings.
if __name__ == "__main__":
    parser = setup_parser()
    args = parser.parse_args()

    verbose = False ### XXX - add as argument
    # A negative --gpu value selects the CPU; otherwise the given GPU index.
    inference_model = BowVAEInferencer.from_saved(model_dir=args.model_dir,
                                                  ctx=mx.cpu() if args.gpu < 0 else mx.gpu(args.gpu))
    
    if args.override_top_k_terms:
        # Topic terms come from a file: map them to ids via the model's
        # vocabulary, score NPMI on the test matrix, print and exit.
        top_k_words_per_topic = get_top_k_terms_from_file(args.override_top_k_terms)
        tst_csr, _, _, _ = file_to_data(args.test_file, len(inference_model.vocab))
        top_k_words_per_topic_ids = [ [ inference_model.vocab[t] for t in t_set ]  for t_set in top_k_words_per_topic ]
        npmi_eval = EvaluateNPMI(top_k_words_per_topic_ids)
        test_npmi = npmi_eval.evaluate_csr_mat(tst_csr)
        print("**** Test NPMI = {} *******".format(test_npmi))
        exit(0)


    if args.plot_file: # get UMAP embedding visualization
        # matplotlib is imported lazily, only when a plot is requested.
        import matplotlib.pyplot as plt
        encoded, labels = inference_model.encode_vec_file(args.test_file)
        # Convert each encoded document to numpy and stack into one array.
        encodings = np.array([doc.asnumpy() for doc in encoded])
        print("There are {0} labels and {1} encodings".format(len(labels), len(encodings)))
        # Project the encodings with UMAP and scatter-plot them colored by label.
        umap_model = umap.UMAP(n_neighbors=4, min_dist=0.5, metric='euclidean')
        embeddings = umap_model.fit_transform(encodings)
        plt.scatter(*embeddings.T, c=labels, s=0.2, alpha=0.7, cmap='coolwarm')
Exemple #8
0
 def from_arguments(cls, c_args, val_each_epoch=True):
     """Constructor method to build BowVAETrainer from command-line arguments directly.

     Derives a timestamped log directory under ``c_args.save_dir``, configures
     logging at the level named by ``c_args.log_level``, calls ``seed_rng``
     with ``c_args.seed``, loads the vocabulary, the training vectors and
     (when ``c_args.val_vec_file`` is set) the validation vectors, and makes
     sure the model output directory exists before instantiating ``cls``.

     Parameters:
         c_args (`argparse.Namespace`): Command-line arguments.
         val_each_epoch (bool): Flag for performing validation each epoch. optional (default = True)

     Returns:
         Instance of ``cls`` built from the arguments.

     Raises:
         Exception: If both ``c_args.vocab_file`` and ``c_args.tr_vec_file``
             are given but at least one of them is not an existing file.
     """
     i_dt = datetime.datetime.now()
     log_out_dir = os.path.join(
         c_args.save_dir,
         "train_{}_{}_{}_{}_{}_{}_{}".format(
             i_dt.year, i_dt.month, i_dt.day, i_dt.hour, i_dt.minute,
             i_dt.second, i_dt.microsecond))
     # Case-insensitive level name -> logging constant; unknown, empty or
     # missing names fall back to INFO.
     level_by_name = {
         'info': logging.INFO,
         'debug': logging.DEBUG,
         'error': logging.ERROR,
         'warning': logging.WARNING,
     }
     log_level = level_by_name.get((c_args.log_level or '').lower(),
                                   logging.INFO)
     logging_config(folder=log_out_dir,
                    name='tmnt',
                    level=log_level,
                    console_level=log_level)
     logging.info(c_args)
     seed_rng(c_args.seed)
     if c_args.vocab_file and c_args.tr_vec_file:
         vpath = Path(c_args.vocab_file)
         tpath = Path(c_args.tr_vec_file)
         if not (vpath.is_file() and tpath.is_file()):
             raise Exception(
                 "Vocab file {} and/or training vector file {} do not exist"
                 .format(c_args.vocab_file, c_args.tr_vec_file))
     logging.info(
         "Loading data via pre-computed vocabulary and sparse vector format document representation"
     )
     vocab = load_vocab(c_args.vocab_file, encoding=c_args.str_encoding)
     voc_size = len(vocab)
     X, y, wd_freqs, _ = file_to_data(c_args.tr_vec_file, voc_size)
     # Validation data is optional; without it the trainer receives None
     # for the data/labels and a word count of 0.
     if c_args.val_vec_file:
         val_X, val_y, _, total_test_wds = file_to_data(
             c_args.val_vec_file, voc_size)
     else:
         val_X, val_y, total_test_wds = None, None, 0
     model_out_dir = c_args.model_dir if c_args.model_dir else os.path.join(
         log_out_dir, 'MODEL')
     # makedirs (rather than mkdir) so missing parent directories -- the log
     # directory or a nested user-supplied model_dir -- are created too.
     os.makedirs(model_out_dir, exist_ok=True)
     return cls(log_out_dir,
                model_out_dir,
                c_args,
                vocab,
                wd_freqs,
                X,
                val_X,
                total_test_wds,
                train_labels=y,
                test_labels=val_y,
                label_map=None,
                use_gpu=c_args.use_gpu,
                val_each_epoch=val_each_epoch)
Exemple #9
0
 def encode_vec_file(self, sp_vec_file):
     """Load the vector file ``sp_vec_file`` and encode its documents.

     Parameters:
         sp_vec_file: Path to a vector file readable by ``file_to_data``.

     Returns:
         (tuple): ``(encodings, labels)`` where ``encodings`` is the result of
         ``self.encode_data`` on the loaded matrix and labels, and ``labels``
         are the labels read from the file.
     """
     vocab_size = len(self.vocab)
     doc_matrix, doc_labels, _, _ = file_to_data(sp_vec_file, vocab_size)
     encodings = self.encode_data(doc_matrix, doc_labels)
     return encodings, doc_labels