Esempio n. 1
0
 def from_arguments(cls, c_args, val_each_epoch=True):
     """Constructor method to build BowVAETrainer from command-line arguments directly.

     Parameters:
         c_args (`argparse.Namespace`): Command-line arguments.
         val_each_epoch (bool): Flag for performing validation each epoch. optional (default = True)

     Returns:
         A trainer instance configured from ``c_args``.

     Raises:
         Exception: If the supplied vocab file or training vector file does not exist.
     """
     i_dt = datetime.datetime.now()
     # Timestamped run directory; microseconds included to avoid collisions
     # between runs started in the same second.
     log_out_dir = \
         os.path.join(c_args.save_dir,
                      "train_{}_{}_{}_{}_{}_{}_{}"
                      .format(i_dt.year, i_dt.month, i_dt.day, i_dt.hour,
                              i_dt.minute, i_dt.second, i_dt.microsecond))
     # exist_ok=True makes the previous `os.path.exists` pre-check redundant
     # and removes the check-then-create race.
     Path(log_out_dir).mkdir(parents=True, exist_ok=True)
     if not log_utils.CONFIGURED:
         logging_config(folder=log_out_dir,
                        name='tmnt',
                        level=c_args.log_level,
                        console_level=c_args.log_level)
     logging.info(c_args)
     seed_rng(c_args.seed)
     if c_args.vocab_file and c_args.tr_vec_file:
         vpath = Path(c_args.vocab_file)
         tpath = Path(c_args.tr_vec_file)
         if not (vpath.is_file() and tpath.is_file()):
             raise Exception(
                 "Vocab file {} and/or training vector file {} do not exist"
                 .format(c_args.vocab_file, c_args.tr_vec_file))
     logging.info(
         "Loading data via pre-computed vocabulary and sparse vector format document representation"
     )
     vocab = load_vocab(c_args.vocab_file, encoding=c_args.str_encoding)
     voc_size = len(vocab)
     # Only the labels and word frequencies are used here; the document
     # matrix itself is not needed by this constructor, so discard it.
     _, y, wd_freqs, _ = file_to_data(c_args.tr_vec_file, voc_size)
     model_out_dir = c_args.model_dir if c_args.model_dir else os.path.join(
         log_out_dir, 'MODEL')
     # Number of covariates = max label id + 1 (labels presumably 0-based
     # integers — TODO confirm against file_to_data's label encoding).
     n_covars = int(float(np.max(y)) + 1)
     # Race-free creation; parents=True also covers a user-supplied
     # model_dir whose parent directory does not yet exist.
     Path(model_out_dir).mkdir(parents=True, exist_ok=True)
     return cls(log_out_dir,
                model_out_dir,
                vocab,
                wd_freqs,
                c_args.tr_vec_file,
                c_args.val_vec_file,
                coherence_via_encoder=c_args.encoder_coherence,
                pretrained_param_file=c_args.pretrained_param_file,
                topic_seed_file=c_args.topic_seed_file,
                use_labels_as_covars=c_args.use_labels_as_covars,
                use_gpu=c_args.use_gpu,
                n_covars=n_covars,
                val_each_epoch=val_each_epoch)
Esempio n. 2
0
 def from_arguments(cls, args, config):
     """Alternate constructor: build the trainer from parsed command-line arguments.

     Creates a timestamped output directory under ``args.save_dir``, routes
     logging there, loads the bag-of-words vocabulary, and returns the
     constructed trainer.  ``config`` is accepted for API symmetry with the
     other ``from_arguments`` variants but is not read here.
     """
     now = datetime.datetime.now()
     stamp = (now.year, now.month, now.day, now.hour, now.minute, now.second)
     train_out_dir = '{}/train_{}_{}_{}_{}_{}_{}'.format(args.save_dir, *stamp)
     print("Set logging config to {}".format(train_out_dir))
     logging_config(folder=train_out_dir,
                    name='train_trans_vae',
                    level=args.log_level,
                    console_level=args.log_level,
                    no_console=False)
     logging.info(args)
     bow_vocab = load_vocab(args.bow_vocab_file)
     # Construct and hand back the trainer directly.
     return cls(train_out_dir,
                bow_vocab,
                args.tr_file,
                args.val_file,
                use_gpu=args.use_gpu,
                log_interval=args.log_interval)
Esempio n. 3
0
 def from_arguments(cls, args, config):
     """Alternate constructor: build the trainer from command-line arguments.

     Loads the BERT-format training data (and optional validation data),
     estimates word frequencies from a sample of the training matrix, and
     constructs the trainer with both the sequence and sparse views of the
     data.
     """
     now = datetime.datetime.now()
     train_out_dir = '{}/train_{}_{}_{}_{}_{}_{}'.format(
         args.save_dir, now.year, now.month, now.day, now.hour,
         now.minute, now.second)
     print("Set logging config to {}".format(train_out_dir))
     logging_config(folder=train_out_dir,
                    name='train_trans_vae',
                    level=logging.INFO,
                    no_console=False)
     logging.info(args)
     bow_vocab = load_vocab(args.bow_vocab_file)
     bow_size = len(bow_vocab)
     data_train, bert_base, vocab, data_csr = load_dataset_bert(
         args.tr_file,
         bow_size,
         max_len=config.sent_size,
         ctx=mx.cpu())
     # Default to "no validation data"; overwritten when a file is supplied.
     data_val, val_csr, val_wds = None, None, None
     if args.val_file:
         data_val, _, _, val_csr = load_dataset_bert(
             args.val_file,
             bow_size,
             max_len=config.sent_size,
             ctx=mx.cpu())
         val_wds = val_csr.sum().asscalar()
     # Word frequencies are estimated over (at most) the first 50k documents.
     sample = data_csr[:min(50000, data_csr.shape[0])]
     wd_freqs = mx.nd.sum(sample, axis=0)
     return cls(train_out_dir,
                bow_vocab,
                wd_freqs,
                val_wds, (data_train, data_csr), (data_val, val_csr),
                use_gpu=args.use_gpu,
                log_interval=args.log_interval)
Esempio n. 4
0
    # Collect one list of topic terms per line of in_file; each line is a
    # comma-separated list.  NOTE(review): the enclosing `def` is outside
    # this view — presumably get_top_k_terms_from_file, per the caller below.
    top_k_terms = []
    with io.open(in_file, 'r') as fp:
        for l in fp:
            # Split on commas and strip surrounding whitespace from each term.
            ts = [t.strip() for t in l.split(',')]
            top_k_terms.append(ts)
    return top_k_terms


# Silence MXNet's verbose warnings when operators fall back from sparse
# to dense storage.
os.environ["MXNET_STORAGE_FALLBACK_LOG_VERBOSE"] = "0"

if __name__ == "__main__":
    parser = setup_parser()
    args = parser.parse_args()

    verbose = False  ### XXX - add as argument
    vocab = load_vocab(args.vocab_file)
    # Evaluation-only shortcut: when a file of pre-selected top-k terms is
    # supplied, score those terms with NPMI against the test data and exit
    # without loading any trained model.
    if args.override_top_k_terms:
        top_k_words_per_topic = get_top_k_terms_from_file(
            args.override_top_k_terms)
        tst_csr, _, _, _ = file_to_data(args.test_file, len(vocab))
        # Map surface terms to vocabulary ids for the NPMI evaluator.
        top_k_words_per_topic_ids = [[vocab[t] for t in t_set]
                                     for t_set in top_k_words_per_topic]
        npmi_eval = EvaluateNPMI(top_k_words_per_topic_ids)
        test_npmi = npmi_eval.evaluate_csr_mat(tst_csr)
        print("**** Test NPMI = {} *******".format(test_npmi))
        exit(0)

    # Otherwise load a previously saved model for inference
    # (CPU unless a non-negative GPU id was requested).
    inference_model = BowVAEInferencer.from_saved(
        model_dir=args.model_dir,
        ctx=mx.cpu() if args.gpu < 0 else mx.gpu(args.gpu))
Esempio n. 5
0
 def from_arguments(cls, c_args, val_each_epoch=True):
     """Constructor method to build BowVAETrainer from command-line arguments directly.

     Parameters:
         c_args (`argparse.Namespace`): Command-line arguments.
         val_each_epoch (bool): Flag for performing validation each epoch. optional (default = True)

     Returns:
         A trainer instance configured from ``c_args``.

     Raises:
         Exception: If the supplied vocab file or training vector file does not exist.
     """
     i_dt = datetime.datetime.now()
     # Timestamped run directory; microseconds included to avoid collisions
     # between runs started in the same second.
     log_out_dir = \
         os.path.join(c_args.save_dir,
                      "train_{}_{}_{}_{}_{}_{}_{}"
                      .format(i_dt.year, i_dt.month, i_dt.day, i_dt.hour,
                              i_dt.minute, i_dt.second, i_dt.microsecond))
     # Map the textual log-level argument onto a logging constant,
     # defaulting to INFO for unrecognized values (replaces the previous
     # redundant if/elif chain).
     log_level = {
         'info': logging.INFO,
         'debug': logging.DEBUG,
         'error': logging.ERROR,
         'warning': logging.WARNING,
     }.get(c_args.log_level.lower(), logging.INFO)
     logging_config(folder=log_out_dir,
                    name='tmnt',
                    level=log_level,
                    console_level=log_level)
     logging.info(c_args)
     seed_rng(c_args.seed)
     if c_args.vocab_file and c_args.tr_vec_file:
         vpath = Path(c_args.vocab_file)
         tpath = Path(c_args.tr_vec_file)
         if not (vpath.is_file() and tpath.is_file()):
             raise Exception(
                 "Vocab file {} and/or training vector file {} do not exist"
                 .format(c_args.vocab_file, c_args.tr_vec_file))
     logging.info(
         "Loading data via pre-computed vocabulary and sparse vector format document representation"
     )
     vocab = load_vocab(c_args.vocab_file, encoding=c_args.str_encoding)
     voc_size = len(vocab)
     X, y, wd_freqs, _ = file_to_data(c_args.tr_vec_file, voc_size)
     # Validation data is optional; both branches fully define the
     # validation variables, so no separate pre-initialization is needed.
     if c_args.val_vec_file:
         val_X, val_y, _, total_test_wds = file_to_data(
             c_args.val_vec_file, voc_size)
     else:
         val_X, val_y, total_test_wds = None, None, 0
     model_out_dir = c_args.model_dir if c_args.model_dir else os.path.join(
         log_out_dir, 'MODEL')
     # Race-free creation replacing the exists()+os.mkdir pair; parents=True
     # also covers a user-supplied model_dir whose parent does not exist.
     # (An unused mx context object was previously created here; removed.)
     Path(model_out_dir).mkdir(parents=True, exist_ok=True)
     return cls(log_out_dir,
                model_out_dir,
                c_args,
                vocab,
                wd_freqs,
                X,
                val_X,
                total_test_wds,
                train_labels=y,
                test_labels=val_y,
                label_map=None,
                use_gpu=c_args.use_gpu,
                val_each_epoch=val_each_epoch)