Beispiel #1
0
def get_worker(args, budget, id_str, ns_port):
    """Build a `BowVAEWorker` from parsed command-line arguments.

    Derives a time-stamped training output directory under ``args.save_dir``,
    configures logging to it, seeds the RNG, loads the sparse training data
    (and validation data when ``args.val_vec_file`` is set) and constructs
    the worker.

    Parameters
    ----------
    args: argument namespace; attributes read here are ``save_dir``, ``seed``,
        ``vocab_file``, ``tr_vec_file``, ``val_vec_file``, ``scalar_covars``,
        ``str_encoding``, ``gpu``, ``model_dir`` and ``use_labels_as_covars``
    budget: forwarded to the worker as ``max_budget``
    id_str: forwarded to the worker as ``run_id``
    ns_port: forwarded to the worker as ``nameserver_port``

    Returns
    -------
    (worker, train_out_dir): the constructed worker and the training output directory path

    Raises
    ------
    Exception: if both the vocabulary and training vector paths are given
        and at least one of them is not an existing file
    """
    i_dt = datetime.datetime.now()
    # Directory name carries the (non zero-padded) timestamp down to microseconds.
    stamp = "train_{}_{}_{}_{}_{}_{}_{}".format(
        i_dt.year, i_dt.month, i_dt.day, i_dt.hour, i_dt.minute, i_dt.second,
        i_dt.microsecond)
    train_out_dir = os.path.join(args.save_dir, stamp)
    logging_config(folder=train_out_dir, name='tmnt', level=logging.INFO)
    logging.info(args)
    seed_rng(args.seed)
    if args.vocab_file and args.tr_vec_file:
        vpath = Path(args.vocab_file)
        tpath = Path(args.tr_vec_file)
        if not (vpath.is_file() and tpath.is_file()):
            raise Exception(
                "Vocab file {} and/or training vector file {} do not exist".
                format(args.vocab_file, args.tr_vec_file))
    logging.info(
        "Loading data via pre-computed vocabulary and sparse vector format document representation"
    )
    vocab, tr_csr_mat, total_tr_words, tr_labels, label_map = \
        collect_sparse_data(args.tr_vec_file, args.vocab_file, scalar_labels=args.scalar_covars, encoding=args.str_encoding)
    if args.val_vec_file:
        tst_csr_mat, total_tst_words, tst_labels = \
            collect_sparse_test(args.val_vec_file, vocab, scalar_labels=args.scalar_covars, encoding=args.str_encoding)
    else:
        tst_csr_mat, total_tst_words, tst_labels = None, None, None
    # An unset, empty or negative GPU id selects the CPU context.
    use_cpu = args.gpu is None or args.gpu == '' or int(args.gpu) < 0
    ctx = mx.cpu() if use_cpu else mx.gpu(int(args.gpu))
    model_out_dir = args.model_dir if args.model_dir else os.path.join(
        train_out_dir, 'MODEL')
    # makedirs creates missing parent directories and does not fail when the
    # directory already exists (os.mkdir did neither).
    os.makedirs(model_out_dir, exist_ok=True)
    if args.use_labels_as_covars and tr_labels is not None:
        if label_map is not None:
            # A label map is present: one-hot encode over its size.
            n_covars = len(label_map)
            tr_labels = mx.nd.one_hot(tr_labels, n_covars)
            tst_labels = mx.nd.one_hot(
                tst_labels, n_covars) if tst_labels is not None else None
        else:
            # No label map: add an axis at position 1 to the label arrays.
            tr_labels = mx.nd.expand_dims(tr_labels, 1)
            tst_labels = mx.nd.expand_dims(
                tst_labels, 1) if tst_labels is not None else None
    worker = BowVAEWorker(model_out_dir,
                          args,
                          vocab,
                          tr_csr_mat,
                          total_tr_words,
                          tst_csr_mat,
                          total_tst_words,
                          tr_labels,
                          tst_labels,
                          label_map,
                          ctx=ctx,
                          max_budget=budget,
                          nameserver='127.0.0.1',
                          run_id=id_str,
                          nameserver_port=ns_port)
    return worker, train_out_dir
Beispiel #2
0
 def retrain_best_config(self, config, budget, rng_seed, ntimes=1):
     """Train a model as per the provided `Configuration` and `budget` and write to file.

     When ``self.c_args.val_vec_file`` is set, the model is trained `ntimes`
     times (seeding the RNG with ``rng_seed + i`` on run ``i``), the
     lowest-loss model is kept and NPMI/perplexity/redundancy are logged.
     Otherwise a single model is trained without any reporting. In both
     cases the resulting model is handed to `write_model`.

     Parameters
     ----------
     config: `Configuration` to use to train/evaluate the model
     budget: int - number of iterations to train
     rng_seed: base RNG seed; run ``i`` is seeded with ``rng_seed + i``
     ntimes: number of training runs when validation data is configured;
         must be at least 1 in that case because the single-run report
         reads the first collected result
     """
     best_loss = float('inf')
     best_model = None
     npmis, perplexities, redundancies = [], [], []
     if self.c_args.tst_vec_file:
         self.set_heldout_data_as_test()
     if self.c_args.val_vec_file:
         for i in range(ntimes):
             seed_rng(rng_seed + i)
             model, results = self._train_model(config, budget)
             loss = results['loss']
             info = results['info']
             npmis.append(info['test_npmi'])
             perplexities.append(info['test_perplexity'])
             redundancies.append(info['redundancy'])
             # Always keep the first model so that a very large (or NaN)
             # loss can never leave `best_model` unset.
             if best_model is None or loss < best_loss:
                 best_loss = loss
                 best_model = model
         logging.info("******************************************")
         test_type = "HELDOUT" if self.c_args.tst_vec_file else "VALIDATION"
         if ntimes > 1:
             multi_run_report = (
                 ("Final {} NPMI       ==> Mean: {}, StdDev: {}", npmis),
                 ("Final {} Perplexity ==> Mean: {}, StdDev: {}", perplexities),
                 ("Final {} Redundancy ==> Mean: {}, StdDev: {}", redundancies),
             )
             for template, values in multi_run_report:
                 logging.info(
                     template.format(test_type, statistics.mean(values),
                                     statistics.stdev(values)))
         else:
             single_run_report = (
                 ("Final {} NPMI       ==> {}", npmis),
                 ("Final {} Perplexity ==> {}", perplexities),
                 ("Final {} Redundancy ==> {}", redundancies),
             )
             for template, values in single_run_report:
                 logging.info(template.format(test_type, values[0]))
     else:
         ## in this case, no validation test data supplied
         best_model, _ = self._train_model(config, budget)
     write_model(best_model, self.model_out_dir, config, budget,
                 self.c_args)
Beispiel #3
0
 def from_arguments(cls, c_args, val_each_epoch=True):
     """Constructor method to build BowVAETrainer from command-line arguments directly.

     Creates a time-stamped log directory under ``c_args.save_dir``,
     configures logging (unless `log_utils.CONFIGURED` is already set),
     seeds the RNG, loads the vocabulary and training vectors, and makes
     sure the model output directory exists before instantiating `cls`.

     Parameters:
         c_args (`argparse.Namespace`): Command-line arguments.
         val_each_epoch (bool): Flag for performing validation each epoch. optional (default = True)

     Returns:
         Instance of `cls` built from the loaded vocabulary, word frequencies and file paths.

     Raises:
         Exception: if both the vocabulary and training vector paths are given
             and at least one of them is not an existing file.
     """
     i_dt = datetime.datetime.now()
     log_out_dir = \
         os.path.join(c_args.save_dir,
                      "train_{}_{}_{}_{}_{}_{}_{}"
                      .format(i_dt.year,i_dt.month,i_dt.day,i_dt.hour,i_dt.minute,i_dt.second,i_dt.microsecond))
     # exist_ok makes a separate existence check unnecessary.
     Path(log_out_dir).mkdir(parents=True, exist_ok=True)
     if not log_utils.CONFIGURED:
         logging_config(folder=log_out_dir,
                        name='tmnt',
                        level=c_args.log_level,
                        console_level=c_args.log_level)
     logging.info(c_args)
     seed_rng(c_args.seed)
     if c_args.vocab_file and c_args.tr_vec_file:
         vpath = Path(c_args.vocab_file)
         tpath = Path(c_args.tr_vec_file)
         if not (vpath.is_file() and tpath.is_file()):
             raise Exception(
                 "Vocab file {} and/or training vector file {} do not exist"
                 .format(c_args.vocab_file, c_args.tr_vec_file))
     logging.info(
         "Loading data via pre-computed vocabulary and sparse vector format document representation"
     )
     vocab = load_vocab(c_args.vocab_file, encoding=c_args.str_encoding)
     voc_size = len(vocab)
     # Only the labels and word frequencies are needed here; the other two
     # returned values are discarded.
     _, y, wd_freqs, _ = file_to_data(c_args.tr_vec_file, voc_size)
     model_out_dir = c_args.model_dir if c_args.model_dir else os.path.join(
         log_out_dir, 'MODEL')
     # Largest label value plus one; presumably labels are 0-based integer
     # class ids - TODO confirm against file_to_data.
     n_covars = int(float(np.max(y)) + 1)
     # makedirs creates missing parent directories and does not fail when the
     # directory already exists (os.mkdir did neither).
     os.makedirs(model_out_dir, exist_ok=True)
     return cls(log_out_dir,
                model_out_dir,
                vocab,
                wd_freqs,
                c_args.tr_vec_file,
                c_args.val_vec_file,
                coherence_via_encoder=c_args.encoder_coherence,
                pretrained_param_file=c_args.pretrained_param_file,
                topic_seed_file=c_args.topic_seed_file,
                use_labels_as_covars=c_args.use_labels_as_covars,
                use_gpu=c_args.use_gpu,
                n_covars=n_covars,
                val_each_epoch=val_each_epoch)
Beispiel #4
0
    def train_with_single_config(self, config, num_evals):
        """Fit one configuration repeatedly and report the objective.

        When `self.test_data_path` is set, the model described by `config` is
        trained `num_evals` times; the RNG is re-seeded before each run with
        consecutive seeds starting at `self.rng_seed`. NPMI, perplexity,
        redundancy and objective of every run are collected and logged
        (mean/stdev for several runs, raw values for a single run). Without
        test data a single model is trained and nothing is summarized.

        Args:
            config (dict): Configuration instance with hyperparameter values for model definition.
            num_evals (int): Number of model fits and evaluations to perform (with random initialization)

        Returns:
            (tuple): Tuple containing:
                - model (:class:`tmnt.modeling.BowVAEModel`): VAE Model instance with trained/fit parameters.
                - obj (float): objective value of the objective function with the best model.
        """
        next_seed = self.rng_seed
        if self.test_data_path is None:
            # No test data: one plain fit, no metric summary.
            lone_model, lone_obj, _ = self.train_model(config, FakeReporter())
            return lone_model, lone_obj

        logging.info("Training with config: {}".format(config))
        npmi_vals = []
        ppl_vals = []
        redundancy_vals = []
        obj_vals = []
        top_obj = -1000000000.0
        top_model = None
        runs = int(num_evals)
        for _ in range(runs):
            seed_rng(next_seed)  # fresh seed for every run
            next_seed += 1
            fitted, run_obj, val_results = self.train_model(config, FakeReporter())
            npmi_vals.append(val_results['npmi'])
            ppl_vals.append(val_results['ppl'])
            redundancy_vals.append(val_results['redundancy'])
            obj_vals.append(run_obj)
            # Higher objective is better; keep the best model seen so far.
            if run_obj > top_obj:
                top_obj, top_model = run_obj, fitted

        # The summary label is fixed to validation data in this method.
        test_type = "VALIDATION"
        if runs > 1:
            aggregate_lines = (
                ("Final {} NPMI         ==> Mean: {}, StdDev: {}", npmi_vals),
                ("Final {} Perplexity   ==> Mean: {}, StdDev: {}", ppl_vals),
                ("Final {} Redundancy   ==> Mean: {}, StdDev: {}", redundancy_vals),
                ("Final {} Objective    ==> Mean: {}, StdDev: {}", obj_vals),
            )
            for template, series in aggregate_lines:
                logging.info(
                    template.format(test_type, statistics.mean(series),
                                    statistics.stdev(series)))
        else:
            single_lines = (
                ("Final {} NPMI           ==> {}", npmi_vals),
                ("Final {} Perplexity     ==> {}", ppl_vals),
                ("Final {} Redundancy     ==> {}", redundancy_vals),
                ("Final {} Objective      ==> {}", obj_vals),
            )
            for template, series in single_lines:
                logging.info(template.format(test_type, series[0]))
        return top_model, top_obj
Beispiel #5
0
 def from_arguments(cls, c_args, val_each_epoch=True):
     """Constructor method to build BowVAETrainer from command-line arguments directly.

     Derives a time-stamped log directory path under ``c_args.save_dir``,
     configures logging at the level named by ``c_args.log_level``, seeds
     the RNG, loads the vocabulary plus training (and optional validation)
     data, and makes sure the model output directory exists before
     instantiating `cls`.

     Parameters:
         c_args (`argparse.Namespace`): Command-line arguments.
         val_each_epoch (bool): Flag for performing validation each epoch. optional (default = True)

     Returns:
         Instance of `cls` built from the loaded vocabulary and data.

     Raises:
         Exception: if both the vocabulary and training vector paths are given
             and at least one of them is not an existing file.
     """
     i_dt = datetime.datetime.now()
     log_out_dir = \
         os.path.join(c_args.save_dir,
                      "train_{}_{}_{}_{}_{}_{}_{}"
                      .format(i_dt.year,i_dt.month,i_dt.day,i_dt.hour,i_dt.minute,i_dt.second,i_dt.microsecond))
     # Map the textual level (case-insensitive) to the logging constant;
     # unknown or unset names fall back to INFO.
     level_by_name = {
         'info': logging.INFO,
         'debug': logging.DEBUG,
         'error': logging.ERROR,
         'warning': logging.WARNING,
     }
     log_level = level_by_name.get((c_args.log_level or 'info').lower(),
                                   logging.INFO)
     logging_config(folder=log_out_dir,
                    name='tmnt',
                    level=log_level,
                    console_level=log_level)
     logging.info(c_args)
     seed_rng(c_args.seed)
     if c_args.vocab_file and c_args.tr_vec_file:
         vpath = Path(c_args.vocab_file)
         tpath = Path(c_args.tr_vec_file)
         if not (vpath.is_file() and tpath.is_file()):
             raise Exception(
                 "Vocab file {} and/or training vector file {} do not exist"
                 .format(c_args.vocab_file, c_args.tr_vec_file))
     logging.info(
         "Loading data via pre-computed vocabulary and sparse vector format document representation"
     )
     vocab = load_vocab(c_args.vocab_file, encoding=c_args.str_encoding)
     voc_size = len(vocab)
     X, y, wd_freqs, _ = file_to_data(c_args.tr_vec_file, voc_size)
     if c_args.val_vec_file:
         val_X, val_y, _, total_test_wds = file_to_data(
             c_args.val_vec_file, voc_size)
     else:
         # No validation file: pass empty placeholders to the constructor.
         val_X, val_y, total_test_wds = None, None, 0
     model_out_dir = c_args.model_dir if c_args.model_dir else os.path.join(
         log_out_dir, 'MODEL')
     # makedirs creates missing parent directories and does not fail when the
     # directory already exists (os.mkdir did neither).
     os.makedirs(model_out_dir, exist_ok=True)
     return cls(log_out_dir,
                model_out_dir,
                c_args,
                vocab,
                wd_freqs,
                X,
                val_X,
                total_test_wds,
                train_labels=y,
                test_labels=val_y,
                label_map=None,
                use_gpu=c_args.use_gpu,
                val_each_epoch=val_each_epoch)