Example #1
 def __init__(self,
              log_out_dir,
              model_out_dir,
              vocabulary,
              wd_freqs,
              train_data_path,
              test_data_path,
              coherence_via_encoder=False,
              pretrained_param_file=None,
              topic_seed_file=None,
              use_labels_as_covars=False,
              use_gpu=False,
              n_covars=None,
              val_each_epoch=True,
              rng_seed=1234):
     super().__init__(vocabulary, train_data_path, test_data_path,
                      val_each_epoch, rng_seed)
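      # Configure package-wide logging only once per process (guarded by log_utils.CONFIGURED)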
     if not log_utils.CONFIGURED:
         logging_config(folder=log_out_dir,
                        name='tmnt',
                        level='info',
                        console_level='info')
     self.log_out_dir = log_out_dir
     self.model_out_dir = model_out_dir
     self.use_gpu = use_gpu
     self.wd_freqs = wd_freqs
     self.seed_matrix = None
     self.pretrained_param_file = pretrained_param_file
     self.n_covars = n_covars
     self.use_labels_as_covars = use_labels_as_covars
     self.coherence_via_encoder = coherence_via_encoder
      if topic_seed_file:
          # `ctx` was undefined in this scope; derived here from the use_gpu flag (assumed fix)
          ctx = mx.gpu(0) if use_gpu else mx.cpu()
          self.seed_matrix = get_seed_matrix_from_file(
              topic_seed_file, vocabulary, ctx)
Example #2
def model_select_bow_vae(c_args):
    logging_config(folder=c_args.save_dir,
                   name='tmnt',
                   level=c_args.log_level,
                   console_level=c_args.log_level)
    ## dask config overrides
    dask.config.config['distributed']['worker']['use-file-locking'] = False
    dask.config.config['distributed']['comm']['timeouts']['connect'] = '90s'
    ##
    tmnt_config = TMNTConfigBOW(c_args.config_space).get_configspace()
    trainer = BowVAETrainer.from_arguments(
        c_args, val_each_epoch=(c_args.searcher != 'random'))
    selector = BaseSelector(tmnt_config,
                            iterations=c_args.iterations,
                            searcher=c_args.searcher,
                            scheduler=c_args.scheduler,
                            brackets=c_args.brackets,
                            cpus_per_task=c_args.cpus_per_task,
                            num_final_evals=c_args.num_final_evals,
                            rng_seed=c_args.seed,
                            log_dir=trainer.log_out_dir)
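    # Gather the distinct pre-trained embedding sources named in the config space, skipping 'random'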
    sources = [
        e['source'] for e in tmnt_config.get('embedding').data
        if e['source'] != 'random'
    ]
    logging.info(
        '>> Pre-caching pre-trained embeddings/vocabularies: {}'.format(
            sources))
    trainer.pre_cache_vocabularies(sources)
    selector.select_model(trainer)
Example #3
def get_worker(args, budget, id_str, ns_port):
    i_dt = datetime.datetime.now()
    train_out_dir = \
        os.path.join(args.save_dir,
                     "train_{}_{}_{}_{}_{}_{}_{}".format(i_dt.year,i_dt.month,i_dt.day,i_dt.hour,i_dt.minute,i_dt.second,i_dt.microsecond))
    logging_config(folder=train_out_dir, name='tmnt', level=logging.INFO)
    logging.info(args)
    seed_rng(args.seed)
    if args.vocab_file and args.tr_vec_file:
        vpath = Path(args.vocab_file)
        tpath = Path(args.tr_vec_file)
        if not (vpath.is_file() and tpath.is_file()):
            raise Exception(
                "Vocab file {} and/or training vector file {} do not exist".
                format(args.vocab_file, args.tr_vec_file))
    logging.info(
        "Loading data via pre-computed vocabulary and sparse vector format document representation"
    )
    vocab, tr_csr_mat, total_tr_words, tr_labels, label_map = \
        collect_sparse_data(args.tr_vec_file, args.vocab_file, scalar_labels=args.scalar_covars, encoding=args.str_encoding)
    if args.val_vec_file:
        tst_csr_mat, total_tst_words, tst_labels = \
            collect_sparse_test(args.val_vec_file, vocab, scalar_labels=args.scalar_covars, encoding=args.str_encoding)
    else:
        tst_csr_mat, total_tst_words, tst_labels = None, None, None
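    # Run on the CPU unless a non-negative GPU id was supplied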
    use_cpu = args.gpu is None or args.gpu == '' or int(args.gpu) < 0
    ctx = mx.cpu() if use_cpu else mx.gpu(int(args.gpu))
    model_out_dir = args.model_dir if args.model_dir else os.path.join(
        train_out_dir, 'MODEL')
    if not os.path.exists(model_out_dir):
        os.mkdir(model_out_dir)
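    # With labels as covariates: one-hot encode categorical labels, or expand scalar labels to a column vector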
    if args.use_labels_as_covars and tr_labels is not None:
        if label_map is not None:
            n_covars = len(label_map)
            tr_labels = mx.nd.one_hot(tr_labels, n_covars)
            tst_labels = mx.nd.one_hot(
                tst_labels, n_covars) if tst_labels is not None else None
        else:
            tr_labels = mx.nd.expand_dims(tr_labels, 1)
            tst_labels = mx.nd.expand_dims(
                tst_labels, 1) if tst_labels is not None else None
    worker = BowVAEWorker(model_out_dir,
                          args,
                          vocab,
                          tr_csr_mat,
                          total_tr_words,
                          tst_csr_mat,
                          total_tst_words,
                          tr_labels,
                          tst_labels,
                          label_map,
                          ctx=ctx,
                          max_budget=budget,
                          nameserver='127.0.0.1',
                          run_id=id_str,
                          nameserver_port=ns_port)
    return worker, train_out_dir
Example #4
def test_ar(args):
    i_dt = datetime.datetime.now()
    train_out_dir = '{}/train_{}_{}_{}_{}_{}_{}'.format(
        args.save_dir, i_dt.year, i_dt.month, i_dt.day, i_dt.hour, i_dt.minute,
        i_dt.second)
    print("Set logging config to {}".format(train_out_dir))
    logging_config(folder=train_out_dir,
                   name='train_trans_vae',
                   level=logging.INFO,
                   no_console=False)
    logging.info(args)
    context = (mx.cpu() if args.gpus is None or args.gpus == ''
               else mx.gpu(int(args.gpus)))
    emb = (nlp.embedding.create('glove', source=args.embedding_source)
           if args.embedding_source else None)
    data_train, vocab = load_dataset_basic(args.input_file,
                                           vocab=None,
                                           json_text_key=args.json_text_key,
                                           max_len=args.sent_size,
                                           max_vocab_size=args.max_vocab_size,
                                           ctx=context)
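    # Attach pre-trained vectors; re-initialize all-zero (out-of-vocabulary) rows with small random values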
    if emb:
        vocab.set_embedding(emb)
        _, emb_size = vocab.embedding.idx_to_vec.shape
        oov_items = 0
        for word in vocab.embedding._idx_to_token:
            if (vocab.embedding[word] == mx.nd.zeros(emb_size)
                ).sum() == emb_size:
                oov_items += 1
                vocab.embedding[word] = mx.nd.random.normal(0.0, 0.1, emb_size)
        logging.info("** There are {} out of vocab items **".format(oov_items))
    else:
        logging.info(
            "** No pre-trained embedding provided, learning embedding weights from scratch **"
        )
    # Assumes a pre-trained embedding was attached above; vocab.embedding is None otherwise
    emb_dim = len(vocab.embedding.idx_to_vec[0])
    model = ARTransformerVAE(vocab,
                             emb_dim,
                             args.latent_dist,
                             num_units=args.num_units,
                             hidden_size=args.hidden_size,
                             num_heads=args.num_heads,
                             n_latent=args.latent_dim,
                             max_sent_len=args.sent_size,
                             transformer_layers=args.transformer_layers,
                             kappa=args.kappa,
                             batch_size=args.batch_size,
                             kld=args.kld_wt,
                             ctx=context)
    model.latent_dist.initialize(init=mx.init.Xavier(magnitude=2.34),
                                 ctx=context)
    model.encoder.initialize(init=mx.init.Xavier(magnitude=2.34), ctx=context)
    #model.decoder.initialize(init=mx.init.Xavier(magnitude=2.34), ctx=context)
    pad_id = vocab[vocab.padding_token]
Example #5
 def from_arguments(cls, c_args, val_each_epoch=True):
     """Constructor method to build BowVAETrainer from command-line arguments directly.
     
     Parameters:
         c_args (`argparse.Namespace`): Command-line arguments.
         val_each_epoch (bool): Flag for performing validation each epoch. optional (default = True)
     """
     i_dt = datetime.datetime.now()
     log_out_dir = \
         os.path.join(c_args.save_dir,
                      "train_{}_{}_{}_{}_{}_{}_{}"
                      .format(i_dt.year,i_dt.month,i_dt.day,i_dt.hour,i_dt.minute,i_dt.second,i_dt.microsecond))
     if not os.path.exists(log_out_dir):
         lpath = Path(log_out_dir)
         lpath.mkdir(parents=True, exist_ok=True)
     if not log_utils.CONFIGURED:
         logging_config(folder=log_out_dir,
                        name='tmnt',
                        level=c_args.log_level,
                        console_level=c_args.log_level)
     logging.info(c_args)
     seed_rng(c_args.seed)
     if c_args.vocab_file and c_args.tr_vec_file:
         vpath = Path(c_args.vocab_file)
         tpath = Path(c_args.tr_vec_file)
         if not (vpath.is_file() and tpath.is_file()):
             raise Exception(
                 "Vocab file {} and/or training vector file {} do not exist"
                 .format(c_args.vocab_file, c_args.tr_vec_file))
     logging.info(
         "Loading data via pre-computed vocabulary and sparse vector format document representation"
     )
     vocab = load_vocab(c_args.vocab_file, encoding=c_args.str_encoding)
     voc_size = len(vocab)
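      # Load the training set as a sparse document-term matrix with labels and corpus term frequencies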
     X, y, wd_freqs, _ = file_to_data(c_args.tr_vec_file, voc_size)
     model_out_dir = c_args.model_dir if c_args.model_dir else os.path.join(
         log_out_dir, 'MODEL')
     n_covars = int(float(np.max(y)) + 1)
     if not os.path.exists(model_out_dir):
         os.mkdir(model_out_dir)
     return cls(log_out_dir,
                model_out_dir,
                vocab,
                wd_freqs,
                c_args.tr_vec_file,
                c_args.val_vec_file,
                coherence_via_encoder=c_args.encoder_coherence,
                pretrained_param_file=c_args.pretrained_param_file,
                topic_seed_file=c_args.topic_seed_file,
                use_labels_as_covars=c_args.use_labels_as_covars,
                use_gpu=c_args.use_gpu,
                n_covars=n_covars,
                val_each_epoch=val_each_epoch)
Example #6
def train(args):
    i_dt = datetime.datetime.now()
    exp_folder = '{}/exp_{}_{}_{}_{}_{}_{}'.format(args.logdir, i_dt.year,
                                                   i_dt.month, i_dt.day,
                                                   i_dt.hour, i_dt.minute,
                                                   i_dt.second)
    logging_config(exp_folder, name="Embeddings", level=logging.INFO)
    logging.info(args)
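    # Seed the Python, MXNet, and NumPy RNGs so runs are reproducible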
    random.seed(args.seed)
    mx.random.seed(args.seed)
    np.random.seed(args.seed)
    train_embeddings(args, exp_folder)
Example #7
def model_select_seq_bow(c_args):
    logging_config(folder=c_args.save_dir,
                   name='tmnt',
                   level=c_args.log_level,
                   console_level=c_args.log_level)
    tmnt_config = TMNTConfigSeqBOW(c_args.config_space).get_configspace()
    trainer = SeqBowVEDTrainer.from_arguments(c_args)
    selector = BaseSelector(tmnt_config, c_args.iterations, c_args.searcher,
                            c_args.scheduler, c_args.brackets,
                            c_args.cpus_per_task, c_args.use_gpu,
                            c_args.num_final_evals, c_args.seed,
                            trainer.model_out_dir)
    selector.select_model(trainer)
Example #8
 def from_arguments(cls, args, config):
     i_dt = datetime.datetime.now()
     train_out_dir = '{}/train_{}_{}_{}_{}_{}_{}'.format(
         args.save_dir, i_dt.year, i_dt.month, i_dt.day, i_dt.hour,
         i_dt.minute, i_dt.second)
     print("Set logging config to {}".format(train_out_dir))
     logging_config(folder=train_out_dir,
                    name='train_trans_vae',
                    level=args.log_level,
                    console_level=args.log_level,
                    no_console=False)
     logging.info(args)
     trainer = cls(train_out_dir,
                   args.tr_file,
                   args.val_file,
                   aux_data_path=args.aux_file,
                   use_gpu=args.use_gpu,
                   log_interval=args.log_interval)
     return trainer
Example #9
 def from_arguments(cls, args, config):
     i_dt = datetime.datetime.now()
     train_out_dir = '{}/train_{}_{}_{}_{}_{}_{}'.format(
         args.save_dir, i_dt.year, i_dt.month, i_dt.day, i_dt.hour,
         i_dt.minute, i_dt.second)
     print("Set logging config to {}".format(train_out_dir))
     logging_config(folder=train_out_dir,
                    name='train_trans_vae',
                    level=logging.INFO,
                    no_console=False)
     logging.info(args)
     bow_vocab = load_vocab(args.bow_vocab_file)
     data_train, bert_base, vocab, data_csr = load_dataset_bert(
         args.tr_file,
         len(bow_vocab),
         max_len=config.sent_size,
         ctx=mx.cpu())
     if args.val_file:
         data_val, _, _, val_csr = load_dataset_bert(
             args.val_file,
             len(bow_vocab),
             max_len=config.sent_size,
             ctx=mx.cpu())
         val_wds = val_csr.sum().asscalar()
     else:
         data_val, val_csr, val_wds = None, None, None
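      # Estimate term frequencies from at most the first 50,000 training documents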
     sample_size = min(50000, data_csr.shape[0])
     data = data_csr[:sample_size]
     wd_freqs = mx.nd.sum(data, axis=0)
      trainer = cls(train_out_dir,
                    bow_vocab,
                    wd_freqs,
                    val_wds,
                    (data_train, data_csr),
                    (data_val, val_csr),
                    use_gpu=args.use_gpu,
                    log_interval=args.log_interval)
     return trainer
Example #10
def train_main(args):
    i_dt = datetime.datetime.now()
    train_out_dir = '{}/train_{}_{}_{}_{}_{}_{}'.format(
        args.save_dir, i_dt.year, i_dt.month, i_dt.day, i_dt.hour, i_dt.minute,
        i_dt.second)
    print("Set logging config to {}".format(train_out_dir))
    logging_config(folder=train_out_dir,
                   name='train_trans_vae',
                   level=logging.INFO,
                   no_console=False)
    logging.info(args)
    context = (mx.cpu() if args.gpus is None or args.gpus == ''
               else mx.gpu(int(args.gpus)))
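    # Two encoder paths: a fine-tuned BERT encoder, or a basic embedding encoder trained with the model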
    if args.use_bert:
        data_train, bert_base, vocab = load_dataset_bert(
            args.input_file, max_len=args.sent_size, ctx=context)
        model = get_bert_model(args, bert_base, context)
        pad_id = vocab[vocab.padding_token]
        report_fn = get_report_reconstruct_data_fn(vocab, pad_id=pad_id)
        train_trans_vae(args,
                        model,
                        data_train,
                        data_test=None,
                        ctx=context,
                        report_fn=report_fn,
                        use_bert=True)
    else:
        emb = (nlp.embedding.create('glove', source=args.embedding_source)
               if args.embedding_source else None)
        data_train, vocab = load_dataset_basic(
            args.input_file,
            vocab=None,
            json_text_key=args.json_text_key,
            max_len=args.sent_size,
            max_vocab_size=args.max_vocab_size,
            ctx=context)
        if emb:
            vocab.set_embedding(emb)
            _, emb_size = vocab.embedding.idx_to_vec.shape
            oov_items = 0
            for word in vocab.embedding._idx_to_token:
                if (vocab.embedding[word] == mx.nd.zeros(emb_size)
                    ).sum() == emb_size:
                    oov_items += 1
                    vocab.embedding[word] = mx.nd.random.normal(
                        0.0, 0.1, emb_size)
            logging.info(
                "** There are {} out of vocab items **".format(oov_items))
        else:
            logging.info(
                "** No pre-trained embedding provided, learning embedding weights from scratch **"
            )
        model = get_basic_model(args, vocab, context)
        pad_id = vocab[vocab.padding_token]
        report_fn = get_report_reconstruct_data_fn(vocab, pad_id=pad_id)
        train_trans_vae(args,
                        model,
                        data_train,
                        data_test=None,
                        ctx=context,
                        report_fn=report_fn,
                        use_bert=False)
Example #11
parser.add_argument('--label_prefix_chars',
                    type=int,
                    help='Use first N characters of label',
                    default=-1)
parser.add_argument('--str_encoding',
                    type=str,
                    help='String/file encoding to use',
                    default='utf-8')
parser.add_argument('--log_dir',
                    type=str,
                    help='Logging directory',
                    default='.')

args = parser.parse_args()

if __name__ == '__main__':
    logging_config(folder=args.log_dir, name='vectorizer', level=logging.INFO)
    if args.vocab_file is None:
        raise Exception("Vocabulary output file name/path must be provided")
    if args.txt_mode:
        vectorizer = TextVectorizer(min_doc_size=args.min_doc_length,
                                    encoding=args.str_encoding,
                                    custom_stop_word_file=args.custom_stop_words)
    else:
        vectorizer = JsonVectorizer(text_key=args.json_text_key,
                                    custom_stop_word_file=args.custom_stop_words,
                                    label_key=args.json_label_key,
                                    min_doc_size=args.min_doc_length,
                                    label_prefix=args.label_prefix_chars,
                                    json_out_dir=args.json_out_dir,
                                    encoding=args.str_encoding)
    vocab = vectorizer.get_sparse_vecs(
        args.tr_vec_file,
        args.vocab_file,
        args.tr_input_dir,
        args.vocab_size,
        full_histogram_file=args.full_vocab_histogram)
Example #12
parser.add_argument('--label_prefix_chars',
                    type=int,
                    help='Use first N characters of label',
                    default=-1)
parser.add_argument('--str_encoding',
                    type=str,
                    help='String/file encoding to use',
                    default='utf-8')
parser.add_argument('--log_dir',
                    type=str,
                    help='Logging directory',
                    default='.')

args = parser.parse_args()

if __name__ == '__main__':
    logging_config(folder=args.log_dir, name='vectorizer', level='info')
    if args.vocab_file is None:
        raise Exception("Vocabulary output file name/path must be provided")
    vectorizer = TMNTVectorizer(text_key=args.json_text_key,
                                custom_stop_word_file=args.custom_stop_words,
                                label_key=args.json_label_key,
                                min_doc_size=args.min_doc_length,
                                label_prefix=args.label_prefix_chars,
                                json_out_dir=args.json_out_dir,
                                vocab_size=args.vocab_size,
                                encoding=args.str_encoding)
    vectorizer.fit_transform_in_place_json(args.tr_input_file)
    vectorizer.write_vocab(args.vocab_file)
    if args.val_input_file:
        vectorizer.transform_in_place_json(args.val_input_file)
    if args.tst_input_file:
        vectorizer.transform_in_place_json(args.tst_input_file)
Example #13
    all_labels = []
    for i, (data, label, mask) in enumerate(dataloader):
        out = model(data, mask)
        predictions = mx.nd.argmax(out, axis=1).astype('int32')  # computed but unused; probabilities are compared directly below
        for j in range(out.shape[0]):
            probs = mx.nd.softmax(out[j])
            lab = int(label[j].asscalar())
            all_scores.append(probs[1].asscalar())
            all_labels.append(lab)
            if probs[1] > probs[0] and lab == 1:
                total_correct += 1
            elif probs[1] < probs[0] and lab == 0:
                total_correct += 1
            total += 1
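    # Accuracy over all examples, plus average precision from the positive-class scores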
    acc = total_correct / float(total)
    ap = average_precision_score(all_labels, all_scores)
    return ap, acc


if __name__ == '__main__':
    args = get_args()
    logging_config(args.log_dir,
                   'train',
                   level=logging.INFO,
                   console_level=logging.INFO)
    train_dataset, val_dataset, test_dataset, transform = \
        load_sparse_dataset(args.train_file, args.val_file, args.test_file, voc_size=args.voc_size, max_length=args.max_length)
    ctx = mx.cpu()
    train_classifier(args.voc_size, args.embedding_dim, transform,
                     train_dataset, val_dataset, test_dataset, ctx)
Example #14
 def from_arguments(cls, c_args, val_each_epoch=True):
     """Constructor method to build BowVAETrainer from command-line arguments directly.
     
     Parameters:
         c_args (`argparse.Namespace`): Command-line arguments.
         val_each_epoch (bool): Flag for performing validation each epoch. optional (default = True)
     """
     i_dt = datetime.datetime.now()
     log_out_dir = \
         os.path.join(c_args.save_dir,
                      "train_{}_{}_{}_{}_{}_{}_{}"
                      .format(i_dt.year,i_dt.month,i_dt.day,i_dt.hour,i_dt.minute,i_dt.second,i_dt.microsecond))
      # Map the textual log level to its logging constant (defaults to INFO)
      level_map = {'info': logging.INFO, 'debug': logging.DEBUG,
                   'error': logging.ERROR, 'warning': logging.WARNING}
      log_level = level_map.get(c_args.log_level.lower(), logging.INFO)
     logging_config(folder=log_out_dir,
                    name='tmnt',
                    level=log_level,
                    console_level=log_level)
     logging.info(c_args)
     seed_rng(c_args.seed)
     if c_args.vocab_file and c_args.tr_vec_file:
         vpath = Path(c_args.vocab_file)
         tpath = Path(c_args.tr_vec_file)
         if not (vpath.is_file() and tpath.is_file()):
             raise Exception(
                 "Vocab file {} and/or training vector file {} do not exist"
                 .format(c_args.vocab_file, c_args.tr_vec_file))
     logging.info(
         "Loading data via pre-computed vocabulary and sparse vector format document representation"
     )
     vocab = load_vocab(c_args.vocab_file, encoding=c_args.str_encoding)
     voc_size = len(vocab)
     X, y, wd_freqs, _ = file_to_data(c_args.tr_vec_file, voc_size)
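      # Load an optional validation split in the same sparse vector format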
      if c_args.val_vec_file:
          val_X, val_y, _, total_test_wds = file_to_data(
              c_args.val_vec_file, voc_size)
      else:
          val_X, val_y, total_test_wds = None, None, 0
     ctx = mx.cpu() if not c_args.use_gpu else mx.gpu(0)
     model_out_dir = c_args.model_dir if c_args.model_dir else os.path.join(
         log_out_dir, 'MODEL')
     if not os.path.exists(model_out_dir):
         os.mkdir(model_out_dir)
     return cls(log_out_dir,
                model_out_dir,
                c_args,
                vocab,
                wd_freqs,
                X,
                val_X,
                total_test_wds,
                train_labels=y,
                test_labels=val_y,
                label_map=None,
                use_gpu=c_args.use_gpu,
                val_each_epoch=val_each_epoch)