Example 1
File: gam.py Project: ye-man/GAM
 def fit(self):
     """
     Fitting a model on the training dataset.
     """
     print("\nTraining started.\n")
     self.model.train()
     self.optimizer = torch.optim.Adam(self.model.parameters(),
                                       lr=self.args.learning_rate,
                                       weight_decay=self.args.weight_decay)
     self.optimizer.zero_grad()
     epoch_range = trange(self.args.epochs, desc="Epoch: ", leave=True)
     for epoch in epoch_range:
         random.shuffle(self.training_graphs)
         batches = create_batches(self.training_graphs,
                                  self.args.batch_size)
         self.epoch_loss = 0
         self.nodes_processed = 0
         batch_range = trange(len(batches))
         for batch in batch_range:
             self.epoch_loss = self.epoch_loss + self.process_batch(
                 batches[batch])
             self.nodes_processed = self.nodes_processed + len(
                 batches[batch])
             loss_score = round(self.epoch_loss / self.nodes_processed, 4)
             batch_range.set_description("(Loss=%g)" % loss_score)
         self.update_log()
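In this example, create_batches(self.training_graphs, self.args.batch_size) only needs to split a list of graphs into fixed-size chunks. A minimal sketch of such a helper, offered as an assumption rather than the project's actual implementation:

def create_batches(items, batch_size):
    """Split a list into consecutive chunks of at most batch_size elements."""
    return [items[i:i + batch_size] for i in range(0, len(items), batch_size)]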
Example 2
def process_seed_pages(pages_db,
                       redirects_lookup,
                       seed_pages,
                       depth=1,
                       limit=10000):
    '''Get the mentions in each of the seed pages as well as the pages
they link to. Set `depth` > 1 to also process the pages that those
pages link to'''
    processed_pages = _process_pages(redirects_lookup,
                                     seed_pages,
                                     is_seed_page=True,
                                     limit=limit)
    latest_processed_pages = processed_pages
    visited_page_titles = set([
        processed_page['document_info']['title']
        for processed_page in processed_pages
    ])
    for layer in range(depth):
        print("Getting referenced pages")
        pages_referenced = get_outlinks(latest_processed_pages)
        page_titles_to_fetch = pages_referenced - visited_page_titles
        batch_size = 1000
        print("Fetching and processing", len(page_titles_to_fetch), "pages in",
              batch_size, "batches")
        for batch_num, titles_batch in progressbar(
                enumerate(
                    u.create_batches(list(page_titles_to_fetch),
                                     batch_size=batch_size)),
                max_value=int(len(page_titles_to_fetch) / batch_size)):
            batch_pages_to_process = _fetch_pages(pages_db, titles_batch)
            latest_processed_pages = _process_pages(redirects_lookup,
                                                    batch_pages_to_process)
            processed_pages += latest_processed_pages
        visited_page_titles = visited_page_titles.union(pages_referenced)
    return processed_pages
Example 3
def main():
    batch_size = 10
    ext_emb_path = config.ext_emb_path
    input_x, input_y = loader.prepare_input(config.datadir + config.train)
    emb_layer = pretrain.Embedding(ext_emb_path)
    seqlen, input_x = utils.convert_to_id(input_x, emb_layer.word_to_id)
    input_y, tag_to_id = utils.convert_tag_to_id(input_y)
    seqlen, inp = utils.create_batches(input_x, input_y, seqlen, batch_size)

    sess = tf.Session()
    graph = loader.reload_smodel(sess)
    num_labels = len(tag_to_id)
    source_lstm = SourceLSTM()
    target_lstm = TargetLSTM()
    ff_layer = pretrain.FeedForward(2 * config.lstm_size, num_labels)

    init_op = tf.global_variables_initializer()
    batch_input = tf.placeholder("int32", shape=[None, None], name="input")
    sequence_length = tf.placeholder("int32", shape=[None], name="seqlen")
    labels = tf.placeholder("int32",
                            shape=[None, None, num_labels],
                            name="labels")

    embeddings = emb_layer.lookup(batch_input)
    source_hidden_output = source_lstm.forward(embeddings, sequence_length)
    target_hidden_output = target_lstm.forward(embeddings, sequence_length)

    #sess.run(init_op)
    target_lstm._initialize(sess)
Example 4
def test(sess, model, test_url, batch_size):
    test_set, test_count, _ = utils.data_set(test_url)
    test_batches = utils.create_batches(len(test_set),
                                        batch_size,
                                        shuffle=False)
    loss_sum = 0.0
    kld_sum = 0.0
    ppx_sum = 0.0
    word_count = 0
    doc_count = 0
    for idx_batch in test_batches:
        data_batch, count_batch, mask = utils.fetch_data(
            test_set, test_count, idx_batch, FLAGS.vocab_size)
        input_feed = {model.x.name: data_batch, model.mask.name: mask}
        loss, kld = sess.run([model.objective, model.kld], input_feed)
        loss_sum += np.sum(loss)
        kld_sum += np.sum(kld) / np.sum(mask)
        word_count += np.sum(count_batch)
        count_batch = np.add(count_batch, 1e-12)
        ppx_sum += np.sum(np.divide(loss, count_batch))
        doc_count += np.sum(mask)
    print_ppx = np.exp(loss_sum / word_count)
    print_ppx_perdoc = np.exp(ppx_sum / doc_count)
    print_kld = kld_sum / len(test_batches)
    print('| Epoch test: {:d} |'.format(1),
          '| Perplexity: {:.9f}'.format(print_ppx),
          '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
          '| KLD: {:.5}'.format(print_kld))
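This example and the NVDM-style trainers below call create_batches(len(data), batch_size, shuffle=...), so the helper plausibly returns lists of document indices that fetch_data later resolves against the data set. A hedged sketch under that assumption:

import random

def create_batches(data_size, batch_size, shuffle=False):
    """Return lists of indices covering range(data_size) in chunks of batch_size."""
    idx = list(range(data_size))
    if shuffle:
        random.shuffle(idx)
    return [idx[i:i + batch_size] for i in range(0, data_size, batch_size)]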
Example 5
    def prepare_batches(self, train_data, batch_size):
        train_batches = utils.create_batches(train_data, batch_size)
        batches = []

        for batch in train_batches:
            data_batch, prediction_batch = utils.unify_batch(batch)
            batches.append((data_batch, prediction_batch))

        return batches
Example 6
def train(nvdm, train_url, optimizer, batch_size=64, training_epochs=1000):
    train_set, train_count = utils.data_set(train_url)
    for epoch in range(training_epochs):
        train_batches = utils.create_batches(len(train_set), batch_size)
        loss_sum = 0.0
        for idx_batch in train_batches:
            data_batch, count_batch, mask = utils.fetch_data(
                train_set, train_count, idx_batch, 2000)
            data_batch = torch.FloatTensor(data_batch)
            mask = torch.FloatTensor(mask)
            loss = nvdm(data_batch, mask)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_sum += loss.item()

        print(loss_sum / len(train_batches))
Example 7
File: main.py Project: mxiny/NB-NTM
def run(net, optimizer, data_list, corpus_word_count, is_train):
    perplexity = torch.tensor(0, dtype=torch.float)
    kld = torch.tensor(0, dtype=torch.float)
    doc_count = torch.tensor(0, dtype=torch.float)

    idx_batches = utils.create_batches(len(data_list),
                                       batch_size,
                                       shuffle=is_train)
    for idx_batch in idx_batches:
        # get batch data
        batch, batch_word_count, mask = utils.fetch_batch_data(
            data_list, corpus_word_count, idx_batch, vocab_num)
        batch = torch.tensor(batch, dtype=torch.float, device=device)
        batch_word_count = torch.tensor(batch_word_count,
                                        dtype=torch.float,
                                        device=device)
        mask = torch.tensor(mask, dtype=torch.float, device=device)

        # forward propagation
        shape, scale, lam, out = net(batch)

        # compute batch loss
        batch_likelihood, batch_kld = net.compute_batch_loss(
            batch, out, shape, scale)
        batch_loss = (batch_likelihood + batch_kld) * mask

        # compute cumulative loss
        perplexity += torch.sum(batch_loss /
                                (batch_word_count + 1e-12)).detach()
        kld += (torch.sum(batch_kld) / torch.sum(mask)).detach()
        doc_count += torch.sum(mask).detach()

        # train or validate
        if is_train:
            optimizer.zero_grad()
            batch_loss.backward(mask)
            optimizer.step()

    perplexity = torch.exp(perplexity / doc_count)
    kld = kld / len(idx_batches)

    return perplexity, kld
Example 8
 def optimize(self,
              x_train,
              y_train,
              x_test,
              y_test,
              epochs=1,
              batch_size=100):
     from utils import create_batches
     self.x_train, self.y_train = x_train, y_train
     self.x_test, self.y_test = x_test, y_test
     for epoch in range(epochs):
         for X, y in create_batches(x_train, y_train, batch_size):
             loss = self.nn.eval(X, y, training_run=True)
             self.optimizer.optimization_step()
             # print(loss)
             for callback in self.batch_callbacks:
                 callback.step()
         for callback in self.epoch_callbacks:
             callback.step()
     for callback in self.on_finish_callbacks:
         callback.step()
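Here create_batches(x_train, y_train, batch_size) is iterated directly as (X, y) pairs, so a generator yielding aligned slices would fit the call site. A minimal sketch, assuming the inputs are indexable arrays of equal length:

def create_batches(x, y, batch_size):
    """Yield aligned (X, y) mini-batches from two equally long sequences."""
    for i in range(0, len(x), batch_size):
        yield x[i:i + batch_size], y[i:i + batch_size]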
Example 9
def run(config, model_name):
    config = load_yaml(config)
    if model_name not in config['model']:
        raise NotImplementedError("{} is not implemented. ".format(model_name))
    preprocessing_params = config['preprocessing']
    training_params = config['training']
    model_params = config['model'][model_name]
    train_df = pd.read_csv(preprocessing_params['train_path'], sep='\t')
    test_df = pd.read_csv(preprocessing_params['test_path'], sep='\t')
    t_list = preprocessing_params['target_list']
    model_params['targets'] = len(t_list)

    train_df['tokens'] = train_df['Tweet'].map(lambda x: tokenize(x))
    test_df['tokens'] = test_df['Tweet'].map(lambda x: tokenize(x))
    train_df['lengths'] = train_df['tokens'].map(lambda x: len(x))
    test_df['lengths'] = test_df['tokens'].map(lambda x: len(x))

    word_freq_dict = create_freq_vocabulary(
        list(train_df['tokens']) + list(test_df['tokens']))

    tokens = get_top_freq_words(word_freq_dict, 1)

    train_df = train_df.sort_values(by="lengths")
    test_df = test_df.sort_values(by="lengths")
    embeddings = get_embeddings(path=preprocessing_params['embeddings_path'])
    w2i = create_final_dictionary(tokens,
                                  embeddings,
                                  unk_token=preprocessing_params['unk_token'],
                                  pad_token=preprocessing_params['pad_token'])
    emb_matrix = get_embeddings_matrix(w2i, embeddings,
                                       preprocessing_params['embedding_size'])

    model_params['embeddings'] = emb_matrix

    train_batches = create_batches(train_df,
                                   training_params['batch_size'],
                                   w2i=w2i,
                                   pad_token=preprocessing_params['pad_token'],
                                   unk_token=preprocessing_params['unk_token'],
                                   target_list=t_list)
    test_batches = create_batches(test_df,
                                  training_params['batch_size'],
                                  w2i=w2i,
                                  pad_token=preprocessing_params['pad_token'],
                                  unk_token=preprocessing_params['unk_token'],
                                  target_list=t_list)

    model = ModelFactory.get_model(model_name, model_params)
    optimizer = Adam(model.trainable_weights, training_params['lr'])
    criterion = BCEWithLogitsLoss()
    train(model,
          train_batches,
          test_batches,
          optimizer,
          criterion,
          epochs=training_params['epochs'],
          init_patience=training_params['patience'],
          cuda=False,
          target_list=t_list)
    model = load_model(model)
    full_classification_report(model, test_batches, t_list)
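In this example create_batches operates on a length-sorted DataFrame and receives a vocabulary plus pad/unk tokens and a target list, which suggests it maps token lists to ids and pads them per batch. A rough sketch of that behavior; the names follow the call site and the real helper may differ, for instance in how targets are encoded:

def create_batches(df, batch_size, w2i, pad_token, unk_token, target_list):
    """Group DataFrame rows into batches of padded token-id sequences plus targets."""
    rows = df.to_dict('records')
    batches = []
    for i in range(0, len(rows), batch_size):
        chunk = rows[i:i + batch_size]
        max_len = max(len(r['tokens']) for r in chunk)
        ids = [[w2i.get(tok, w2i[unk_token]) for tok in r['tokens']]
               + [w2i[pad_token]] * (max_len - len(r['tokens']))
               for r in chunk]
        targets = [[r[t] for t in target_list] for r in chunk]
        batches.append((ids, targets))
    return batches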
Example 10
def train(sess,
          model,
          train_url,
          test_url,
          batch_size,
          training_epochs=1000,
          alternate_epochs=10):
    """train nvdm model."""
    train_set, train_count = utils.data_set(train_url)
    test_set, test_count = utils.data_set(test_url)
    # hold-out development dataset
    dev_set = test_set[:50]
    dev_count = test_count[:50]

    dev_batches = utils.create_batches(len(dev_set), batch_size, shuffle=False)
    test_batches = utils.create_batches(len(test_set),
                                        batch_size,
                                        shuffle=False)

    for epoch in range(training_epochs):
        train_batches = utils.create_batches(len(train_set),
                                             batch_size,
                                             shuffle=True)
        #-------------------------------
        # train
        for switch in range(0, 2):
            if switch == 0:
                optim = model.optim_dec
                print_mode = 'updating decoder'
            else:
                optim = model.optim_enc
                print_mode = 'updating encoder'
            for i in range(alternate_epochs):
                loss_sum = 0.0
                ppx_sum = 0.0
                kld_sum = 0.0
                word_count = 0
                doc_count = 0
                for idx_batch in train_batches:
                    data_batch, count_batch, mask = utils.fetch_data(
                        train_set, train_count, idx_batch, FLAGS.vocab_size)
                    input_feed = {
                        model.x.name: data_batch,
                        model.mask.name: mask
                    }
                    _, (loss, kld) = sess.run(
                        (optim, [model.objective, model.kld]), input_feed)
                    loss_sum += np.sum(loss)
                    kld_sum += np.sum(kld) / np.sum(mask)
                    word_count += np.sum(count_batch)
                    # to avoid nan error
                    count_batch = np.add(count_batch, 1e-12)
                    # per document loss
                    ppx_sum += np.sum(np.divide(loss, count_batch))
                    doc_count += np.sum(mask)
                print_ppx = np.exp(loss_sum / word_count)
                print_ppx_perdoc = np.exp(ppx_sum / doc_count)
                print_kld = kld_sum / len(train_batches)
                print(
                    '| Epoch train: {:d} |'.format(epoch + 1),
                    print_mode,
                    '{:d}'.format(i),
                    '| Corpus ppx: {:.5f}'.format(
                        print_ppx),  # perplexity for all docs
                    '| Per doc ppx: {:.5f}'.format(
                        print_ppx_perdoc),  # perplexity for per doc
                    '| KLD: {:.5}'.format(print_kld))
        #-------------------------------
        # dev
        loss_sum = 0.0
        kld_sum = 0.0
        ppx_sum = 0.0
        word_count = 0
        doc_count = 0
        for idx_batch in dev_batches:
            data_batch, count_batch, mask = utils.fetch_data(
                dev_set, dev_count, idx_batch, FLAGS.vocab_size)
            input_feed = {model.x.name: data_batch, model.mask.name: mask}
            loss, kld = sess.run([model.objective, model.kld], input_feed)
            loss_sum += np.sum(loss)
            kld_sum += np.sum(kld) / np.sum(mask)
            word_count += np.sum(count_batch)
            count_batch = np.add(count_batch, 1e-12)
            ppx_sum += np.sum(np.divide(loss, count_batch))
            doc_count += np.sum(mask)
        print_ppx = np.exp(loss_sum / word_count)
        print_ppx_perdoc = np.exp(ppx_sum / doc_count)
        print_kld = kld_sum / len(dev_batches)
        print('| Epoch dev: {:d} |'.format(epoch + 1),
              '| Perplexity: {:.9f}'.format(print_ppx),
              '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
              '| KLD: {:.5}'.format(print_kld))
        #-------------------------------
        # test
        if FLAGS.test:
            loss_sum = 0.0
            kld_sum = 0.0
            ppx_sum = 0.0
            word_count = 0
            doc_count = 0
            for idx_batch in test_batches:
                data_batch, count_batch, mask = utils.fetch_data(
                    test_set, test_count, idx_batch, FLAGS.vocab_size)
                input_feed = {model.x.name: data_batch, model.mask.name: mask}
                loss, kld = sess.run([model.objective, model.kld], input_feed)
                loss_sum += np.sum(loss)
                kld_sum += np.sum(kld) / np.sum(mask)
                word_count += np.sum(count_batch)
                count_batch = np.add(count_batch, 1e-12)
                ppx_sum += np.sum(np.divide(loss, count_batch))
                doc_count += np.sum(mask)
            print_ppx = np.exp(loss_sum / word_count)
            print_ppx_perdoc = np.exp(ppx_sum / doc_count)
            print_kld = kld_sum / len(test_batches)
            print('| Epoch test: {:d} |'.format(epoch + 1),
                  '| Perplexity: {:.9f}'.format(print_ppx),
                  '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
                  '| KLD: {:.5}'.format(print_kld))
        "-w", "--word2vec", default=True,
        help="Use word2vec embeddings"
    )
    optparser.add_option(
        "-r", "--restore", default=True,
        help="Rebuild the model and restore weights from checkpoint"
    )
    opts = optparser.parse_args()[0]

    sess = tf.Session()

    adv = AdversarialLearning(sess, opts)

    input_x, _ = loader.prepare_input(config.datadir + config.train)
    s_seqlen, s_input = utils.convert_to_id(input_x, adv.emb_layer.word_to_id)
    s_seqlen, s_input = utils.create_batches(s_input, s_seqlen)
    input_x, _ = loader.prepare_medpost_input()
    t_seqlen, t_input = utils.convert_to_id(input_x, adv.emb_layer.word_to_id)
    t_seqlen, t_input = utils.create_batches(t_input, t_seqlen)
    s_len = len(s_input)
    t_len = len(t_input)

    # Do not initialize Source and Target LSTM weights; The variables are from index 0 to 8.
    # TODO: Find better fix for initialization of variables
    init = tf.variables_initializer(tf.global_variables()[9:])
    sess.run(init)

    gloss = []
    dloss = []
    plt.axis([0, 10000, 0, 4])
    plt.ion()
Example 12
def train(sess,
          model,
          train_url,
          test_url,
          dev_url,
          model_url,
          batch_size,
          saver,
          training_epochs=400,
          alternate_epochs=1):
    """train nvctm model."""
    train_set, train_count = utils.data_set(train_url)
    dev_set, dev_count = utils.data_set(dev_url)
    test_set, test_count = utils.data_set(test_url)

    dev_batches = utils.create_batches(len(dev_set), batch_size, shuffle=False)
    test_batches = utils.create_batches(len(test_set),
                                        batch_size,
                                        shuffle=False)

    train_theta = []
    train_beta = []
    for epoch in range(training_epochs):
        train_batches = utils.create_batches(len(train_set),
                                             batch_size,
                                             shuffle=True)
        # -------------------------------
        # train
        for switch in range(0, 2):
            if switch == 0:
                optim = model.optim_dec
                print_mode = 'updating decoder'
            else:
                optim = model.optim_enc
                print_mode = 'updating encoder'
            for i in range(alternate_epochs):
                loss_sum = 0.0
                ppx_sum = 0.0
                kld_sum = 0.0
                word_count = 0
                doc_count = 0
                res_sum = 0
                log_sum = 0
                mean_sum = 0
                var_sum = 0
                m = None
                Um = None
                enc = None

                for idx_batch in train_batches:
                    data_batch, count_batch, mask = utils.fetch_data(
                        train_set, train_count, idx_batch, FLAGS.vocab_size)
                    input_feed = {
                        model.x.name: data_batch,
                        model.mask.name: mask
                    }
                    _, (loss, kld, mean, Umean, enc, rec_loss, log_s, mean_s,
                        vk_show, theta, beta, lp, v) = sess.run((optim, [
                            model.objective, model.kld, model.mean, model.U,
                            model.vk, model.recons_loss, model.log_squre,
                            model.mean_squre, model.vk_show, model.theta,
                            model.beta, model.log_prob, model.variance
                        ]), input_feed)
                    m = mean
                    Um = Umean
                    # print('*********************vk show', vk_show)
                    # print('Umean', Umean[0])
                    loss_sum += np.sum(loss)
                    kld_sum += np.sum(kld) / np.sum(mask)
                    word_count += np.sum(count_batch)
                    res_sum += np.sum(rec_loss)
                    log_sum += np.sum(log_s)
                    mean_sum += np.sum(mean_s)
                    var_sum += np.sum(v) / np.sum(mask)
                    # to avoid nan error
                    count_batch = np.add(count_batch, 1e-12)
                    # per document loss
                    ppx_sum += np.sum(np.divide(loss, count_batch))
                    doc_count += np.sum(mask)

                    if epoch == training_epochs - 1 and switch == 1 and i == alternate_epochs - 1:
                        train_theta.extend(theta)
                        train_beta.extend(beta)

                print_ppx = np.exp(loss_sum / word_count)
                # print_ppx_perdoc = np.exp(ppx_sum / doc_count)
                print_kld = kld_sum / len(train_batches)
                print_res = res_sum / len(train_batches)
                print_log = log_sum / len(train_batches)
                print_mean = mean_sum / len(train_batches)
                print_var = var_sum / len(train_batches)

                print(
                    '| Epoch train: {:d} |'.format(epoch + 1),
                    print_mode,
                    '{:d}'.format(i),
                    '| Corpus ppx: {:.5f}'.format(
                        print_ppx),  # perplexity per word
                    # '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),  # perplexity for per doc
                    '| KLD: {:.5}'.format(print_kld),
                    '| stddev {:.5}'.format(print_var),
                    '| res_loss: {:5}'.format(print_res),
                    '| log_loss: {:5}'.format(print_log),
                    '| mean_loss: {:5}'.format(print_mean))

                with codecs.open('./nvctm_train_theta', 'wb') as fp:
                    pickle.dump(np.array(train_theta), fp)
                fp.close()

                if (epoch + 1
                    ) % 50 == 0 and switch == 1 and i == alternate_epochs - 1:
                    with codecs.open('./nvctm_train_beta', 'wb') as fp:
                        pickle.dump(beta, fp)
                    fp.close()
                    npmi.print_coherence('nvctm',
                                         FLAGS.data_dir + '/train.feat',
                                         FLAGS.vocab_size)

        # dev
        loss_sum = 0.0
        kld_sum = 0.0
        ppx_sum = 0.0
        var_sum = 0
        word_count = 0
        doc_count = 0
        for idx_batch in dev_batches:
            data_batch, count_batch, mask = utils.fetch_data(
                dev_set, dev_count, idx_batch, FLAGS.vocab_size)
            input_feed = {model.x.name: data_batch, model.mask.name: mask}
            loss, kld, v = sess.run(
                [model.objective, model.kld, model.variance], input_feed)
            loss_sum += np.sum(loss)
            kld_sum += np.sum(kld) / np.sum(mask)
            var_sum += np.sum(v) / np.sum(mask)
            word_count += np.sum(count_batch)
            count_batch = np.add(count_batch, 1e-12)
            ppx_sum += np.sum(np.divide(loss, count_batch))
            doc_count += np.sum(mask)
        print_ppx = np.exp(loss_sum / word_count)
        print_var = var_sum / len(train_batches)
        # print_ppx_perdoc = np.exp(ppx_sum / doc_count)
        print_kld = kld_sum / len(dev_batches)
        print('\n| Epoch dev: {:d}'.format(epoch + 1),
              '| Perplexity: {:.9f}'.format(print_ppx),
              '| stddev {:.5}'.format(print_var),
              '| KLD: {:.5}'.format(print_kld))

        # test
        if FLAGS.test:
            loss_sum = 0.0
            kld_sum = 0.0
            ppx_sum = 0.0
            var_sum = 0.0
            word_count = 0
            doc_count = 0
            for idx_batch in test_batches:
                data_batch, count_batch, mask = utils.fetch_data(
                    test_set, test_count, idx_batch, FLAGS.vocab_size)
                input_feed = {model.x.name: data_batch, model.mask.name: mask}
                loss, kld, v = sess.run(
                    [model.objective, model.kld, model.variance], input_feed)
                loss_sum += np.sum(loss)
                kld_sum += np.sum(kld) / np.sum(mask)
                var_sum += np.sum(v) / np.sum(mask)
                word_count += np.sum(count_batch)
                count_batch = np.add(count_batch, 1e-12)
                ppx_sum += np.sum(np.divide(loss, count_batch))
                doc_count += np.sum(mask)
            print_ppx = np.exp(loss_sum / word_count)
            print_var = var_sum / len(train_batches)
            # print_ppx_perdoc = np.exp(ppx_sum / doc_count)
            print_kld = kld_sum / len(test_batches)
            print('| Epoch test: {:d}'.format(epoch + 1),
                  '| Perplexity: {:.9f}'.format(print_ppx),
                  '| stddev {:.5}'.format(print_var),
                  '| KLD: {:.5}\n'.format(print_kld))
    npmi.print_coherence('nvctm', FLAGS.data_dir + '/train.feat',
                         FLAGS.vocab_size)
    saver.save(sess, model_url)
Example 13
def train(gpu, args):
    rank = args.nr * args.gpus + gpu
    dist.init_process_group(backend='nccl',
                            init_method='env://',
                            world_size=args.world_size,
                            rank=rank)
    torch.manual_seed(0)

    words = read_words(
        '/users/PAS1588/liuluyu0378/lab1/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled',
        seq_len, kernel[0])
    word_counter = collections.Counter(words).most_common(vocab_size - 1)
    vocab = [w for w, _ in word_counter]
    w2i = dict((w, i) for i, w in enumerate(vocab, 1))
    w2i['<unk>'] = 0
    print('vocab_size', vocab_size)
    print('w2i size', len(w2i))

    data = [w2i[w] if w in w2i else 0 for w in words]
    data = create_batches(data, batch_size, seq_len)
    split_idx = int(len(data) * 0.8)
    training_data = data[:split_idx]
    test_data = data[split_idx:]
    print('train samples:', len(training_data))
    print('test samples:', len(test_data))

    model = GatedCNN(seq_len, vocab_size, embd_size, n_layers, kernel, out_chs,
                     res_block_count, vocab_size)

    torch.cuda.set_device(gpu)
    model.cuda(gpu)
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(gpu)
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)
    # Wrap the model
    model = nn.parallel.DataParallel(model, device_ids=[gpu])
    print("model transfered")

    optimizer = torch.optim.Adadelta(model.parameters())
    loss_fn = nn.NLLLoss()
    # Data loading code

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        training_data, num_replicas=args.world_size, rank=rank)
    train_loader = torch.utils.data.DataLoader(dataset=training_data,
                                               batch_size=batch_size,
                                               shuffle=False,
                                               num_workers=0,
                                               pin_memory=True,
                                               sampler=train_sampler)

    start = datetime.now()
    total_step = len(train_loader)

    print("loaded")
    for epoch in range(args.epochs):
        a = time.time()
        print('----epoch', epoch)
        # random.shuffle(data)
        # print(len(data))
        for batch_ct, (X, Y) in enumerate(train_loader):
            X = to_var(torch.LongTensor(X))  # (bs, seq_len)
            Y = to_var(torch.LongTensor(Y))  # (bs,)
            # print(X.size(), Y.size())
            # print(X)
            # print(batch_ct, X.size(), Y.size())
            pred = model(X)  # (bs, ans_size)
            # _, pred_ids = torch.max(pred, 1)
            loss = loss_fn(pred, Y)
            if batch_ct % 100 == 0:
                print('loss: {:.4f}'.format(loss.data.item()))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        b = time.time()
        print('current performance at epoch', epoch, "time:", b - a)

    if gpu == 0:
        print("Training complete in: " + str(datetime.now() - start))
Example 14
    # load model
    model = torch.load(fn_model, map_location={'cuda:1':'cuda:{}'.format(gpu_device)})
    # make the rnn parameters a continuous chunk, which will speed up forward pass
    model.rnn.flatten_parameters()
    criterion = torch.nn.CrossEntropyLoss()

    loader = DataLoader(fn_vocab)

    plt.figure()
    for fn in fn_data:
        # prepare dataset
        print('Processing %s...' % fn)
        word_list = load_clm_words(fn)
        test_data = loader.tokenize(word_list)
        test_data_batches = utils.create_batches(test_data, batch_size=1, device='cuda')

        losses = evaluate(test_data_batches)

        ppl_counter = Counter()

        x_interval = numpy.array([i*0.2 for i in range(100)])
        for loss in losses:
            idx = numpy.argmin(abs(x_interval-loss))
            ppl_counter.update([x_interval[idx]])

        keys = []
        vals = []
        for key, value in sorted(ppl_counter.items()):
            keys.append(key)
            vals.append(value)
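Example 14 passes a device argument, create_batches(test_data, batch_size=1, device='cuda'), so this variant presumably also moves each chunk onto the target device. A minimal sketch assuming test_data is a 1-D tensor or list of token ids:

import torch

def create_batches(data, batch_size, device='cpu'):
    """Chunk a sequence of token ids and place each chunk on the given device."""
    return [torch.as_tensor(data[i:i + batch_size]).to(device)
            for i in range(0, len(data), batch_size)]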
Example 15
def train(sess, model, train_url, test_url, dev_url, batch_size, training_epochs=1000, alternate_epochs=1):
    """train gsm model."""
    train_set, train_count = utils.data_set(train_url)
    test_set, test_count = utils.data_set(test_url)
    dev_set, dev_count = utils.data_set(dev_url)

    dev_batches = utils.create_batches(len(dev_set), batch_size, shuffle=False)
    test_batches = utils.create_batches(len(test_set), batch_size, shuffle=False)

    kld_list = []
    var_list = []
    train_theta = []
    train_beta = []
    test_theta = []
    test_beta = []
    for epoch in range(training_epochs):
        train_batches = utils.create_batches(len(train_set), batch_size, shuffle=True)
        # -------------------------------
        # train
        for switch in range(0, 2):
            if switch == 0:
                optimize = model.optimize_dec
                print_mode = 'updating decoder'
            elif switch == 1:
                optimize = model.optimize_enc
                print_mode = 'updating encoder'
            for i in range(alternate_epochs):
                loss_sum = 0.0
                ppx_sum = 0.0
                kld_sum = 0.0
                word_count = 0
                doc_count = 0
                var_sum = 0
                for idx_batch in train_batches:
                    data_batch, count_batch, mask = utils.fetch_data(
                        train_set, train_count, idx_batch, FLAGS.vocab_size)

                    input_feed = {model.x.name: data_batch, model.mask.name: mask, model.is_training.name: True, model.gamma.name: epoch/training_epochs}
                    _, (loss, kld, v, theta, beta) =\
                        sess.run((optimize, [model.reconstruction_loss, model.kld, model.variance, model.topic_dist, model.beta]), input_feed)
                    loss_sum += np.sum(loss)
                    kld_sum += np.sum(kld) / np.sum(mask)
                    var_sum += np.sum(v) / np.sum(mask)
                    # print([np.max(theta[i]) for i in range(batch_size)])
                    # print([np.argmax(theta[i]) for i in range(batch_size)])
                    word_count += np.sum(count_batch)
                    # to avoid nan error
                    count_batch = np.add(count_batch, 1e-12)
                    # per document loss
                    ppx_sum += np.sum(np.divide(loss, count_batch))
                    doc_count += np.sum(mask)

                    if epoch == training_epochs - 1 and switch == 1 and i == alternate_epochs - 1:
                        train_theta.extend(theta)
                        train_beta.extend(beta)

                print_ppx = np.exp(loss_sum / word_count)
                print_ppx_perdoc = np.exp(ppx_sum / doc_count)
                print_kld = kld_sum / len(train_batches)
                print_var = var_sum / len(train_batches)
                kld_list.append(print_kld)
                var_list.append(print_var)
                print('| Epoch train: {:d}'.format(epoch + 1),
                      print_mode, '{:d}'.format(i + 1),
                      '| Corpus ppx: {:.5f}'.format(print_ppx),  # perplexity for all docs
                      '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),  # perplexity for per doc
                      '| KLD: {:.5}'.format(print_kld),
                      '| stddev {:.5}'.format(print_var))

                with codecs.open('./gsm_train_theta', 'wb') as fp:
                    pickle.dump(np.array(train_theta), fp)
                fp.close()

                if (epoch + 1) % 50 == 0 and switch == 1 and i == alternate_epochs - 1:
                    with codecs.open('./gsm_train_beta', 'wb') as fp:
                        pickle.dump(beta, fp)
                    fp.close()
                    npmi.print_coherence('gsm', FLAGS.data_dir + '/train.feat', FLAGS.vocab_size)

        # -------------------------------
        # dev
        loss_sum = 0.0
        kld_sum = 0.0
        ppx_sum = 0.0
        word_count = 0
        doc_count = 0
        var_sum = 0
        for idx_batch in dev_batches:
            data_batch, count_batch, mask = utils.fetch_data(dev_set, dev_count, idx_batch, FLAGS.vocab_size)
            input_feed = {model.x.name: data_batch, model.mask.name: mask, model.is_training.name: False, model.gamma.name: 0}
            loss, kld, v = sess.run([model.objective, model.kld, model.variance], input_feed)
            loss_sum += np.sum(loss)
            kld_sum += np.sum(kld) / np.sum(mask)
            word_count += np.sum(count_batch)
            count_batch = np.add(count_batch, 1e-12)
            ppx_sum += np.sum(np.divide(loss, count_batch))
            var_sum += np.sum(v) / np.sum(mask)
            doc_count += np.sum(mask)
        print_ppx = np.exp(loss_sum / word_count)
        print_ppx_perdoc = np.exp(ppx_sum / doc_count)
        print_kld = kld_sum / len(dev_batches)
        print_var = var_sum / len(train_batches)
        print('\n| Epoch dev: {:d}'.format(epoch + 1),
              '| Perplexity: {:.9f}'.format(print_ppx),
              '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
              '| KLD: {:.5}'.format(print_kld),
              '| stddev: {:.5}'.format(print_var))

        # test
        if FLAGS.test:
            loss_sum = 0.0
            kld_sum = 0.0
            ppx_sum = 0.0
            word_count = 0
            doc_count = 0
            for idx, idx_batch in enumerate(test_batches):
                data_batch, count_batch, mask = utils.fetch_data(
                    test_set, test_count, idx_batch, FLAGS.vocab_size)
                input_feed = {model.x.name: data_batch, model.mask.name: mask, model.is_training.name: False, model.gamma.name: 0}
                loss, kld, theta, beta, v = sess.run([model.objective, model.kld, model.topic_dist, model.beta, model.variance], input_feed)
                loss_sum += np.sum(loss)
                kld_sum += np.sum(kld) / np.sum(mask)
                word_count += np.sum(count_batch)
                count_batch = np.add(count_batch, 1e-12)
                ppx_sum += np.sum(np.divide(loss, count_batch))
                doc_count += np.sum(mask)
                test_theta.extend(theta)
                if idx == len(test_batches) - 1:
                    test_beta.extend(beta)
            print_ppx = np.exp(loss_sum / word_count)
            print_ppx_perdoc = np.exp(ppx_sum / doc_count)
            print_kld = kld_sum / len(test_batches)
            print('| Epoch test: {:d}'.format(epoch + 1),
                  '| Perplexity: {:.9f}'.format(print_ppx),
                  '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
                  '| KLD: {:.5}'.format(print_kld),
                  '| stddev: {:.5}\n'.format(print_var))

    npmi.print_coherence('gsm', FLAGS.data_dir + '/train.feat', FLAGS.vocab_size)

    with codecs.open('./test_theta', 'wb') as fp:
        pickle.dump(test_theta, fp)
    fp.close()

    with codecs.open('./test_beta', 'wb') as fp:
        pickle.dump(test_beta, fp)
    fp.close()

    with codecs.open('./kld.txt', 'w', 'utf-8') as fp:
        for idx, kld in enumerate(kld_list):
            if idx < len(kld_list) - 1:
                fp.write(str(kld) + ', ')
            else:
                fp.write(str(kld))
        fp.close()
    with codecs.open('./var.txt', 'w', 'utf-8') as fp:
        for idx, var in enumerate(var_list):
            if idx < len(var_list) - 1:
                fp.write(str(var) + ', ')
            else:
                fp.write(str(var))
        fp.close()
Example 16
def train(sess,
          model,
          train_url,
          test_url,
          batch_size,
          training_epochs=1000,
          alternate_epochs=10):
    """train gsm model."""
    # train_set: one 1 x vocab_size vector per document, where each entry is the count of the corresponding word; train_count: total word count of the training set
    train_set, train_count = utils.data_set(train_url)
    test_set, test_count = utils.data_set(test_url)
    # hold-out development dataset: take the first 50 documents
    dev_set = test_set[:50]
    dev_count = test_count[:50]

    dev_batches = utils.create_batches(len(dev_set), batch_size, shuffle=False)
    test_batches = utils.create_batches(len(test_set),
                                        batch_size,
                                        shuffle=False)

    for epoch in range(training_epochs):
        # create batches of size batch_size
        train_batches = utils.create_batches(len(train_set),
                                             batch_size,
                                             shuffle=True)
        # -------------------------------
        # train
        for switch in range(0, 2):
            if switch == 0:
                optimize = model.optimize_dec
                print_mode = 'updating decoder'
            elif switch == 1:
                optimize = model.optimize_enc
                print_mode = 'updating encoder'
            for i in range(alternate_epochs):
                loss_sum = 0.0
                ppx_sum = 0.0
                kld_sum = 0.0
                word_count = 0
                doc_count = 0
                # train on each batch
                for idx_batch in train_batches:
                    '''
                    data_batch: word-frequency vectors of the current batch, batch_size x vocab_size
                    count_batch: word count of each document in the current batch
                    train_set: training set
                    train_count: training-set word count
                    idx_batch: indices of the current batch
                    mask: padding mask used when a batch contains fewer documents than batch_size
                    '''
                    data_batch, count_batch, mask = utils.fetch_data(
                        train_set, train_count, idx_batch, FLAGS.vocab_size)
                    # input: x = data_batch, mask = mask
                    input_feed = {
                        model.x.name: data_batch,
                        model.mask.name: mask
                    }
                    # return: loss = objective, kld = kld, optimizer = optimize
                    # the above form the feed_dict, mapping the model's tensors to concrete values
                    _, (loss, kld) = sess.run(
                        (optimize, [model.objective, model.kld]), input_feed)
                    loss_sum += np.sum(loss)
                    kld_sum += np.sum(kld) / np.sum(mask)
                    # total word count
                    word_count += np.sum(count_batch)
                    # to avoid nan error (zero denominator)
                    count_batch = np.add(count_batch, 1e-12)
                    # per document loss
                    ppx_sum += np.sum(np.divide(loss, count_batch))
                    doc_count += np.sum(mask)
                print_ppx = np.exp(loss_sum / word_count)
                print_ppx_perdoc = np.exp(ppx_sum / doc_count)
                print_kld = kld_sum / len(train_batches)
                print(
                    '| Epoch train: {:d} |'.format(epoch + 1),
                    print_mode,
                    '{:d}'.format(i + 1),
                    '| Corpus ppx: {:.5f}'.format(
                        print_ppx),  # perplexity for all docs
                    '| Per doc ppx: {:.5f}'.format(
                        print_ppx_perdoc),  # perplexity for per doc
                    '| KLD: {:.5}'.format(print_kld))
        # -------------------------------
        # dev
        loss_sum = 0.0
        kld_sum = 0.0
        ppx_sum = 0.0
        word_count = 0
        doc_count = 0
        for idx_batch in dev_batches:
            data_batch, count_batch, mask = utils.fetch_data(
                dev_set, dev_count, idx_batch, FLAGS.vocab_size)
            input_feed = {model.x.name: data_batch, model.mask.name: mask}
            loss, kld = sess.run([model.objective, model.kld], input_feed)
            loss_sum += np.sum(loss)
            kld_sum += np.sum(kld) / np.sum(mask)
            word_count += np.sum(count_batch)
            count_batch = np.add(count_batch, 1e-12)
            ppx_sum += np.sum(np.divide(loss, count_batch))
            doc_count += np.sum(mask)
        print_ppx = np.exp(loss_sum / word_count)
        print_ppx_perdoc = np.exp(ppx_sum / doc_count)
        print_kld = kld_sum / len(dev_batches)
        print('| Epoch dev: {:d} |'.format(epoch + 1),
              '| Perplexity: {:.9f}'.format(print_ppx),
              '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
              '| KLD: {:.5}'.format(print_kld))
        # -------------------------------
        # test
        if FLAGS.test:
            loss_sum = 0.0
            kld_sum = 0.0
            ppx_sum = 0.0
            word_count = 0
            doc_count = 0
            for idx_batch in test_batches:
                data_batch, count_batch, mask = utils.fetch_data(
                    test_set, test_count, idx_batch, FLAGS.vocab_size)
                input_feed = {model.x.name: data_batch, model.mask.name: mask}
                loss, kld = sess.run([model.objective, model.kld], input_feed)
                loss_sum += np.sum(loss)
                kld_sum += np.sum(kld) / np.sum(mask)
                word_count += np.sum(count_batch)
                count_batch = np.add(count_batch, 1e-12)
                ppx_sum += np.sum(np.divide(loss, count_batch))
                doc_count += np.sum(mask)
            print_ppx = np.exp(loss_sum / word_count)
            print_ppx_perdoc = np.exp(ppx_sum / doc_count)
            print_kld = kld_sum / len(test_batches)
            print('| Epoch test: {:d} |'.format(epoch + 1),
                  '| Perplexity: {:.9f}'.format(print_ppx),
                  '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
                  '| KLD: {:.5}'.format(print_kld))
Example 17
def train(
    train_url,
    test_url,
    model_url,
    vocab_url,
    non_linearity,
    embedding_url,
    training_epochs,
    alternate_epochs,
    vocab_size,
    embedding_size,
    n_hidden,
    n_topic,
    n_sample,
    learning_rate,
    batch_size,
    is_training,
    mix_num,
):
    """train crntm model."""

    train_set, train_count = utils.data_set(train_url)
    test_set, test_count = utils.data_set(test_url)
    vocab = utils.get_vocab(vocab_url)
    embedding_table = utils.load_embedding(
        embedding_url, embedding_size, vocab,
        FLAGS.data_dir + '/vocab_embedding-{}.pkl'.format(embedding_size))

    # hold-out development dataset
    dev_count = test_count[:50]
    dev_onehot_set = test_set[:50]
    dev_batches = utils.create_batches(len(dev_onehot_set),
                                       batch_size,
                                       shuffle=False)
    test_batches = utils.create_batches(len(test_set),
                                        batch_size,
                                        shuffle=False)

    # create model
    crntm = CRNTM(vocab_size=vocab_size,
                  embedding_size=embedding_size,
                  n_hidden=n_hidden,
                  n_topic=n_topic,
                  n_sample=n_sample,
                  learning_rate=learning_rate,
                  batch_size=batch_size,
                  non_linearity=non_linearity,
                  embedding_table=embedding_table,
                  is_training=is_training,
                  mix_num=mix_num)
    crntm.construct_model()

    sess = tf.Session()
    init = tf.initialize_all_variables()
    sess.run(init)
    model = crntm
    saver = tf.train.Saver()

    #
    # if RESTORE:
    #     return embedding_table[1:]

    for epoch in range(training_epochs):
        train_batches = utils.create_batches(len(train_set),
                                             batch_size,
                                             shuffle=True)
        #-------------------------------
        # train
        for switch in range(0, 2):
            if switch == 0:
                optim = model.optim_dec
                print_mode = 'updating decoder'
            else:
                optim = model.optim_enc
                print_mode = 'updating encoder'
            for i in range(alternate_epochs):
                loss_sum = 0.0
                ppx_sum = 0.0
                kld_sum = 0.0
                word_count = 0
                doc_count = 0
                res_sum = 0
                log_sum = 0
                r_sum = 0
                log_s = None
                r_loss = None
                g_loss = None
                for bn, idx_batch in enumerate(train_batches):
                    data_onehot_batch, count_batch, mask = utils.fetch_data(
                        train_set, train_count, idx_batch, FLAGS.vocab_size)

                    input_feed = {
                        model.x_onehot.name: data_onehot_batch,
                        model.mask.name: mask
                    }
                    _, (loss, kld, rec_loss, log_s, r_loss, g_loss) = sess.run(
                        (optim, [
                            model.objective, model.kld, model.recons_loss,
                            model.logits, model.doc_vec, model.topic_word_prob
                        ]), input_feed)

                    # if switch==0:
                    # #     # print(bn, len(train_batches), mask.sum(), r_loss.shape)
                    #     print('ptheta', log_s)
                    #     print('doc_Vec', r_loss)
                    #     print('topic_prob', g_loss)

                    res_sum += np.sum(rec_loss)
                    log_sum += np.sum(log_s)
                    loss_sum += np.sum(loss)
                    r_sum += np.sum(r_loss)
                    kld_sum += np.sum(kld) / np.sum(mask)
                    word_count += np.sum(count_batch)
                    # to avoid nan error
                    count_batch = np.add(count_batch, 1e-12)
                    # per document loss
                    ppx_sum += np.sum(np.divide(loss, count_batch))
                    # print(np.sum(np.divide(loss, count_batch)))
                    doc_count += np.sum(mask)
                    # if doc_count>11264:
                    #   print('debug:: ', doc_count, rec_loss, kld, loss[-1], count_batch[-1])
                print_ppx = np.exp(loss_sum / word_count)
                print_ppx_perdoc = np.exp(ppx_sum / doc_count)
                print_kld = kld_sum / len(train_batches)
                print_res = res_sum / len(train_batches)
                print_log = log_sum / len(train_batches)
                print_mean = r_sum / len(train_batches)
                message = '| Epoch train: {:d} | {} {:d} | Corpus ppx: {:.5f}::{} | Per doc ppx: {:.5f}::{} | KLD: {:.5} | res_loss: {:5} | log_loss: {:5} | r_loss: {:5}'.format(
                    epoch + 1,
                    print_mode,
                    i,
                    print_ppx,
                    word_count,
                    print_ppx_perdoc,
                    doc_count,
                    print_kld,
                    print_res,
                    print_log,
                    print_mean,
                )
                print(message)
                write_result(message)
        TopicWords(sess, vocab_url, embedding_table[1:])

        #-------------------------------
        # dev
        loss_sum = 0.0
        ppx_sum = 0.0
        kld_sum = 0.0
        word_count = 0
        doc_count = 0
        res_sum = 0
        log_sum = 0
        mean_sum = 0
        r_sum = 0
        for idx_batch in dev_batches:
            data_onehot_batch, count_batch, mask = utils.fetch_data(
                dev_onehot_set, dev_count, idx_batch, FLAGS.vocab_size)

            input_feed = {
                model.x_onehot.name: data_onehot_batch,
                model.mask.name: mask
            }
            loss, kld, rec_loss, log_s, r_loss = sess.run([
                model.objective, model.kld, model.recons_loss,
                model.embedding_loss, model.res_loss
            ], input_feed)

            res_sum += np.sum(rec_loss)
            log_sum += np.sum(log_s)
            loss_sum += np.sum(loss)
            r_sum += np.sum(r_loss)
            kld_sum += np.sum(kld) / np.sum(mask)
            word_count += np.sum(count_batch)
            # to avoid nan error
            count_batch = np.add(count_batch, 1e-12)
            # per document loss
            ppx_sum += np.sum(np.divide(loss, count_batch))
            # print(np.sum(np.divide(loss, count_batch)))
            doc_count += np.sum(mask)
            # if doc_count>11264:
            #   print('debug:: ', doc_count, rec_loss, kld, loss[-1], count_batch[-1])
        print_ppx = np.exp(loss_sum / word_count)
        print_ppx_perdoc = np.exp(ppx_sum / doc_count)
        # print_ppx_perdoc = ppx_sum / doc_count
        # print(loss_sum, word_count)
        print_kld = kld_sum / len(train_batches)
        print_res = res_sum / len(train_batches)
        print_log = log_sum / len(train_batches)
        print_mean = r_sum / len(train_batches)
        message = '| Epoch dev: {:d} | Corpus ppx: {:.5f}::{} | Per doc ppx: {:.5f}::{} | KLD: {:.5} | res_loss: {:5} | log_loss: {:5} | r_loss: {:5}'.format(
            epoch + 1,
            print_ppx,
            word_count,
            print_ppx_perdoc,
            doc_count,
            print_kld,
            print_res,
            print_log,
            print_mean,
        )
        print(message)
        write_result(message)

        # test
        if FLAGS.test:
            loss_sum = 0.0
            kld_sum = 0.0
            ppx_sum = 0.0
            word_count = 0
            doc_count = 0
            for idx_batch in test_batches:
                data_onehot_batch, count_batch, mask = utils.fetch_data(
                    test_set, test_count, idx_batch, FLAGS.vocab_size)
                input_feed = {
                    model.x_onehot.name: data_onehot_batch,
                    model.mask.name: mask
                }
                loss, kld = sess.run([model.objective, model.kld], input_feed)
                loss_sum += np.sum(loss)
                kld_sum += np.sum(kld) / np.sum(mask)
                word_count += np.sum(count_batch)
                count_batch = np.add(count_batch, 1e-12)
                ppx_sum += np.sum(np.divide(loss, count_batch))
                doc_count += np.sum(mask)
            print_ppx = np.exp(loss_sum / word_count)
            print_ppx_perdoc = np.exp(ppx_sum / doc_count)
            print_kld = kld_sum / len(test_batches)
            message = '| Epoch test: {:d} | Corpus ppx: {:.5f} | Per doc ppx: {:.5f} | KLD: {:.5} '.format(
                epoch + 1,
                print_ppx,
                print_ppx_perdoc,
                print_kld,
            )
            print(message)
            write_result(message)

    saver.save(sess, model_url)
Example 18
def train(sess,
          model,
          train_url,
          test_url,
          batch_size,
          FLAGS,
          train_csv_filename,
          dev_csv_filename,
          test_csv_filename,
          training_epochs=1000,
          alternate_epochs=10,
          is_restore=False):
    """train nvdm model."""
    train_set, train_count = utils.data_set(train_url)
    test_set, test_count = utils.data_set(test_url)
    # hold-out development dataset
    dev_set = test_set[:50]
    dev_count = test_count[:50]

    dev_batches = utils.create_batches(len(dev_set), batch_size, shuffle=False)
    test_batches = utils.create_batches(len(test_set),
                                        batch_size,
                                        shuffle=False)
    #save model
    saver = tf.train.Saver()

    if is_restore:
        saver.restore(sess, "./checkpoints/model.ckpt")

    for epoch in range(training_epochs):
        train_batches = utils.create_batches(len(train_set),
                                             batch_size,
                                             shuffle=True)
        #-------------------------------
        # train
        for switch in xrange(0, 2):
            if switch == 0:
                optim = model.optim_dec
                print_mode = 'updating decoder'
            else:
                optim = model.optim_enc
                print_mode = 'updating encoder'
            for i in xrange(alternate_epochs):
                loss_sum = 0.0
                ppx_sum = 0.0
                kld_sum = 0.0
                word_count = 0
                doc_count = 0
                for idx_batch in train_batches:
                    data_batch, count_batch, mask = utils.fetch_data(
                        train_set, train_count, idx_batch, FLAGS.vocab_size)
                    input_feed = {
                        model.x.name: data_batch,
                        model.mask.name: mask
                    }
                    _, (loss, kld) = sess.run(
                        (optim, [model.objective, model.kld]), input_feed)
                    loss_sum += np.sum(loss)
                    kld_sum += np.sum(kld) / np.sum(mask)
                    word_count += np.sum(count_batch)
                    # to avoid nan error
                    count_batch = np.add(count_batch, 1e-12)
                    # per document loss
                    ppx_sum += np.sum(np.divide(loss, count_batch))
                    doc_count += np.sum(mask)
                print_ppx = np.exp(loss_sum / word_count)
                print_ppx_perdoc = np.exp(ppx_sum / doc_count)
                print_kld = kld_sum / len(train_batches)

                with open(train_csv_filename, 'a') as train_csv:
                    train_writer = csv.writer(train_csv,
                                              delimiter=',',
                                              quotechar='|',
                                              quoting=csv.QUOTE_MINIMAL)
                    train_writer.writerow([
                        epoch + 1, print_mode, i, print_ppx, print_ppx_perdoc,
                        print_kld
                    ])

                print(
                    '| Epoch train: {:d} |'.format(epoch + 1),
                    print_mode,
                    '{:d}'.format(i),
                    '| Corpus ppx: {:.5f}'.format(
                        print_ppx),  # perplexity for all docs
                    '| Per doc ppx: {:.5f}'.format(
                        print_ppx_perdoc),  # perplexity for per doc
                    '| KLD: {:.5}'.format(print_kld))
        #-------------------------------
        # dev
        loss_sum = 0.0
        kld_sum = 0.0
        ppx_sum = 0.0
        word_count = 0
        doc_count = 0
        for idx_batch in dev_batches:
            data_batch, count_batch, mask = utils.fetch_data(
                dev_set, dev_count, idx_batch, FLAGS.vocab_size)
            input_feed = {model.x.name: data_batch, model.mask.name: mask}
            loss, kld = sess.run([model.objective, model.kld], input_feed)
            loss_sum += np.sum(loss)
            kld_sum += np.sum(kld) / np.sum(mask)
            word_count += np.sum(count_batch)
            count_batch = np.add(count_batch, 1e-12)
            ppx_sum += np.sum(np.divide(loss, count_batch))
            doc_count += np.sum(mask)
        print_ppx = np.exp(loss_sum / word_count)
        print_ppx_perdoc = np.exp(ppx_sum / doc_count)
        print_kld = kld_sum / len(dev_batches)

        with open(dev_csv_filename, 'a') as dev_csv:
            dev_writer = csv.writer(dev_csv,
                                    delimiter=',',
                                    quotechar='|',
                                    quoting=csv.QUOTE_MINIMAL)
            dev_writer.writerow(
                [epoch + 1, print_ppx, print_ppx_perdoc, print_kld])

        print('| Epoch dev: {:d} |'.format(epoch + 1),
              '| Perplexity: {:.9f}'.format(print_ppx),
              '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
              '| KLD: {:.5}'.format(print_kld))
        #-------------------------------
        # test
        if FLAGS.test:
            loss_sum = 0.0
            kld_sum = 0.0
            ppx_sum = 0.0
            word_count = 0
            doc_count = 0
            for idx_batch in test_batches:
                data_batch, count_batch, mask = utils.fetch_data(
                    test_set, test_count, idx_batch, FLAGS.vocab_size)
                input_feed = {model.x.name: data_batch, model.mask.name: mask}
                loss, kld = sess.run([model.objective, model.kld], input_feed)
                loss_sum += np.sum(loss)
                kld_sum += np.sum(kld) / np.sum(mask)
                word_count += np.sum(count_batch)
                count_batch = np.add(count_batch, 1e-12)
                ppx_sum += np.sum(np.divide(loss, count_batch))
                doc_count += np.sum(mask)
            print_ppx = np.exp(loss_sum / word_count)
            print_ppx_perdoc = np.exp(ppx_sum / doc_count)
            print_kld = kld_sum / len(test_batches)

            with open(test_csv_filename, 'a') as test_csv:
                test_writer = csv.writer(test_csv,
                                         delimiter=',',
                                         quotechar='|',
                                         quoting=csv.QUOTE_MINIMAL)
                test_writer.writerow(
                    [epoch + 1, print_ppx, print_ppx_perdoc, print_kld])

            print('| Epoch test: {:d} |'.format(epoch + 1),
                  '| Perplexity: {:.9f}'.format(print_ppx),
                  '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
                  '| KLD: {:.5}'.format(print_kld))
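The two helpers used throughout this example, utils.create_batches and utils.fetch_data, are not shown in the listing. A minimal sketch of what they might look like, assuming NVDM-style bag-of-words data where each document is a {word_id: count} dict and a short final batch is padded with a -1 index (which would also explain why the losses above are normalised by np.sum(mask)); the real utilities may differ in detail:

import random
import numpy as np

def create_batches(data_size, batch_size, shuffle=True):
    """Return a list of index batches; a short final batch is padded with -1."""
    ids = list(range(data_size))
    if shuffle:
        random.shuffle(ids)
    batches = [ids[i:i + batch_size] for i in range(0, data_size, batch_size)]
    rest = data_size % batch_size
    if rest > 0:
        batches[-1] += [-1] * (batch_size - rest)  # pad so every batch has batch_size entries
    return batches

def fetch_data(data, counts, idx_batch, vocab_size):
    """Turn one index batch into a dense bag-of-words matrix, per-doc word counts and a mask."""
    data_batch = np.zeros((len(idx_batch), vocab_size), dtype=np.float32)
    count_batch = np.zeros(len(idx_batch), dtype=np.float32)
    mask = np.zeros(len(idx_batch), dtype=np.float32)
    for pos, doc_id in enumerate(idx_batch):
        if doc_id == -1:
            continue  # padding entry; mask stays 0 so it does not contribute to the loss
        for word_id, freq in data[doc_id].items():
            data_batch[pos, word_id] = freq
        count_batch[pos] = counts[doc_id]
        mask[pos] = 1.0
    return data_batch, count_batch, mask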
Esempio n. 19
0
    # world_size (int, optional) – Number of processes participating in the job
    # init_method (str, optional) – URL specifying how to initialize the process group. Default is “env://” if no init_method or store is specified. Mutually exclusive with store.
    # setup()

    words = read_words(
        '/users/PAS1588/liuluyu0378/lab1/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled',
        seq_len, kernel[0])
    word_counter = collections.Counter(words).most_common(vocab_size - 1)
    vocab = [w for w, _ in word_counter]
    w2i = dict((w, i) for i, w in enumerate(vocab, 1))
    w2i['<unk>'] = 0
    print('vocab_size', vocab_size)
    print('w2i size', len(w2i))

    data = [w2i[w] if w in w2i else 0 for w in words]
    data = create_batches(data, batch_size, seq_len)
    split_idx = int(len(data) * 0.8)
    training_data = data[:split_idx]
    test_data = data[split_idx:]
    print('train samples:', len(training_data))
    print('test samples:', len(test_data))

    model = GatedCNN(seq_len, vocab_size, embd_size, n_layers, kernel, out_chs,
                     res_block_count, vocab_size)
    cuda = torch.cuda.is_available()
    if cuda:
        print("cuda")
        model.cuda()
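create_batches in this example receives a flat list of word ids plus batch_size and seq_len, and its result is split into train and test sets for a GatedCNN language model. A plausible sketch, assuming it produces (input, target) pairs of shape (batch_size, seq_len) with targets shifted by one token (an assumption; the project's own helper is not shown):

import numpy as np

def create_batches(data, batch_size, seq_len):
    """Split a flat list of word ids into (input, target) batches for next-word prediction."""
    step = batch_size * seq_len
    batches = []
    for start in range(0, len(data) - step - 1, step):  # drop the tail that cannot fill a batch
        chunk = np.asarray(data[start:start + step + 1])
        x = chunk[:-1].reshape(batch_size, seq_len)   # current words
        y = chunk[1:].reshape(batch_size, seq_len)    # next words, shifted by one position
        batches.append((x, y))
    return batches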
Esempio n. 20
0
def test_create_batches():
    batches = u.create_batches(range(10), 2)
    for i, batch in enumerate(batches):
        assert list(batch) == list(range(i * 2, i * 2 + 2))
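A create_batches that satisfies this test only has to cut the input into consecutive fixed-size chunks. A minimal sketch (assumed, not necessarily the project's actual implementation, which may also keep a shorter final chunk):

def create_batches(items, batch_size):
    """Yield consecutive slices of `items`, each of length at most batch_size."""
    items = list(items)
    for start in range(0, len(items), batch_size):
        yield items[start:start + batch_size]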
Esempio n. 21
0
    def train_x(
            self,
            dev_set_with_lab,
            dev_set_without_lab,
            dev_set_y,
            train_set_with_lab,
            train_set_without_lab,
            train_set_y,
            test_set,
            test_set_y,
            to_label,
            model_name,  # 10
            warm_up_period=100,
            n_dropout_rounds=100,
            max_learning_iterations=100,
            no_improvement_iterations=15,
            semi_supervised=True,
            debug=True,
            it=1):

        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        self.it = it
        is_training = False
        dev_batches_with_lab, dev_batches_without_lab = utils.create_batches_new(
            len(dev_set_y),
            len(dev_set_without_lab),
            self.batch_size,
            shuffle=False)

        test_batches = utils.create_batches(len(test_set_y),
                                            self.batch_size,
                                            shuffle=False)
        # train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
        #                                    sess.graph)
        warm_up = 0
        min_alpha = 0.001
        best_print_ana_ppx = 1e10
        no_improvement_iters = 0
        stopped = False
        epoch = -1

        while not stopped:
            epoch += 1
            train_batches_with_lab, train_batches_without_lab = utils.create_batches_new(
                len(train_set_with_lab),
                len(train_set_without_lab),
                self.batch_size,
                shuffle=True)
            if warm_up < 1.:
                warm_up += 1. / warm_up_period
            else:
                warm_up = 1.

            self.run_model(train_batches_with_lab,
                           train_set_with_lab,
                           train_set_y,
                           train_batches_without_lab,
                           train_set_without_lab,
                           debug,
                           semi_supervised,
                           epoch,
                           warm_up,
                           min_alpha,
                           sess,
                           optim=self.optim,
                           keep_prop=0.75,
                           print_statement="training",
                           training=True)

            print_ana_loss, _, _ = self.run_model(dev_batches_with_lab,
                                                  dev_set_with_lab,
                                                  dev_set_y,
                                                  dev_batches_without_lab,
                                                  dev_set_without_lab,
                                                  debug,
                                                  semi_supervised,
                                                  epoch,
                                                  warm_up,
                                                  min_alpha,
                                                  sess,
                                                  optim=None,
                                                  keep_prop=1.0,
                                                  print_statement="dev",
                                                  training=is_training)
            if debug:
                _, f1_measure_test, _ = self.run_model(test_batches,
                                                       test_set,
                                                       test_set_y,
                                                       test_batches,
                                                       test_set,
                                                       debug,
                                                       semi_supervised,
                                                       epoch,
                                                       warm_up,
                                                       min_alpha,
                                                       sess,
                                                       optim=None,
                                                       keep_prop=1.0,
                                                       print_statement="test",
                                                       training=is_training)
                print("TEST F1:", f1_measure_test)

            if print_ana_loss < best_print_ana_ppx:
                no_improvement_iters = 0
                best_print_ana_ppx = print_ana_loss
                #tf.train.Saver().save(sess, model_name + '/improved_model')
            else:
                no_improvement_iters += 1
                print("No improvement: ", no_improvement_iters, "epoch:",
                      epoch)
                if no_improvement_iters >= no_improvement_iterations:
                    break
        # print("load best dev f1 model...")
        #tf.train.Saver().restore(sess, model_name + '/improved_model')

        _, f1_measure, prop_clss = self.run_model(test_batches,
                                                  test_set,
                                                  test_set_y,
                                                  test_batches,
                                                  test_set,
                                                  debug,
                                                  semi_supervised,
                                                  epoch,
                                                  warm_up,
                                                  min_alpha,
                                                  sess,
                                                  optim=None,
                                                  keep_prop=1.0,
                                                  print_statement="test",
                                                  training=is_training,
                                                  model_name=model_name)

        data_batch_labeled, mask = utils.fetch_data_without_idx_new(
            to_label, self.vocab_size)
        data_batch_y, data_batch_y_neg, data_batch_y_pos = utils.fetch_data_y_dummy_new(
            to_label, self.n_class)
        input_feed = {
            self.x_labeled: data_batch_labeled,
            self.x_unlabeled: data_batch_labeled,
            self.y_labeled: data_batch_y,
            self.y_neg: data_batch_y_neg,
            self.y_pos: data_batch_y_pos,
            self.mask.name: mask,
            self.keep_prob: 0.75,
            self.warm_up: warm_up,
            self.min_alpha: min_alpha,
            self.training.name: is_training
        }

        prediction = [
            sess.run(([self.out_y]), input_feed)
            for _ in range(n_dropout_rounds)
        ]
        return f1_measure[1], prediction, prop_clss
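utils.create_batches_new is not included in the listing. Given that its two return values are iterated as index batches over the labelled and the unlabelled portion respectively, a reasonable sketch (reusing the same assumed -1 padding convention as the index-batch sketch above) is:

import random

def create_batches_new(n_labeled, n_unlabeled, batch_size, shuffle=True):
    """Return (labeled_batches, unlabeled_batches), each a list of index lists."""
    def index_batches(n):
        ids = list(range(n))
        if shuffle:
            random.shuffle(ids)
        batches = [ids[i:i + batch_size] for i in range(0, n, batch_size)]
        if n % batch_size:
            batches[-1] += [-1] * (batch_size - n % batch_size)  # pad the short final batch
        return batches
    return index_batches(n_labeled), index_batches(n_unlabeled)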
Esempio n. 22
0
def evaluate(model, training_data, training_count, session, step, train_loss=None, epoch=None, summaries=None, writer=None, saver=None):

  #Get theta for the H1.
  data_url = os.path.join(FLAGS.data_dir, 'valid_h1.feat' if step != 'test' else 'test_h1.feat')
  dataset, dataset_count = utils.data_set(data_url)
  data_batches = utils.create_batches(len(dataset), FLAGS.batch_size, shuffle=False)
   
  theta = []
  for idx_batch in data_batches:

    data_batch, count_batch, mask = utils.fetch_data(dataset, dataset_count, idx_batch, FLAGS.vocab_size)
    input_feed = {model.x.name: data_batch, model.mask.name: mask}

    logit_theta = session.run(model.doc_vec, input_feed)
    theta.append(softmax(logit_theta, axis=1)) 

  theta = np.concatenate(theta, axis=0)

  weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='decoder/projection/Matrix:0')[0].eval(session)
  bias = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='decoder/projection/Bias:0')[0].eval(session)
  beta = softmax(weights + bias, axis=1)

  #H2 to calculate perplexity.
  data_url = os.path.join(FLAGS.data_dir, 'valid_h2.feat' if step != 'test' else 'test_h2.feat')
  dataset, dataset_count = utils.data_set(data_url)
  data_batches = utils.create_batches(len(dataset), FLAGS.batch_size, shuffle=False)

  test_data = [utils.fetch_data(dataset, dataset_count, idx_batch, FLAGS.vocab_size)[0] for idx_batch in data_batches]
  test_data = np.concatenate(test_data, axis=0)

  perplexity = get_perplexity(test_data, theta, beta)
  coherence = get_topic_coherence(beta, training_data, 'nvdm') if step == 'test' else np.nan
  diversity = get_topic_diversity(beta, 'nvdm') if step == 'test' else np.nan
    
  if step == 'val':

    #tloss = tf.get_default_graph().get_tensor_by_name('tloss:0') 
    #vppl = tf.get_default_graph().get_tensor_by_name('vppl:0') 

    #weight_summaries = session.run(summaries, feed_dict={tloss: train_loss, vppl: perplexity})
    #weight_summaries = summaries.eval(session=session)
    #writer.add_summary(weight_summaries, epoch)
    save_path = saver.save(session, os.path.join(ckpt, 'model.ckpt'))

    print("Model saved in path: %s" % ckpt)
    print('| Epoch dev: {:d} |'.format(epoch+1)) 

  else:
    
    ## get most used topics
    cnt = 0
    thetaWeightedAvg = np.zeros((1, FLAGS.n_topic))
    data_batches = utils.create_batches(len(training_data), FLAGS.batch_size, shuffle=False)

    for idx_batch in data_batches:

        batch, count_batch, mask = utils.fetch_data(training_data, training_count, idx_batch, FLAGS.vocab_size)
        sums = batch.sum(axis=1)
        cnt += sums.sum(axis=0)

        input_feed = {model.x.name: batch, model.mask.name: mask}
        logit_theta = session.run(model.doc_vec, input_feed)
        theta = softmax(logit_theta, axis=1)
        weighted_theta = (theta.T * sums).T
        thetaWeightedAvg += weighted_theta.sum(axis=0)

    thetaWeightedAvg = thetaWeightedAvg.squeeze() / cnt
    print('\nThe 10 most used topics are {}'.format(thetaWeightedAvg.argsort()[::-1][:10]))

    with open(FLAGS.data_dir + '/vocab.new', 'rb') as f:
      vocab = pkl.load(f)

    topic_indices = list(np.random.choice(FLAGS.n_topic, 10)) # 10 random topics
    print('\n')

    with open(ckpt + '/topics.txt', 'w') as f:
      for k in range(FLAGS.n_topic):
        gamma = beta[k]
        top_words = list(gamma.argsort()[-FLAGS.n_words+1:][::-1])
        topic_words = [vocab[a] for a in top_words]
        f.write(str(k) + ' ' + str(topic_words) + '\n')
        print('Topic {}: {}'.format(k, topic_words))

  with open(ckpt + '/' + step + '_scores.csv', 'a') as handle:
    handle.write(str(perplexity) + ',' + str(coherence) + ',' + str(diversity) + '\n')
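get_perplexity is defined elsewhere; the usual bag-of-words formulation that fits this call (an assumption, not the verified implementation) is the exponential of the negative per-token log likelihood under the document-topic mixture theta·beta:

import numpy as np

def get_perplexity(counts, theta, beta, eps=1e-12):
    """counts: (docs, vocab) word counts; theta: (docs, topics); beta: (topics, vocab)."""
    probs = theta @ beta                                        # per-document word distribution
    doc_loglik = np.sum(counts * np.log(probs + eps), axis=1)   # eps guards against log(0)
    return np.exp(-doc_loglik.sum() / counts.sum())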
Esempio n. 23
0
#%%
model = models.MLPmod(7, dimensions, nn.ReLU)
out = model(X)
out.shape
#%%
model = models.MLPmod(7, dimensions, nn.ReLU)
mae_trains = []
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

#x_test, target_test = utils.create_batches(X, Y, 64, 1)

for epoch in range(1500):

    x, y = utils.create_batches(X, Y, 256, 1)

    x = torch.tensor(x).type(dtype=torch.float)
    y = torch.tensor(y).type(dtype=torch.float)

    model.train()
    output = model(x)
    loss = criterion(output, y)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    model.eval()

    with torch.no_grad():
Esempio n. 24
0
def train(sess, model, train_url, batch_size, training_epochs=1000, alternate_epochs=10):

  train_set, train_count = utils.data_set(train_url)

  summaries = None  # get_summaries(sess)
  writer = None  # tf.summary.FileWriter(ckpt + '/logs/', sess.graph)
  saver = tf.train.Saver()

  sess.graph.finalize()
 
  total_mem = 0
  mem = 0
 
  for epoch in range(training_epochs):

    train_batches = utils.create_batches(len(train_set), batch_size, shuffle=True)

    for switch in range(0, 2):

      if switch == 0:
        optim = model.optim_dec
        print_mode = 'updating decoder'
      else:
        optim = model.optim_enc
        print_mode = 'updating encoder'

      for i in range(alternate_epochs):

        loss_sum = 0.0
        ppx_sum = 0.0
        kld_sum = 0.0
        word_count = 0
        doc_count = 0

        for idx_batch in train_batches:

          data_batch, count_batch, mask = utils.fetch_data(train_set, train_count, idx_batch, FLAGS.vocab_size)
          input_feed = {model.x.name: data_batch, model.mask.name: mask}
          _, (loss, kld) = sess.run((optim, [model.objective, model.kld]), input_feed)

          #loss, kld = tf.cast(loss, tf.float64), tf.cast(kld, tf.float64)
          loss_sum += np.sum(loss)
          kld_sum += np.sum(kld) / np.sum(mask)  
          word_count += np.sum(count_batch)
          # to avoid nan error
          count_batch = np.add(count_batch, 1e-12)
          # per document loss
          ppx_sum += np.sum(np.divide(loss, count_batch)) 
          doc_count += np.sum(mask)
        print_ppx = np.exp(loss_sum / word_count)
        print_ppx_perdoc = np.exp(ppx_sum / doc_count)
        print_kld = kld_sum/len(train_batches)
        print('| Epoch train: {:d} |'.format(epoch+1), 
               print_mode, '{:d}'.format(i),
               '| Corpus ppx: {:.5f}'.format(print_ppx),  # perplexity for all docs
               '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),  # perplexity per doc
               '| KLD: {:.5}'.format(print_kld))
        
    evaluate(model, train_set, train_count, sess, 'val', (loss_sum + kld_sum), epoch, summaries, writer, saver)

    current_mem = process.memory_info().rss / (1024 ** 2)
    total_mem += (current_mem - mem)
    print("Memory increase: {}, Cumulative memory: {}, and current {} in MB".format(current_mem - mem, total_mem, current_mem))
    mem = current_mem
    gc.collect()
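The memory bookkeeping at the end of each epoch reads process.memory_info().rss, so `process` is presumably a module-level psutil handle created once at import time, e.g. (an assumption based on the usual pattern):

import os
import psutil

process = psutil.Process(os.getpid())  # process.memory_info().rss is then the resident set size in bytes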
Esempio n. 25
0
def train(sess, model,
          train_url,
          test_url,
          batch_size,
          vocab_size,
          training_epochs=200,
          alternate_epochs=1,  # 10
          lexicon=[],
          result_file='test.txt',
          B=1,
          warm_up_period=100):
  """train nvdm model."""
  train_set, train_count = utils.data_set(train_url)
  test_set, test_count = utils.data_set(test_url)
  # hold-out development dataset
  train_size = len(train_set)
  validation_size = int(train_size * 0.1)
  dev_set = train_set[:validation_size]
  dev_count = train_count[:validation_size]
  train_set = train_set[validation_size:]
  train_count = train_count[validation_size:]
  print('sizes', train_size, validation_size, len(dev_set), len(train_set))
  optimize_jointly = True
  dev_batches = utils.create_batches(len(dev_set), batch_size, shuffle=False)
  test_batches = utils.create_batches(len(test_set), batch_size, shuffle=False)

  warm_up = 0
  start_min_alpha = 0.00001
  min_alpha = start_min_alpha
  warm_up_alpha=False
  start_B=4
  curr_B=B
  
  #for early stopping
  best_print_ana_ppx=1e10
  early_stopping_iters=30
  no_improvement_iters=0
  stopped=False
  epoch=-1
  #for epoch in range(training_epochs):
  while not stopped:
    epoch+=1
    train_batches = utils.create_batches(len(train_set), batch_size, shuffle=True)
    if warm_up<1.:
      warm_up += 1./warm_up_period
    else:
      warm_up=1.
   
    # train
    # NOTE: the alternating `for switch in range(0, 2):` loop is commented out;
    # with optimize_jointly=True the elif/else branches below are never reached
    # (and `switch` would be undefined if they were).
    #for switch in range(0, 2):
    if optimize_jointly:
      optim = model.optim_all
      print_mode = 'updating encoder and decoder'
    elif switch == 0:
      optim = model.optim_dec
      print_mode = 'updating decoder'
    else:
      optim = model.optim_enc
      print_mode = 'updating encoder'
    for i in range(alternate_epochs):
      loss_sum = 0.0
      ana_loss_sum = 0.0
      ppx_sum = 0.0
      kld_sum = 0.0
      ana_kld_sum = 0.0
      word_count = 0
      doc_count = 0
      recon_sum=0.0
      for idx_batch in train_batches:
        data_batch, count_batch, mask = utils.fetch_data(
            train_set, train_count, idx_batch, vocab_size)
        input_feed = {model.x.name: data_batch, model.mask.name: mask,
                      model.keep_prob.name: 0.75, model.warm_up.name: warm_up,
                      model.min_alpha.name: min_alpha, model.B.name: curr_B}
        _, (loss,recon, kld,ana_loss,ana_kld) = sess.run((optim, 
                                    [model.true_objective, model.recons_loss, model.kld,model.analytical_objective,model.analytical_kld]),
                                    input_feed)
        loss_sum += np.sum(loss)
        ana_loss_sum += np.sum(ana_loss)
        kld_sum += np.sum(kld) / np.sum(mask) 
        ana_kld_sum += np.sum(ana_kld) / np.sum(mask)
        word_count += np.sum(count_batch)
        # to avoid nan error
        count_batch = np.add(count_batch, 1e-12)
        # per document loss
        ppx_sum += np.sum(np.divide(loss, count_batch)) 
        doc_count += np.sum(mask)
        recon_sum+=np.sum(recon)
      print_loss = recon_sum/len(train_batches)
      dec_vars = utils.variable_parser(tf.trainable_variables(), 'decoder')
      phi = dec_vars[0]
      phi = sess.run(phi)
      utils.print_top_words(phi, lexicon,result_file=None)
      print_ppx = np.exp(loss_sum / word_count)
      print_ana_ppx = np.exp(ana_loss_sum / word_count)
      print_ppx_perdoc = np.exp(ppx_sum / doc_count)
      print_kld = kld_sum/len(train_batches)
      print_ana_kld = ana_kld_sum/len(train_batches)
      

      print('| Epoch train: {:d} |'.format(epoch+1), 
               print_mode, '{:d}'.format(i),
               '| Corpus ppx: {:.5f}'.format(print_ppx),  # perplexity for all docs
               '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),  # perplexity per doc
               '| KLD: {:.5}'.format(print_kld),
               '| Loss: {:.5}'.format(print_loss),
               '| ppx anal.: {:.5f}'.format(print_ana_ppx),
               '|KLD anal.: {:.5f}'.format(print_ana_kld))
    if warm_up_alpha:
      if min_alpha>0.0001:
        min_alpha-=(start_min_alpha-0.0001)/training_epochs
    #-------------------------------
    # dev
    loss_sum = 0.0
    kld_sum = 0.0
    ppx_sum = 0.0
    word_count = 0
    doc_count = 0
    recon_sum=0.0
    print_ana_ppx = 0.0
    ana_loss_sum = 0.0
    for idx_batch in dev_batches:
      data_batch, count_batch, mask = utils.fetch_data(
          dev_set, dev_count, idx_batch, vocab_size)
      input_feed = {model.x.name: data_batch, model.mask.name: mask,
                    model.keep_prob.name: 1.0, model.warm_up.name: 1.0,
                    model.min_alpha.name: min_alpha, model.B.name: B}
      loss,recon, kld,ana_loss = sess.run([model.objective, model.recons_loss, model.analytical_kld,model.analytical_objective],
                           input_feed)
      loss_sum += np.sum(loss)
      ana_loss_sum += np.sum(ana_loss)
      kld_sum += np.sum(kld) / np.sum(mask)  
      word_count += np.sum(count_batch)
      count_batch = np.add(count_batch, 1e-12)
      ppx_sum += np.sum(np.divide(loss, count_batch))
      doc_count += np.sum(mask) 
      recon_sum+=np.sum(recon)
    print_ana_ppx = np.exp(ana_loss_sum / word_count)
    print_ppx = np.exp(loss_sum / word_count)
    print_ppx_perdoc = np.exp(ppx_sum / doc_count)
    print_kld = kld_sum/len(dev_batches)
    print_loss = recon_sum/len(dev_batches)
    if print_ana_ppx<best_print_ana_ppx:
      no_improvement_iters=0
      best_print_ana_ppx=print_ana_ppx
      #check on validation set, if ppx better-> save improved model
      tf.train.Saver().save(sess, 'models/improved_model_bernoulli') 
    else:
      no_improvement_iters+=1
      print('no_improvement_iters',no_improvement_iters,'best ppx',best_print_ana_ppx)
      if no_improvement_iters>=early_stopping_iters:
          #if model has not improved for 30 iterations, stop training
          ###########STOP TRAINING############
          stopped=True
          print('stop training after',epoch,'iterations,no_improvement_iters',no_improvement_iters)
          ###########LOAD BEST MODEL##########
          print('load stored model')
          tf.train.Saver().restore(sess,'models/improved_model_bernoulli')
          
    print('| Epoch dev: {:d} |'.format(epoch+1), 
           '| Perplexity: {:.9f}'.format(print_ppx),
           '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
           '| KLD: {:.5}'.format(print_kld)  ,
           '| Loss: {:.5}'.format(print_loss))  
    #-------------------------------
    # test
    #if epoch%10==0 or epoch==training_epochs-1:
    if FLAGS.test:
      #if epoch==training_epochs-1:
      if stopped:
        #only do it once in the end
        coherence=utils.topic_coherence(test_set,phi, lexicon)
        print('topic coherence',str(coherence))
      loss_sum = 0.0
      kld_sum = 0.0
      ppx_sum = 0.0
      word_count = 0
      doc_count = 0
      recon_sum = 0.0
      ana_loss_sum = 0.0
      ana_kld_sum = 0.0
      for idx_batch in test_batches:
        data_batch, count_batch, mask = utils.fetch_data(
          test_set, test_count, idx_batch, vocab_size)
        input_feed = {model.x.name: data_batch, model.mask.name: mask,
                      model.keep_prob.name: 1.0, model.warm_up.name: 1.0,
                      model.min_alpha.name: min_alpha, model.B.name: B}
        loss, recon,kld,ana_loss,ana_kld = sess.run([model.objective, model.recons_loss,model.kld,model.analytical_objective,model.analytical_kld],
                             input_feed)
        loss_sum += np.sum(loss)
        kld_sum += np.sum(kld)/np.sum(mask) 
        ana_loss_sum += np.sum(ana_loss)
        ana_kld_sum += np.sum(ana_kld) / np.sum(mask)
        word_count += np.sum(count_batch)
        count_batch = np.add(count_batch, 1e-12)
        ppx_sum += np.sum(np.divide(loss, count_batch))
        doc_count += np.sum(mask) 
        recon_sum+=np.sum(recon)
      print_loss = recon_sum/len(test_batches)
      print_ppx = np.exp(loss_sum / word_count)
      print_ppx_perdoc = np.exp(ppx_sum / doc_count)
      print_kld = kld_sum/len(test_batches)
      print_ana_ppx = np.exp(ana_loss_sum / word_count)
      print_ana_kld = ana_kld_sum / len(test_batches)
      print('| Epoch test: {:d} |'.format(epoch+1), 
             '| Perplexity: {:.9f}'.format(print_ppx),
             '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
             '| KLD: {:.5}'.format(print_kld),
             '| Loss: {:.5}'.format(print_loss),
             '| ppx anal.: {:.5f}'.format(print_ana_ppx),
               '|KLD anal.: {:.5f}'.format(print_ana_kld)) 
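utils.print_top_words (and utils.topic_coherence) are external to this listing. A sketch of the former consistent with how it is called, assuming phi is an (n_topic, vocab) decoder weight matrix and lexicon maps word ids to strings:

def print_top_words(phi, lexicon, n_words=10, result_file=None):
    """Print the n_words highest-weighted words per topic; optionally append them to a file."""
    lines = []
    for k, row in enumerate(phi):
        top_ids = row.argsort()[::-1][:n_words]
        lines.append('Topic {}: {}'.format(k, ' '.join(lexicon[i] for i in top_ids)))
    for line in lines:
        print(line)
    if result_file is not None:
        with open(result_file, 'a') as handle:
            handle.write('\n'.join(lines) + '\n')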
Esempio n. 26
0
def model(batch_size, lr, dims, numEpochs, cuda, alpha, pathLoad, pathSave, epochSave, activation, modelType,
             computeEigVectorsOnline, regularizerFcn, _seed, _run):
    """
    Function for creating and training MLPs on MNIST.
    :param batch_size: specifies batch size
    :param rlr: learning rate of stochastic optimizer
    :param dims: A list of N tuples that specifies the input and output sizes for the FC layers. where the last layer is the output layer
    :param numEpochs: number of epochs to train the network for
    :param cuda: boolean variable that will specify whether to use the GPU or nt
    :param alpha: weight for regularizer on spectra. If 0, the regularizer will not be used
    :param pathLoad: path to where MNIST lives
    :param pathSave: path specifying where to save the models
    :param epochSave: integer specifying how often to save loss
    :param activation: string that specified whether to use relu or not
    :param _seed: seed for RNG
    :param _run: Sacred object that logs the relevant data and stores them to a database

    :param computeEigVectorsOnline: online or offline eig estimator
    :param regularizerFcn: function name that computes the discrepancy between the idealized and empirical eigs
    """
    device = 'cuda' if cuda else 'cpu'
    os.makedirs(pathSave, exist_ok=True)
    npr.seed(_seed)
    torch.manual_seed(_seed + 1)
    alpha = alpha * torch.ones(1, device=device)

    "Load in MNIST"
    fracVal = 0.1
    train, val, test = split_mnist(pathLoad, fracVal)
    trainData, trainLabels = train[0], train[1]
    valData, valLabels = val[0], val[1]
    testData, testLabels = test[0], test[1]
    numSamples = trainData.shape[0]

    if modelType == 'mlp':
        model = MLP(dims, activation=activation)  # create an MLP object
    elif modelType == 'cnn':
        model = CNN(dims, activation=activation)  # create a CNN object
    else:
        raise ValueError('WOAHHHHH RELAX: unknown modelType {}'.format(modelType))

    model = model.to(device)
    lossFunction = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    "Objects used to store performance metrics while network is training"
    trainSpectra = []  # store the (estimated) spectra of the network at the end of each epoch
    trainLoss = []  # store the training loss (reported at the end of each epoch on the last batch)
    trainRegularizer = []  # store the value of the regularizer during training
    valLoss = []  # validation loss
    valRegularizer = []  # validation regularizer

    "Sample indices for eigenvectors all at once"
    eigBatchIdx = npr.randint(numSamples, size=(numEpochs + 1, batch_size))

    "Get initial estimate of eigenvectors and check loss"
    with torch.no_grad():
        model.eigVec, loss, spectraTemp, regul = computeEigVectors(model, trainData[eigBatchIdx[0, :], :],
                                                                   trainLabels[eigBatchIdx[0, :]], lossFunction,
                                                                   alpha=alpha, cuda=cuda)
        trainSpectra.append(spectraTemp)  # store computed eigenspectra
        trainLoss.append(loss.cpu().item())  # store training loss
        _run.log_scalar("trainLoss", loss.item())
        _run.log_scalar("trainRegularizer", float(alpha * regul) )
        trainRegularizer.append(alpha * regul)  # store value of regularizer

        "Check on validation set"
        loss, regul = compute_loss(model, valData, valLabels, lossFunction, alpha, cuda=cuda)
        valLoss.append(loss.item())
        _run.log_scalar("valLoss", loss.item())
        valRegularizer.append(regul)
        prevVal = loss.item() + alpha * regul.item()  # use for early stopping
        prevModel = copy.deepcopy(model)

    patience = 0
    howMuchPatience = 4
    "Train that bad boy!"
    for epoch in tqdm(range(numEpochs), desc="Epochs", ascii=True, position=0, leave=False):
        batches = create_batches(batch_size=batch_size, numSamples=numSamples)  # create indices for batches
        for batch in tqdm(batches, desc='Train Batches', ascii=True, position=1, leave=False):
            optimizer.zero_grad()
            "Compute a forward pass through the network"
            loss, regul = compute_loss(model, trainData[batch, :], trainLabels[batch], lossFunction, alpha, cuda=cuda)
            lossR = loss + alpha * regul  # compute augmented loss function
            lossR.backward()  # backprop!
            optimizer.step()  # take a gradient step

        "Recompute estimated eigenvectors"
        with torch.no_grad():
            model.eigVec, loss, spectraTemp, regul = computeEigVectors(model, trainData[eigBatchIdx[epoch + 1, :], :],
                                                                       trainLabels[eigBatchIdx[epoch + 1, :]],
                                                                       lossFunction, alpha=alpha, cuda=cuda)
            trainSpectra.append(spectraTemp)  # store computed eigenspectra
            trainLoss.append(loss.cpu().item())  # store training loss
            _run.log_scalar("trainLoss", loss.item())
            trainRegularizer.append(alpha * regul)  # store value of regularizer
            if (epoch + 1) % epochSave == 0:
                "Check early stopping condition"
                loss, regul = compute_loss(model, valData, valLabels, lossFunction, alpha, cuda=cuda)
                currVal = loss.item() + alpha * regul.item()
                percentImprove = (currVal - prevVal) / prevVal
                if percentImprove > 0:
                    if patience > howMuchPatience:
                        model = prevModel
                        break
                    else:
                        patience += 1

                else:
                    patience = 0
                prevVal = currVal
                prevModel = copy.deepcopy(model)  # save for early stopping
                valLoss.append(loss.item())
                _run.log_scalar("valLoss", loss.item())
                valRegularizer.append(regul.item())
                _run.log_scalar("valRegularizer", regul.item())


    "Check accuracy on test set"
    outputs = model(testData.to(device))
    softMax = nn.Softmax(dim=1)
    probs = softMax(outputs.cpu())
    numCorrect = torch.sum(torch.argmax(probs, dim=1) == testLabels).detach().numpy() * 1.0
    testResult = numCorrect / testData.shape[0] * 100

    "Collect accuracy on validation set"
    outputs = model(valData.to(device))
    softMax = nn.Softmax(dim=1)
    probs = softMax(outputs).cpu()
    numCorrect = torch.sum(torch.argmax(probs, dim=1) == valLabels).detach().numpy() * 1.0
    valAcc = numCorrect / valData.shape[0] * 100
    _run.log_scalar("valAcc", valAcc.item())


    "Save everything for later analysis"
    model_data = {'parameters': model.cpu().state_dict(),
                  'training': (trainLoss, trainRegularizer, trainSpectra),
                  'val': (valLoss, valRegularizer, valAcc),
                  'test': testResult}

    if modelType == 'cnn':
        dims = dims[1:]  # first number is number of convolutional layers
    path = pathSave + modelType + '_' + activation + '_hidden=('
    for idx in range(len(dims) - 1):
        path = path + str(dims[idx][1]) + ','

    path = path + str(dims[-1][1]) + ')_lr=' + str(lr) + '_alpha=' + str(alpha) + '_batch_size=' \
           + str(batch_size) + '_seed=' + str(_seed) + '_epochs=' + str(numEpochs)
    torch.save(model_data, path)
    _run.add_artifact(path, "model_data.pt", content_type="application/octet-stream")  # saves the data dump as model_data
    # os.system('ls -l --block-size=M {}'.format(path))
    # shutil.rmtree(pathSave)
    # Return the validation accuracy for model comparison and selection
    return valAcc
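create_batches here is called with keyword arguments only, and each returned batch is used to index trainData and trainLabels directly, so it most likely produces index arrays over a fresh random permutation. A minimal sketch under that assumption:

import numpy as np

def create_batches(batch_size, numSamples):
    """Split a random permutation of sample indices into arrays of length batch_size."""
    perm = np.random.permutation(numSamples)
    n_batches = numSamples // batch_size  # the incomplete tail batch is dropped
    return [perm[i * batch_size:(i + 1) * batch_size] for i in range(n_batches)]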
Esempio n. 27
0
                cnn.input_x: x_batch,
                cnn.input_y: y_batch,
                cnn.dropout_keep_prob: 1.0
            }
            step, summaries, loss, accuracy = sess.run(
                [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(
                time_str, step, loss, accuracy))
            if writer:
                writer.add_summary(summaries, step)

        # Generate batches
        print("generating batches")
        batches = utils.create_batches(X_train, Y_train, FLAGS.batch_size,
                                       FLAGS.num_epochs)
        print("training")
        # Training loop. For each batch...
        for batch in batches:
            x_batch, y_batch = batch
            train_step(x_batch, y_batch)
            current_step = tf.train.global_step(sess, global_step)
            if current_step % FLAGS.evaluate_every == 0:
                print("\nEvaluation:")
                dev_step(X_test, Y_true, writer=dev_summary_writer)
                print("")
            if current_step % FLAGS.checkpoint_every == 0:
                path = saver.save(sess,
                                  checkpoint_prefix,
                                  global_step=current_step)
                print("Saved model checkpoint to {}\n".format(path))
Esempio n. 28
0
    def train_x(
            self,
            dev_set_with_lab,
            dev_set_without_lab,
            dev_set_y,
            train_set_with_lab,
            train_set_without_lab,
            train_set_y,
            test_set,
            test_set_y,
            to_label,
            model_name,  # 10
            lexicon=[],
            warm_up_period=100,
            n_dropout_rounds=100,
            max_learning_iterations=100,
            min_learning_iterations=35,
            no_improvement_iterations=15,
            semi_supervised=True,
            debug=True,
            it=0):
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        is_training = False
        dev_batches_with_lab = utils.create_batches(len(dev_set_y),
                                                    self.batch_size,
                                                    shuffle=False)
        dev_batches_without_lab = utils.create_batches(
            len(dev_set_without_lab), self.batch_size, shuffle=False)
        test_batches = utils.create_batches(len(test_set_y),
                                            self.batch_size,
                                            shuffle=False)
        labeled_training_rounds = math.ceil(
            float(len(train_set_without_lab)) / float(len(train_set_with_lab)))
        labeled_dev_rounds = math.ceil(
            float(len(dev_set_without_lab)) / float(len(dev_set_with_lab)))

        # train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
        #                                    sess.graph)
        warm_up = 0
        min_alpha = 0.001  #
        best_print_ana_ppx = 1e10
        no_improvement_iters = 0
        stopped = False
        epoch = -1

        while not stopped:
            epoch += 1
            train_batches_with_lab = utils.create_batches(
                len(train_set_with_lab), self.batch_size, shuffle=True)
            train_batches_without_lab = utils.create_batches(
                len(train_set_without_lab), self.batch_size, shuffle=True)
            if warm_up < 1.:
                warm_up += 1. / warm_up_period
            else:
                warm_up = 1.

            optim = self.optim_all

            self.run_model(labeled_training_rounds,
                           train_batches_with_lab,
                           train_set_with_lab,
                           train_set_y,
                           train_batches_without_lab,
                           train_set_without_lab,
                           debug,
                           semi_supervised,
                           epoch,
                           warm_up,
                           min_alpha,
                           sess,
                           optim=optim,
                           keep_prop=0.75,
                           print_statement="training",
                           training=True)

            print_ana_loss, print_loss1, f1_measure, _ = self.run_model(
                labeled_dev_rounds,
                dev_batches_with_lab,
                dev_set_with_lab,
                dev_set_y,
                dev_batches_without_lab,
                dev_set_without_lab,
                debug,
                semi_supervised,
                epoch,
                warm_up,
                min_alpha,
                sess,
                optim=None,
                keep_prop=1.0,
                print_statement="dev",
                training=is_training)

            if print_ana_loss < best_print_ana_ppx:
                no_improvement_iters = 0
                best_print_ana_ppx = print_ana_loss
                #tf.train.Saver().save(sess, model_name + '/improved_model')

            else:
                no_improvement_iters += 1
                #print("No improvement: ", no_improvement_iters)
                if no_improvement_iters >= no_improvement_iterations:
                    break

            # -------------------------------
            # test
            if debug:
                self.run_model(1,
                               test_batches,
                               test_set,
                               test_set_y, [],
                               None,
                               debug,
                               semi_supervised,
                               epoch,
                               warm_up,
                               min_alpha,
                               sess,
                               optim=None,
                               keep_prop=1.0,
                               print_statement="TEST",
                               training=is_training)

        # print("load best dev f1 model...")
        #tf.train.Saver().restore(sess, model_name + '/improved_model')

        _, _, f1_measure, test_pred = self.run_model(1,
                                                     test_batches,
                                                     test_set,
                                                     test_set_y, [],
                                                     None,
                                                     debug,
                                                     semi_supervised,
                                                     epoch,
                                                     warm_up,
                                                     min_alpha,
                                                     sess,
                                                     optim=None,
                                                     keep_prop=1.0,
                                                     print_statement="test",
                                                     training=is_training)

        data_batch, mask = utils.fetch_data_without_idx_new(
            to_label, self.vocab_size)
        data_batch_y = utils.fetch_data_y_dummy(to_label, self.n_class, 0)
        input_feed = {
            self.x.name: data_batch,
            self.y.name: data_batch_y,
            self.mask.name: mask,
            self.keep_prob.name: 0.75,
            self.warm_up.name: warm_up,
            self.min_alpha.name: min_alpha,
            self.prob: 0.75,
            self.lab: np.zeros((1)),
            self.idx.name: np.zeros((1), dtype=np.int32),
            self.training.name: is_training
        }

        prediction = [
            sess.run(([self.out_y]), input_feed)
            for _ in range(n_dropout_rounds)
        ]
        return f1_measure, prediction, test_pred
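The final list comprehension is a Monte-Carlo dropout loop: keep_prob stays below 1.0 at prediction time and the stochastic forward pass is repeated n_dropout_rounds times. Downstream code would typically reduce these rounds to a mean class probability and a simple disagreement score, for example (a sketch, not part of the original project):

import numpy as np

def summarize_dropout_rounds(prediction):
    """prediction: list of per-round outputs, each of shape (n_docs, n_class)."""
    stacked = np.stack([np.asarray(p).squeeze() for p in prediction])  # (rounds, docs, classes)
    mean_probs = stacked.mean(axis=0)                 # averaged class probabilities
    uncertainty = stacked.std(axis=0).mean(axis=1)    # mean per-class std as a disagreement score
    return mean_probs, uncertainty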
Esempio n. 29
0
def train(hparams, model_design, X, Y, data, data_dir="models/mlp", splits=5):
    """
    
    
    """
    epochs = hparams["epochs"]

    kf = KFold(n_splits=splits, shuffle=False)
    kf.get_n_splits(X)

    #rmse_train = np.zeros((splits, epochs))
    #rmse_val = np.zeros((splits, epochs))
    mae_train = np.zeros((splits, epochs))
    mae_val = np.zeros((splits, epochs))

    i = 0

    #performance = []
    #y_tests = []
    #y_preds = []

    for train_index, test_index in kf.split(X):

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]

        X_test = torch.tensor(X_test).type(dtype=torch.float)
        y_test = torch.tensor(y_test).type(dtype=torch.float)
        X_train = torch.tensor(X_train).type(dtype=torch.float)
        y_train = torch.tensor(y_train).type(dtype=torch.float)

        model = models.MLP(model_design["layer_sizes"])

        optimizer = optim.Adam(model.parameters(), lr=hparams["learningrate"])
        criterion = nn.MSELoss()

        #early_stopping = utils.EarlyStopping()

        for epoch in range(epochs):

            # Training
            model.train()

            x, y = utils.create_batches(X_train, y_train, hparams["batchsize"],
                                        hparams["history"])

            x = torch.tensor(x).type(dtype=torch.float)
            y = torch.tensor(y).type(dtype=torch.float)

            output = model(x)

            # Compute training loss
            loss = criterion(output, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Evaluate current model at test set
            model.eval()

            with torch.no_grad():
                pred_train = model(X_train)
                pred_test = model(X_test)
                #rmse_train[i, epoch] = utils.rmse(y_train, pred_train)
                #rmse_val[i, epoch] = utils.rmse(y_test, pred_test)
                val_loss = metrics.mean_absolute_error(y_test, pred_test)
                #early_stopping(val_loss)
                #if early_stopping.early_stop:
                #    break

                mae_train[i, epoch] = metrics.mean_absolute_error(
                    y_train, pred_train)
                mae_val[i, epoch] = val_loss

        # Predict with fitted model
        #with torch.no_grad():
        #    preds_train = model(X_train)
        #    preds_test = model(X_test)
        #    performance.append([utils.rmse(y_train, preds_train),
        #                        utils.rmse(y_test, preds_test),
        #                        metrics.mean_absolute_error(y_train, preds_train.numpy()),
        #                        metrics.mean_absolute_error(y_test, preds_test.numpy())])

        torch.save(model.state_dict(),
                   os.path.join(data_dir, f"{data}_model{i}.pth"))

        #y_tests.append(y_test.numpy())
        #y_preds.append(preds_test.numpy())

        i += 1

    running_losses = {
        "mae_train": mae_train,
        "mae_val": mae_val
    }  #, "rmse_val":rmse_val, "rmse_train":rmse_train, }

    return running_losses  #, performance #, y_tests, y_preds
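utils.create_batches is used here (and in the earlier MLPmod snippet) to draw a single random mini-batch per epoch from (X, Y); the exact role of the `history` argument cannot be recovered from these call sites, so the sketch below only treats it as a lower bound on the sampled index. Both the signature interpretation and that treatment are assumptions:

import numpy as np

def create_batches(x, y, batch_size, history=1):
    """Draw one random mini-batch of rows; indices start at history - 1 so that each
    sampled row has at least history - 1 predecessors available as context."""
    x, y = np.asarray(x), np.asarray(y)
    idx = np.random.randint(history - 1, len(x), size=batch_size)
    return x[idx], y[idx]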
        "-f", "--restore_file", default="./source_blstm_crf/source_model_crf",
        help="Path to rebuild the model and restore weights from checkpoint"
    )
    opts = optparser.parse_args()[0]

    batch_size = config.batch_size
    word2vec_emb_path = config.word2vec_emb_path
    glove_emb_path = config.glove_emb_path
    input_x, input_y = loader.prepare_input(config.datadir+config.train)
    if opts.char:
        char_emb, char_to_id, char_seq_len = utils.convert_to_char_emb(input_x)
        char_layer = BLSTM(config.char_lstm_size)
    emb_layer = Embedding(opts, word2vec_emb_path, glove_emb_path)
    seqlen, input_x = utils.convert_to_id(input_x, emb_layer.word_to_id)
    input_y, tag_to_id = utils.create_and_convert_tag_to_id(input_y)
    seqlen, inp = utils.create_batches(input_x, seqlen, input_y)

    num_labels = len(tag_to_id)
    lstm_size = 100
    blstm_layer = BLSTM(lstm_size)
    ff_layer = FeedForward(2*config.lstm_size, num_labels)

    if opts.char:
        #dimension of batch and sequence_len are collapsed as batch_size is 1.
        char_inp = tf.placeholder("float32", shape=[None, None, len(char_to_id)], name="char_input")
        char_seqlen = tf.placeholder("int32", shape=[None], name="char_seqlen")
    batch_input = tf.placeholder("int32", shape=[None, None], name="input")
    sequence_length = tf.placeholder("int32", shape=[None], name="seqlen")
    if opts.crf:
        labels = tf.placeholder("int32", shape=[None, None],  name="labels")
    else: