Example #1
    def train(self, train_data, dev_data, epochs):
        """
            This function trains the model. 

            No need to change this function.
        """
        acc = 0
        best_weights = {}
        best_acc = 0
        for n in range(epochs):
            shuffle(train_data)
            stdout.write("Epoch %u : " % (n + 1))
            for ex in train_data:
                self.N += 1
                self.estimate_ex(ex)
            sys_classes = self.classify(dev_data)
            acc, _ = evaluate(sys_classes, dev_data)

            sys_classes_1 = self.classify(train_data)
            t_acc, _ = evaluate(sys_classes_1, train_data)

            print("Train accuracy %.2f%%, Dev accuracy %.2f%%" % (t_acc, acc))

            if acc > best_acc:
                # Snapshot the weights: storing a bare reference would let
                # later updates overwrite the best model. Use copy.deepcopy
                # instead if self.W holds nested containers.
                best_weights = dict(self.W)
                best_acc = acc
                print('New best dev accuracy for these parameters')
            if t_acc > 99:
                print('Training accuracy high enough, stopping early')
                self.W = best_weights
                return best_acc

        self.W = best_weights
        return best_acc
Example #2
    def test(self, test_set):
        # Map word and label indices back to their string forms.
        real_seq = [[self.idx2words[x] for x in seq] for seq in test_set[0]]
        real_label = [[self.idx2labels[x] for x in seq]
                      for seq in test_set[1]]
        real_test_set = (real_seq, real_label)

        slot_predict = None
        intent_predict = None
        saver = tf.train.Saver()
        with tf.Session(config=self.config) as sess:
            saver.restore(sess, self.model_path)
            feed_dict, seq_len_list = self.get_feed_dict(test_set[0])
            if self.slot_filling:
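                # With a CRF output layer, decode the best tag sequence with
                # Viterbi; otherwise take the per-token softmax predictions.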
                if self.CRF:
                    logits, transition_params = sess.run(
                        [self.logits_slot, self.transition_params],
                        feed_dict=feed_dict)
                    slot_predicts = list()
                    for logit, seq_len in zip(logits, seq_len_list):
                        viterbi_seq, _ = viterbi_decode(
                            logit[:seq_len], transition_params)
                        slot_predicts.append(viterbi_seq)
                else:
                    slot_predicts = sess.run(self.labels_softmax,
                                             feed_dict=feed_dict)
                slot_predict = list()
                for i in range(len(test_set[0])):
                    seq_len = len(test_set[0][i])
                    predicted_seq = [
                        self.idx2labels[x]
                        for x in slot_predicts[i][:seq_len]
                    ]
                    slot_predict.append(predicted_seq)
            if self.intent_detection:
                intent_predicts = sess.run(self.intents_softmax,
                                           feed_dict=feed_dict)
                for i in range(len(test_set[0])):
                    if test_set[2][i] == intent_predicts[i]:
                        # TODO intent eval
                        pass

        evaluate(real_test_set, slot_predict, intent_predict,
                 self.error_example_output, self.true_example_output,
                 self.slot_distinct)
Example #3
    def train(self, train_data, dev_data, mode, epochs):
        """
            This function trains the model. 

            No need to change this function.
        """
        for n in range(epochs):
            shuffle(train_data)
            stdout.write("Epoch %u : " % (n + 1))
            for ex in train_data:
                self.N += 1
                self.estimate_ex(ex, mode)
            sys_classes = self.classify(dev_data, mode)
            acc, _ = evaluate(sys_classes, dev_data)

            sys_classes_1 = self.classify(train_data, mode)
            t_acc, _ = evaluate(sys_classes_1, train_data)

            print("Train accuracy %.2f%%, Dev accuracy %.2f%%" % (t_acc, acc))

            # Early stopping: halt once training accuracy saturates, to
            # limit overfitting.
            if t_acc >= 99.5:
                print('Training accuracy high enough, stopping training')
                break
Example #4
def main():
    train_utts = data.read_file("Brown_train_unseg.txt")
    lexicon = build_lexicon(train_utts[:])

    test_utts = data.read_file("Brown_test_unseg.txt")
    goldsegs_train = data.get_goldsegs("Brown_train_gold.txt")
    goldsegs_test = data.get_goldsegs("Brown_test_gold.txt")

    for utts, goldsegs, title in ((train_utts, goldsegs_train, "Training"),
                                  (test_utts, goldsegs_test, "Testing")):
        joineds, segs = get_segpoints(lexicon, utts)
        stats = data.evaluate(goldsegs, segs)
        print(title)
        print("P: %s\tR: %s\t\tF1: %s\n" %
              tuple([round(stat * 100, 2) for stat in stats]))
Example #5
def main():
    train_utts = data.read_file("Brown_train_unseg.txt")
    probdict = get_bigramprobdict(train_utts)

    test_utts = data.read_file("Brown_test_unseg.txt")
    goldsegs_train = data.get_goldsegs("Brown_train_gold.txt")
    goldsegs_test = data.get_goldsegs("Brown_test_gold.txt")

    for utts, goldsegs, title in ((train_utts, goldsegs_train, "Training"),
                                  (test_utts, goldsegs_test, "Testing")):
        sylls = [data.tokenize_syllables(utt) for utt in utts]
        segs = get_segpoints(sylls, probdict)

        # Use data.apply_boundaries to help visualize the segmentations for
        # debugging and answering analysis questions, e.g.:
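        # A hedged sketch; the exact signature of data.apply_boundaries
        # (utterance text plus its boundary set) is an assumption:
        #   for utt, seg in list(zip(utts, segs))[:3]:
        #       print(data.apply_boundaries(utt, seg))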

        stats = data.evaluate(goldsegs, segs)
        print(title)
        print("P: %s\tR: %s\t\tF1: %s\n" %
              tuple([round(stat * 100, 2) for stat in stats]))
Example #6
def main():
    train_utts = data.read_file("Brown_train_unseg.txt")
    train_tokenized = [data.tokenize_syllables(utt) for utt in train_utts]
    baselinesegs_train = baseline_segs(train_tokenized)

    test_utts = data.read_file("Brown_test_unseg.txt")
    test_tokenized = [data.tokenize_syllables(utt) for utt in test_utts]
    baselinesegs_test = baseline_segs(test_tokenized)

    goldsegs_train = data.get_goldsegs("Brown_train_gold.txt")
    goldsegs_test = data.get_goldsegs("Brown_test_gold.txt")

    for basesegs, goldsegs, title in ((baselinesegs_train, goldsegs_train,
                                       "Training"),
                                      (baselinesegs_test, goldsegs_test,
                                       "Testing")):
        stats = data.evaluate(goldsegs, basesegs)
        print(title)
        print("P: %s\tR: %s\t\tF1: %s\n" %
              tuple([round(stat * 100, 2) for stat in stats]))
Example #7
pyp.ylim(0, 100)
pyp.xlim(0, 100)
pyp.scatter(outsT[0, :], outsT[1, :], 20)
pyp.gca().set_aspect('equal', adjustable='box')
r = np.corrcoef(outsT[0, :], outsT[1, :])[0, 1]
print('R2 value is ' + str(r * r))
slope, intercept, r_value, p_value, std_err = stats.linregress(
    outsT[0, :], outsT[1, :])
line = slope * outsT[0, :] + intercept
pyp.plot(outsT[0, :], line)
stng = 'y = %.3fx + %.3f\nR2 = %.3f' % (slope, intercept, r * r)
pyp.text(15, 70, stng)

# %% Apply to validation set as pre-test
(outs, names) = data.evaluate(Model, 'val', sourceparam)

# %% Plot validation results
pyp.figure(figsize=(8, 8))
pyp.ylim(0, 100)
pyp.xlim(0, 100)
pyp.scatter(outs[0, :], outs[1, :], 20)
pyp.gca().set_aspect('equal', adjustable='box')
r = np.corrcoef(outs[0, :], outs[1, :])[0, 1]
print('R2 value is ' + str(r * r))
slope, intercept, r_value, p_value, std_err = stats.linregress(
    outs[0, :], outs[1, :])
line = slope * outs[0, :] + intercept
pyp.plot(outs[0, :], line)
stng = 'y = %.3fx + %.3f\nR2 = %.3f' % (slope, intercept, r * r)
Example #8
def write(data, solution):
    # Embed the evaluation score in the output file name.
    name = file_name + '_' + str(evaluate(data, solution)) + output_extension
    path = join(output_solve_folder, name)
    write_data(path, solution)
Example #9
def train(c):
    c.setdefault(hebbian=False)
    net = eval(c.model)(c)

    emb_params = count_params(net.embed) + count_params(
        net.loss.projections) + count_params(net.loss.clusters)
    opt = get_opt(c, net)
    net, opt, step = c.init_model(net, opt=opt, step='max', train=True)
    step_lr = scheduler(c, opt, step)

    if c.get('distill'):
        data_tr_distill = DistillationSampleIterator(c, c.train_batch)
        iter_tr_distill = iter(data_tr_distill)
    else:
        data_tr = SampleIterator(c,
                                 c.train_batch,
                                 split='valid' if c.debug else 'train')
        iter_tr = iter(data_tr)
    data_val = SequentialIterator(c, c.eval_batch, split='valid')

    s = Namespace(net=net, opt=opt, step=step)
    c.on_train_start(s)

    c.log('Embedding has %s parameters' % emb_params)

    if c.hebbian:
        counters = [
            torch.ones(end - start, dtype=torch.long, device=c.device)
            for start, end in zip([0] + c.cutoffs, c.cutoffs + [c.n_vocab])
        ]
        temp_counters = [torch.zeros_like(x) for x in counters]

    best_val_loss = np.inf
    if s.results is not None and 'val_loss' in s.results.columns:
        best_val_loss = s.results['val_loss'].dropna().min()
    try:
        while step < s.step_max:
            step_lr(step)
            t_s = time()

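            # Distillation batches carry the teacher's soft label ids and
            # probabilities alongside the usual hard token labels.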
            if c.get('distill'):
                hard_labels, soft_labels, soft_probs = next(iter_tr_distill)
                hard_labels = to_torch(hard_labels, c.device).t()

                soft_labels = to_torch(soft_labels, c.device).permute(1, 0,
                                                                      2)[1:]
                soft_probs = to_torch(soft_probs, c.device).permute(1, 0,
                                                                    2)[1:]

                inputs, hard_labels = hard_labels[:-1], hard_labels[1:]
                preds = net(inputs=inputs,
                            labels=hard_labels,
                            soft_labels=soft_labels,
                            soft_probs=soft_probs,
                            current_step=step)
            else:
                x = to_torch(next(iter_tr), c.device).t()
                inputs, labels = x[:-1], x[1:]
                preds = net(inputs, labels)
            loss = preds['loss']

            opt.zero_grad()
            if torch.isnan(loss):
                raise RuntimeError('Encountered nan loss during training')
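            # At opt_level O0 run plain fp32 backprop; otherwise apex amp
            # scales the loss to avoid fp16 gradient underflow.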
            if c.opt_level == 'O0':
                loss.backward()
            else:
                with amp.scale_loss(loss, opt) as scaled_loss:
                    scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(net.parameters(),
                                           c.get('clip_grad', 0.5))
            opt.step()

            if c.hebbian:
                hebbian_weight_update(c, net, preds['hiddens'], counters,
                                      temp_counters)

            time_model = np.round(time() - t_s, 5)
            loss = from_torch(loss)
            perplexity = np.nan if loss > 5 else np.e**loss
            step_result = pd.Series(
                dict(
                    loss=loss,
                    perplexity=perplexity,
                    time=time_model,
                )).add_prefix('train_')
            step_result['lr'] = next(iter(opt.param_groups))['lr']
            if c.get('use_cache'):
                step_result['theta'] = from_torch(preds['theta'])
                step_result['lambda'] = from_torch(preds['lambda'])

            s.step = step = step + 1
            if step % c.step_eval == 0:
                step_result = step_result.append(
                    pd.Series(evaluate(c, data_val, net)).add_prefix('val_'))
                s.record_step = step_result['val_loss'] < best_val_loss
                clear_gpu_memory()
            s.step_result = step_result
            c.on_step_end(s)
    except Exception as e:
        import traceback
        err = traceback.format_exc()
        if c.main:
            c.log(err)
        else:
            print(err)
    finally:
        c.on_train_end(s)
Example #10
if __name__ == '__main__':
    c = Config.from_args().setdefault(model='model.Transformer')
    evals = [x for x in ['valid', 'test'] if c.get(x)]
    if len(evals):
        net = eval(c.model)(c)
        net, step = c.init_model(net, step=c.get('step', 'max'), train=False)
        print('Model at step', step)

        emb_params = count_params(net.embed) + count_params(
            net.loss.projections) + count_params(net.loss.clusters)
        print('Model has %s parameters. Embedding has %s parameters' %
              (count_params(net), emb_params))

        cache_search_path = c.res / ('cache_step%s_n%s.yaml' %
                                     (step, c.get('n_cache')))
        if c.get('use_cache_search', True) and cache_search_path.exists():
            for k in 'cache_theta_init', 'cache_lambda_init':
                if c.get(k):
                    c.unvar(k)
            params = cache_search_path.load()
            c.var(**params)
            print('Loaded cache search parameters')
            print(params)

        for split in evals:
            data = SequentialIterator(c, c.eval_batch, split=split)
            print(split, evaluate(c, data, net))
    else:
        train(c)
Example #11
    lambda_init = from_torch(state['loss.cache_lambda_inv_sigmoid'].sigmoid())
    theta_init = from_torch(F.softplus(state['loss.cache_theta_inv_softplus']))
    print('trained lambda', lambda_init)
    print('trained theta', theta_init)
else:  # initial cache parameters if we didn't train cache parameters
    lambda_init = 0.1
    theta_init = 0.016
    print('initial lambda', lambda_init)
    print('initial theta', theta_init)

c.var(use_cache=True,
      n_cache=c.n_cache,
      cache_lambda=lambda_init,
      cache_theta=theta_init)

ppl = evaluate(c, data, net)['perplexity']
lam, theta = lambda_init, theta_init
lam_delta, theta_delta = lam / 20, theta / 20
print('Initial val PPL=%s    lambda=%.3g (%.3g)    theta=%.3g (%.3g)' %
      (ppl, lam, lam_delta, theta, theta_delta))
while True:
    ppl_plus = evaluate(c.var(cache_lambda=lam + lam_delta, cache_theta=theta),
                        data, net)['perplexity']
    ppl_minus = evaluate(
        c.var(cache_lambda=lam - lam_delta, cache_theta=theta), data,
        net)['perplexity']
    new_ppl, new_lam = min((ppl_plus, lam + lam_delta),
                           (ppl_minus, (lam - lam_delta)))
    if new_ppl < ppl:
        ppl, lam = new_ppl, new_lam
        lam_delta *= 1.2
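    else:
        # Hedged sketch of the tail omitted from this excerpt: shrink the
        # step when neither direction helps and stop once it is negligible
        # (an assumption about how the original search terminates).
        lam_delta /= 2
        if lam_delta < lam * 1e-3:
            break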
Example #12
                                       feed_dict=trainBatch)
 if (i + 1) % 20 == 0:
     tfSummaryWriter.add_summary(summary, i + 1)
 if (i + 1) % 100 == 0:
     print(
         "it. {0}/{1} (lr={5:.2e},{4:.2e}), loss={2:.4f}, time={3:.4f}".
         format(i + 1, maxIterN, trainBatchLoss,
                time.time() - timeStart, currLearningRate[0],
                currLearningRate[1]))
 if (i + 1) % 2000 == 0:
     # update image summaries
     if imageSummaries is not None:
         summary = sess.run(imageSummaries, feed_dict=trainBatch)
         tfSummaryWriter.add_summary(summary, i + 1)
     # evaluate on validation and test sets
     testAccuracy = data.evaluate(testData, imageRawBatch, pInitBatch,
                                  labelBatch, prediction, sess, params)
     testError = (1 - testAccuracy) * 100
     summary = sess.run(testErrorSummary,
                        feed_dict={testErrorPH: testError})
     tfSummaryWriter.add_summary(summary, i + 1)
     # save model
     savePath = tfSaver.save(
         sess,
         "models_{2}/{0}_it{1}k.ckpt".format(saveFname, (i + 1) // 1000,
                                             suffix))
     print("model saved: {0}".format(savePath))
 if (i + 1) % 10000 == 0:
     # save intermediate model
     tfSaverInterm.save(
         sess, "models_{2}/interm/{0}_it{1}k.ckpt".format(
             saveFname, (i + 1) // 1000, suffix))
Example #13
			imageRawBatch: trainData["image"][randIdx],
			labelBatch: trainData["label"][randIdx],
			learningRate: currLearningRate
		}
		# run one step
		_,trainBatchLoss,summary = sess.run([trainStep,loss,lossSummary],feed_dict=trainBatch)
		if (i+1)%20==0:
			tfSummaryWriter.add_summary(summary,i+1)
		if (i+1)%100==0:
			print("it. {0}/{1} (lr={5:.2e},{4:.2e}), loss={2:.4f}, time={3:.4f}"
				  .format(i+1,maxIterN,trainBatchLoss,time.time()-timeStart,currLearningRate[0],currLearningRate[1]))
		if (i+1)%2000==0:
			# update image summaries
			if imageSummaries is not None:
				summary = sess.run(imageSummaries,feed_dict=trainBatch)
				tfSummaryWriter.add_summary(summary,i+1)
			# evaluate on validation and test sets
			testAccuracy = data.evaluate(testData,imageRawBatch,pInitBatch,labelBatch,prediction,sess,opt)
			testError = (1-testAccuracy)*100
			summary = sess.run(testErrorSummary,feed_dict={testErrorPH: testError})
			tfSummaryWriter.add_summary(summary,i+1)
			# save model
			savePath = tfSaver.save(sess,"models_{2}/{0}_it{1}k.ckpt".format(opt.model,(i+1)//1000,opt.group))
			print("model saved: {0}".format(savePath))
		if (i+1)%10000==0:
			# save intermediate model
			tfSaverInterm.save(sess,"models_{2}/interm/{0}_it{1}k.ckpt".format(opt.model,(i+1)//1000,opt.group))
	# save final model
	tfSaverFinal.save(sess,"models_{1}/final/{0}.ckpt".format(opt.model,opt.group))
print("======= backpropagation done =======")
Example #14
def validate(X, y, Z, actual):
    predicted = train_and_predict(X, y, Z)
    return data.evaluate(predicted, actual)
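
A minimal usage sketch (hedged: load_splits is a hypothetical helper;
train_and_predict and data.evaluate are assumed to behave as above):

    X_train, y_train, X_dev, y_dev = load_splits()
    dev_score = validate(X_train, y_train, X_dev, y_dev)
    print("Dev score:", dev_score)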
Example #15
                                       feed_dict=trainBatch)
 if (i + 1) % 10 == 0:
     tfSummaryWriter.add_summary(summary, i + 1)
 if (i + 1) % 100 == 0:
     print(
         "it. {0}/{1} (lr={5:.2e},{4:.2e}), loss={2:.4f}, time={3:.4f}".
         format(i + 1, maxIterN, trainBatchLoss,
                time.time() - timeStart, currLearningRate[0],
                currLearningRate[1]))
 if (i + 1) % 5000 == 0:
     # update image summaries
     if imageSummaries is not None:
         summary = sess.run(imageSummaries, feed_dict=trainBatch)
         tfSummaryWriter.add_summary(summary, i + 1)
     # evaluate on validation and test sets
     validAccuracy = data.evaluate(validData, imageRawBatch, labelBatch,
                                   prediction, sess, params)
     validError = (1 - validAccuracy) * 100
     print('Iter {} Accuracy: {}'.format(i, validAccuracy))
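     # Plateau-style LR decay: track the best validation accuracy; if this
     # eval did not improve it, divide the base learning rate by 10,
     # floored at 1e-4.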
     if validAccuracy > best_validation_accuracy:
         best_validation_accuracy = validAccuracy
     else:
         params.baseLR = params.baseLR / 10
         if params.baseLR <= 0.0001:
             params.baseLR = 0.0001
     summary = sess.run(validSummary,
                        feed_dict={validErrorPH: validError})
     tfSummaryWriter.add_summary(summary, i + 1)
     # save model
     savePath = tfSaver.save(
         sess,
         "models_{2}/{0}_it{1}k.ckpt".format(saveFname, (i + 1) // 1000,
Example #16
def train(c, net, compression_scheduler=None):
    import distiller.apputils as apputils
    from distiller.data_loggers import TensorBoardLogger, PythonLogger
    msglogger = apputils.config_pylogger('logging.conf', None)
    tflogger = TensorBoardLogger(msglogger.logdir)
    tflogger.log_gradients = True
    pylogger = PythonLogger(msglogger)
    c.setdefault(hebbian=False)

    emb_params = count_params(net.embed) + count_params(net.loss.projections) + count_params(net.loss.clusters)
    opt = get_opt(c, net)
    net, opt, step = c.init_model(net, opt=opt, step='max', train=True)
    step_lr = scheduler(c, opt, step)
    data_tr = SampleIterator(c, c.train_batch, split='valid' if c.debug else 'train')
    iter_tr = iter(data_tr)
    data_val = SequentialIterator(c, c.eval_batch, split='valid')

    s = Namespace(net=net, opt=opt, step=step)
    c.on_train_start(s)

    c.log('Embedding has %s parameters' % emb_params)

    if c.get("steps_per_epoch"):
        steps_per_epoch = c.steps_per_epoch
    else:
        steps_per_epoch = len(data_tr.tokens) // data_tr.bs // c.train_chunk
    print("#### steps per epoch %d ####" % steps_per_epoch)

    if c.hebbian:
        counters = [torch.ones(end - start, dtype=torch.long, device=c.device) for start, end in zip([0] + c.cutoffs, c.cutoffs + [c.n_vocab])]
        temp_counters = [torch.zeros_like(x) for x in counters]

    best_val_loss = np.inf
    if s.results is not None and 'val_loss' in s.results.columns:
        best_val_loss = s.results['val_loss'].dropna().min()
    try:
        while step < s.step_max:
            batch = step % steps_per_epoch
            epoch = step // steps_per_epoch
            if step % steps_per_epoch == 0:
                c.log("====> batch=%d, epoch=%d, step=%d" % (batch, epoch, step))
                if compression_scheduler:
                    compression_scheduler.on_epoch_begin(epoch)

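            # Drive distiller's compression schedule at minibatch
            # granularity so pruning and regularization stay in step with
            # training.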
            if compression_scheduler:
                compression_scheduler.on_minibatch_begin(epoch, minibatch_id=batch, minibatches_per_epoch=steps_per_epoch)

            step_lr(step)

            x = to_torch(next(iter_tr), c.device).t()

            t_s = time()
            inputs, labels = x[:-1], x[1:]
            preds = net(inputs, labels)
            loss = preds['loss']

            if compression_scheduler:
                _ = compression_scheduler.before_backward_pass(
                    epoch, minibatch_id=batch,
                    minibatches_per_epoch=steps_per_epoch, loss=loss,
                    return_loss_components=False)

            opt.zero_grad()
            if torch.isnan(loss):
                raise RuntimeError('Encountered nan loss during training')
            loss.backward()
            torch.nn.utils.clip_grad_norm_(net.parameters(), c.get('clip_grad', 0.5))
            opt.step()

            if c.hebbian:
                hebbian_weight_update(c, net, preds['hiddens'], counters, temp_counters)

            time_model = np.round(time() - t_s, 5)

            loss = from_torch(loss)
            perplexity = np.nan if loss > 5 else np.e ** loss
            step_result = pd.Series(dict(
                loss=loss,
                perplexity=perplexity,
                time=time_model
            )).add_prefix('train_')
            step_result['lr'] = next(iter(opt.param_groups))['lr']
            if c.use_cache:
                step_result['theta'] = preds['theta']
                step_result['lambda'] = preds['lambda'].item()

            if compression_scheduler:
                compression_scheduler.on_minibatch_end(epoch, minibatch_id=batch, minibatches_per_epoch=steps_per_epoch)

            if step % steps_per_epoch == 0:
                if compression_scheduler:
                    compression_scheduler.on_epoch_end(epoch)

            s.step = step = step + 1
            if step % c.step_eval == 0:
                distiller.log_weights_sparsity(net, epoch, loggers=[tflogger, pylogger])
                t, total = distiller.weights_sparsity_tbl_summary(net, return_total_sparsity=True)
                c.log("total sparsity: %.3lf" % total)

                step_result = step_result.append(
                    pd.Series(evaluate(c, data_val, net)).add_prefix('val_')
                )
                s.record_step = step_result['val_loss'] < best_val_loss
                clear_gpu_memory()
            s.step_result = step_result
            c.on_step_end(s)
    except Exception as e:
        import traceback
        err = traceback.format_exc()
        if c.main:
            c.log(err)
        else:
            print(err)
    finally:
        c.on_train_end(s)
Example #17
    for l in [0.1, 0.01, 0.001, 1]:
        print(f'Trying with lambda {l}')
        model = LogisticRegression(data["training"])
        model.Lambda = l
        val_acc = model.train(data["training"], data["development.gold"],
                              epochs)
        if val_acc > best_model[1]:
            best_model = (deepcopy(model), val_acc)
            print(f'Found new best model with lambda of {l} '
                  f'and val accuracy of {val_acc}')

    if CLASSIFY_TEST_SET:
        print("Labeling test set with the best val accuracy model.")
        test_output = best_model[0].classify(data["test.input"])
        acc, fscores = evaluate(test_output, data["test.gold"])
        print("Final test accuracy: %.2f" % acc)
        print("Per class F1-fscore:")
        for c in fscores:
            print(" %s %.2f" % (c, fscores[c]))
        write_semeval(data["test.input"], test_output, output_file)
    print(f'Best val accuracy was {best_model[1]}')
    """
        Write your code for analyzing model weights here.

        You can use the Python dict encoder which will be defined.
    """

    model = best_model[0]
    for c in model.Y:
        for w in ['happy', 'great', 'awful', 'sucks']:
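            # Hedged sketch: print each indicator word's learned weight for
            # class c; the model.W[c][feature] layout is an assumption, not
            # part of this excerpt.
            print('%s %-8s %.3f' % (c, w, model.W[c].get(w, 0)))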
Example #18
def train(c):
    import distiller
    net = Transformer(c)

    opt = get_opt(c, net)
    net, opt, step = c.init_model(net, opt=opt, step='max', train=True)

    step_lr = scheduler(c, opt, step)
    data_tr = SampleIterator(c,
                             c.train_batch,
                             split='valid' if c.debug else 'train')
    iter_tr = iter(data_tr)
    data_val = SequentialIterator(c, c.eval_batch, split='valid')
    data_test = SequentialIterator(c, c.eval_batch, split='test')

    print('Before quantization')
    tbl, sparsity = distiller.weights_sparsity_tbl_summary(
        net, return_total_sparsity=True)
    step_result = pd.Series(evaluate(c, data_val, net)).add_prefix('val_')
    step_result = step_result.append(
        pd.Series(evaluate(c, data_test, net)).add_prefix('test_'))
    step_result['sparsity'] = sparsity
    print(step_result)

    compression_scheduler = distiller.config.file_config(net, opt, c.compress)

    print('After initial quantization')
    s = Namespace(net=net, opt=opt, step=step)
    c.on_train_start(s)

    tbl, sparsity = distiller.weights_sparsity_tbl_summary(
        net, return_total_sparsity=True)
    step_result = pd.Series(evaluate(c, data_val, net)).add_prefix('val_')
    step_result = step_result.append(
        pd.Series(evaluate(c, data_test, net)).add_prefix('test_'))
    step_result['sparsity'] = sparsity
    print(step_result)

    npm = []
    for name, param in net.named_parameters():
        if param.dim() in [2, 4] and any(type in name
                                         for type in ['weight', 'bias']):
            npm.append((name, param, param.abs() == 0))

    best_val_loss = np.inf
    if s.results is not None and 'val_loss' in s.results.columns:
        best_val_loss = s.results['val_loss'].dropna().min()
    try:
        steps_per_epoch = c.step_eval
        while step < s.step_max:
            epoch = step // steps_per_epoch
            batch = step % steps_per_epoch

            if batch == 0:
                compression_scheduler.on_epoch_begin(epoch)
            compression_scheduler.on_minibatch_begin(epoch, batch,
                                                     steps_per_epoch)

            step_lr(step)

            x = to_torch(next(iter_tr), c.device).t()

            t_s = time()
            inputs, labels = x[:-1], x[1:]
            preds = net(inputs, labels)
            loss = preds['loss']

            compression_scheduler.before_backward_pass(epoch, batch,
                                                       steps_per_epoch, loss,
                                                       False)

            opt.zero_grad()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(net.parameters(),
                                           c.get('clip_grad', 0.5))

            compression_scheduler.before_parameter_optimization(
                epoch, batch, steps_per_epoch, opt)
            opt.step()
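            # Re-apply the pruning masks captured before training so the
            # optimizer step cannot revive weights that were already zero.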
            for name, param, mask in npm:
                param.data[mask] = 0
            compression_scheduler.on_minibatch_end(epoch, batch,
                                                   steps_per_epoch)

            if (batch + 1) == steps_per_epoch:
                compression_scheduler.on_epoch_end(epoch)

            time_model = np.round(time() - t_s, 5)

            loss = from_torch(loss)
            perplexity = np.nan if loss > 5 else np.e**loss
            step_result = pd.Series(
                dict(
                    loss=loss,
                    perplexity=perplexity,
                    time=time_model,
                )).add_prefix('train_')
            step_result['lr'] = next(iter(opt.param_groups))['lr']

            s.step = step = step + 1
            if step % c.step_eval == 0:
                tbl, sparsity = distiller.weights_sparsity_tbl_summary(
                    net, return_total_sparsity=True)
                step_result = step_result.append(
                    pd.Series(evaluate(c, data_val, net)).add_prefix('val_'))
                step_result = step_result.append(
                    pd.Series(evaluate(c, data_test, net)).add_prefix('test_'))
                step_result['sparsity'] = sparsity
                s.record_step = step_result['val_loss'] < best_val_loss
                clear_gpu_memory()
            s.step_result = step_result
            c.on_step_end(s)
    except Exception as e:
        import traceback
        err = traceback.format_exc()
        if c.main:
            c.log(err)
        else:
            print(err)
    finally:
        c.on_train_end(s)
    return net, step
Example #19
    data = read_20newsgroup_datasets(data_dir)
    shuffle(data["train"])
    print("Extracting features.")
    extract_features(data)

    # You can explore the effect of lambda using the test set.
    lambdas = {10: 0.1, 50: 0.0001, 100: 0.0001}

    for labeled_size in [10, 50, 100]:
        print("Experiment with %u labeled examples:" % labeled_size)
        labeled_train_data = data['train'][:labeled_size]
        unlabeled_train_data = data['train'][labeled_size:]

        print("  Training fully supervised model.")
        model = NaiveBayes(data['train'])
        model.train(labeled_train_data, unlabeled_train_data, semisupervised=0)

        test_output = model.classify(data["test"])
        acc, fscores = evaluate(test_output, data["test"])
        print("  Test accuracy: %.2f" % acc)
        print()

        print(" Training semi-supervised model.")
        model = NaiveBayes(data['train'],lambdas[labeled_size])
        model.train(labeled_train_data, unlabeled_train_data, semisupervised=1)

        test_output = model.classify(data["test"])
        acc, fscores = evaluate(test_output, data["test"])
        print("  Test accuracy: %.2f" % acc)
        print()