def get_mcdnn_predictions(model_pkl_url, dataset_list):

    dataset_size = 10000
    batch_size = 128
    model = serial.load(model_pkl_url)

    dataset = load_dataset('test', dataset_list)
    it = dataset.iterator(mode='sequential', batch_size=128)
    # loro
    inputs = model.get_input_space().make_theano_batch()
    assert len(inputs) == 2 or len(inputs) == 3
    f_model = theano.function(inputs, model.fprop(inputs), name='morte')
    # where to save the predictions
    y_predictions = np.zeros((dataset_size, 10))

    # print len(inputs), inputs
    i = 0
    try:
        while 1:

            batch_start = i
            batch_end = i+batch_size if i+batch_size < dataset_size else dataset_size


            if len(inputs) == 2:
                x1_batch, x2_batch, y_batch = it.next()
                y = f_model(x1_batch, x2_batch)
            else:
                x1_batch, x2_batch, x3_batch, y_batch = it.next()
                y = f_model(x1_batch, x2_batch, x3_batch)


            y_predictions[batch_start:batch_end] = y

            # print batch_start, ':', batch_end, '   ', get_statistics(np.argmax(y_batch, axis=1), y)
            i += batch_size
    except StopIteration:
        pass

    # save predicition for this column ( still onehot)
    print "save "+'csv_prediction/prediction_multicolumn_'+"_".join(dataset_list)+'.csv'
    with open('csv_prediction/prediction_multicolumn_'+"_".join(dataset_list)+'.csv', 'w') as file_handle:
        np.savetxt(file_handle, y_predictions, delimiter=',')
    # print "Column ", key
    y_ground_truth = np.float64(np.genfromtxt('csv_prediction/prediction_ground_truth.csv', delimiter=','))

    print "total\t ", get_statistics(y_ground_truth, y_predictions)
Esempio n. 2
0
def train():
    """Train the X-UMX (CrossNet-Open-Unmix) source-separation model on MUSDB.

    Builds separate training and validation computation graphs, runs the
    epoch loop (with all-reduce across processes when more than one GPU
    process is available), applies ReduceLROnPlateau LR scheduling and early
    stopping on the validation loss, and saves the best model (lowest
    validation loss so far) to ``<args.output>/best_xumx.h5``.
    """
    # Check NNabla version
    if utils.get_nnabla_version_integer() < 11900:
        raise ValueError(
            'Please update the nnabla version to v1.19.0 or latest version since memory efficiency of core engine is improved in v1.19.0'
        )

    parser, args = get_train_args()

    # Get context.
    ctx = get_extension_context(args.context, device_id=args.device_id)
    comm = CommunicatorWrapper(ctx)
    nn.set_default_context(comm.ctx)
    ext = import_extension_module(args.context)

    # Monitors
    # setting up monitors for logging
    monitor_path = args.output
    monitor = Monitor(monitor_path)

    monitor_best_epoch = MonitorSeries('Best epoch', monitor, interval=1)
    # NOTE(review): variable name typo "traing" — renaming would be cosmetic only
    monitor_traing_loss = MonitorSeries('Training loss', monitor, interval=1)
    monitor_validation_loss = MonitorSeries('Validation loss',
                                            monitor,
                                            interval=1)
    monitor_lr = MonitorSeries('learning rate', monitor, interval=1)
    monitor_time = MonitorTimeElapsed("training time per iteration",
                                      monitor,
                                      interval=1)

    # only rank 0 prints and creates the output directory
    if comm.rank == 0:
        print("Mixing coef. is {}, i.e., MDL = {}*TD-Loss + FD-Loss".format(
            args.mcoef, args.mcoef))
        if not os.path.isdir(args.output):
            os.makedirs(args.output)

    # Initialize DataIterator for MUSDB.
    train_source, valid_source, args = load_datasources(parser, args)

    train_iter = data_iterator(train_source,
                               args.batch_size,
                               RandomState(args.seed),
                               with_memory_cache=False,
                               with_file_cache=False)

    # validation uses batch size 1 (whole tracks are chunked below)
    valid_iter = data_iterator(valid_source,
                               1,
                               RandomState(args.seed),
                               with_memory_cache=False,
                               with_file_cache=False)

    # shard both iterators across processes for multi-GPU training
    if comm.n_procs > 1:
        train_iter = train_iter.slice(rng=None,
                                      num_of_slices=comm.n_procs,
                                      slice_pos=comm.rank)

        valid_iter = valid_iter.slice(rng=None,
                                      num_of_slices=comm.n_procs,
                                      slice_pos=comm.rank)

    # Calculate maxiter per GPU device.
    max_iter = int((train_source._size // args.batch_size) // comm.n_procs)
    # weight decay is scaled with the number of processes — presumably to
    # compensate for gradient averaging across devices (TODO confirm)
    weight_decay = args.weight_decay * comm.n_procs

    print("max_iter", max_iter)

    # Calculate the statistics (mean and variance) of the dataset
    scaler_mean, scaler_std = utils.get_statistics(args, train_source)

    max_bin = utils.bandwidth_to_max_bin(train_source.sample_rate, args.nfft,
                                         args.bandwidth)

    unmix = OpenUnmix_CrossNet(input_mean=scaler_mean,
                               input_scale=scaler_std,
                               nb_channels=args.nb_channels,
                               hidden_size=args.hidden_size,
                               n_fft=args.nfft,
                               n_hop=args.nhop,
                               max_bin=max_bin)

    # Create input variables.
    mixture_audio = nn.Variable([args.batch_size] +
                                list(train_source._get_data(0)[0].shape))
    target_audio = nn.Variable([args.batch_size] +
                               list(train_source._get_data(0)[1].shape))

    # validation variables: 2-channel mixture in, 8-channel target out
    vmixture_audio = nn.Variable(
        [1] + [2, valid_source.sample_rate * args.valid_dur])
    vtarget_audio = nn.Variable([1] +
                                [8, valid_source.sample_rate * args.valid_dur])

    # create training graph
    mix_spec, M_hat, pred = unmix(mixture_audio)
    Y = Spectrogram(*STFT(target_audio, n_fft=unmix.n_fft, n_hop=unmix.n_hop),
                    mono=(unmix.nb_channels == 1))
    # multi-domain loss: frequency-domain MSE plus mcoef-weighted
    # time-domain SDR loss
    loss_f = mse_loss(mix_spec, M_hat, Y)
    loss_t = sdr_loss(mixture_audio, pred, target_audio)
    loss = args.mcoef * loss_t + loss_f
    loss.persistent = True

    # Create Solver and set parameters.
    solver = S.Adam(args.lr)
    solver.set_parameters(nn.get_parameters())

    # create validation graph (same losses, test-mode forward pass)
    vmix_spec, vM_hat, vpred = unmix(vmixture_audio, test=True)
    vY = Spectrogram(*STFT(vtarget_audio, n_fft=unmix.n_fft,
                           n_hop=unmix.n_hop),
                     mono=(unmix.nb_channels == 1))
    vloss_f = mse_loss(vmix_spec, vM_hat, vY)
    vloss_t = sdr_loss(vmixture_audio, vpred, vtarget_audio)
    vloss = args.mcoef * vloss_t + vloss_f
    vloss.persistent = True

    # Initialize Early Stopping
    es = utils.EarlyStopping(patience=args.patience)

    # Initialize LR Scheduler (ReduceLROnPlateau)
    lr_scheduler = ReduceLROnPlateau(lr=args.lr,
                                     factor=args.lr_decay_gamma,
                                     patience=args.lr_decay_patience)
    best_epoch = 0

    # Training loop.
    for epoch in trange(args.epochs):
        # TRAINING
        losses = utils.AverageMeter()
        for batch in range(max_iter):
            mixture_audio.d, target_audio.d = train_iter.next()
            solver.zero_grad()
            loss.forward(clear_no_need_grad=True)
            if comm.n_procs > 1:
                # overlap gradient all-reduce with the backward pass
                all_reduce_callback = comm.get_all_reduce_callback()
                loss.backward(clear_buffer=True,
                              communicator_callbacks=all_reduce_callback)
            else:
                loss.backward(clear_buffer=True)
            solver.weight_decay(weight_decay)
            solver.update()
            losses.update(loss.d.copy(), args.batch_size)
        training_loss = losses.avg

        # clear cache memory
        ext.clear_memory_cache()

        # VALIDATION
        vlosses = utils.AverageMeter()
        for batch in range(int(valid_source._size // comm.n_procs)):
            x, y = valid_iter.next()
            # evaluate each track in fixed-duration chunks to bound memory
            dur = int(valid_source.sample_rate * args.valid_dur)
            sp, cnt = 0, 0
            loss_tmp = nn.NdArray()
            loss_tmp.zero()
            while 1:
                vmixture_audio.d = x[Ellipsis, sp:sp + dur]
                vtarget_audio.d = y[Ellipsis, sp:sp + dur]
                vloss.forward(clear_no_need_grad=True)
                cnt += 1
                sp += dur
                loss_tmp += vloss.data
                # NOTE(review): stops when the next chunk would be shorter than
                # `dur` or the track length is an exact multiple of `dur` —
                # a trailing partial chunk is intentionally skipped (confirm)
                if x[Ellipsis,
                     sp:sp + dur].shape[-1] < dur or x.shape[-1] == cnt * dur:
                    break
            # average the per-chunk losses for this track
            loss_tmp = loss_tmp / cnt
            if comm.n_procs > 1:
                comm.all_reduce(loss_tmp, division=True, inplace=True)
            vlosses.update(loss_tmp.data.copy(), 1)
        validation_loss = vlosses.avg

        # clear cache memory
        ext.clear_memory_cache()

        lr = lr_scheduler.update_lr(validation_loss, epoch=epoch)
        solver.set_learning_rate(lr)
        stop = es.step(validation_loss)

        if comm.rank == 0:
            monitor_best_epoch.add(epoch, best_epoch)
            monitor_traing_loss.add(epoch, training_loss)
            monitor_validation_loss.add(epoch, validation_loss)
            monitor_lr.add(epoch, lr)
            monitor_time.add(epoch)

            if validation_loss == es.best:
                # save best model
                nn.save_parameters(os.path.join(args.output, 'best_xumx.h5'))
                best_epoch = epoch

        if stop:
            print("Apply Early Stopping")
            break
Esempio n. 3
0
    def __test__(self, epoch, text_seqs, class_list):
        """Evaluate the model on test data restricted to the seen classes.

        Runs the test network batch by batch, accumulates the loss, converts
        the outputs to probabilities over the full class space, and computes
        top-1 statistics against the gold labels. Trailing examples that do
        not fill a complete batch are dropped.

        :param epoch: current epoch number (used only for logging).
        :param text_seqs: list of tokenized text sequences.
        :param class_list: gold class label per sequence (same length).
        :return: tuple ``(stats, prediction_topk, ground_truth,
            np.array([0]), np.array([0]))`` — the two zero arrays keep the
            return arity compatible with sibling evaluation methods.
        """
        assert len(text_seqs) == len(class_list)

        start_time = time.time()
        step_time = time.time()

        test_steps = len(text_seqs) // config.batch_size

        topk_list = list()
        pred_class_list = list()

        all_loss = np.zeros(1)

        for cstep in range(test_steps):
            # boundaries of the current mini-batch (hoisted; the original
            # recomputed these slices several times and also built an unused
            # `class_idx_or_mini` list, removed here)
            lo = cstep * config.batch_size
            hi = (cstep + 1) * config.batch_size

            text_seqs_mini = text_seqs[lo:hi]
            # map gold labels to their indices within the seen-class subset
            class_idx_mini = [self.seen_class_map2index[_] for _ in class_list[lo:hi]]

            encode_seqs_id_mini, encode_seqs_mat_mini = self.prepro_encode(text_seqs_mini, False)

            # scatter target: probabilities over the *full* class space
            pred_mat = np.zeros([config.batch_size, len(self.class_dict)])

            test_loss, out = self.sess.run([
                self.model.test_loss,
                self.model.test_net.outputs,
            ], feed_dict={
                self.model.encode_seqs: encode_seqs_mat_mini,
                self.model.category_target_index: class_idx_mini
            })

            all_loss[0] += test_loss

            # softmax over the raw network outputs
            pred = np.array([_ / np.sum(_) for _ in np.exp(out)])

            # place each seen-class probability at its full-class index
            for i in range(len(self.seen_class)):
                pred_mat[:, self.full_class_map2index[self.seen_class[i]]] = pred[:, i]

            topk = self.get_pred_class_topk(pred_mat, k=1)
            topk_list.append(topk)
            pred_class_list.append(pred_mat)

            # periodic progress report over the last `cstep_print` batches
            if cstep % config.cstep_print == 0 and cstep > 0:
                tmp_topk = np.concatenate(topk_list, axis=0)
                tmp_topk = self.get_one_hot_results(np.array(tmp_topk[(cstep + 1 - config.cstep_print) * config.batch_size : (cstep + 1) * config.batch_size]))
                tmp_gt = self.get_one_hot_results(np.reshape(np.array(class_list[(cstep + 1 - config.cstep_print) * config.batch_size : (cstep + 1) * config.batch_size]), newshape=(-1, 1)))
                tmp_stats = utils.get_statistics(tmp_topk, tmp_gt, single_label_pred=True)

                print(
                    "[Test] Epoch: [%3d][%4d/%4d] time: %.4f, loss: %s \n %s" %
                    (epoch, cstep, test_steps, time.time() - step_time, all_loss / (cstep + 1), utils.dict_to_string_4_print(tmp_stats))
                )
                step_time = time.time()

        prediction_topk = np.concatenate(topk_list, axis=0)

        # keep only the examples that filled complete batches
        prediction_topk = self.get_one_hot_results(np.array(prediction_topk[: test_steps * config.batch_size]))
        ground_truth = self.get_one_hot_results(np.reshape(np.array(class_list[: test_steps * config.batch_size]), newshape=(-1, 1)))

        stats = utils.get_statistics(prediction_topk, ground_truth, single_label_pred=True)

        print(
            "[Test Sum] Epoch: [%3d] time: %.4f, loss: %s \n %s" %
            (epoch, time.time() - start_time, all_loss / test_steps , utils.dict_to_string_4_print(stats))
        )

        return stats, prediction_topk, ground_truth, np.array([0]), np.array([0])
Esempio n. 4
0
def get_details(url):
    """Fetch `url`, extract its <p>-tag content, and display the statistics."""
    page = get_html_using_content(url)
    paragraphs = parse_html_using_tag(page, 'p')
    display(get_statistics(paragraphs))
Esempio n. 5
0
    # K-fold cross-validation for the bundle-method SO-SVM (DualLibQPBMSOSVM).
    for k in xrange(K):
        # two-state hidden Markov SVM with piecewise-linear feature maps (PLiFs)
        model = HMSVMModel(features_no_fold[k], labels_no_fold[k],
                           SMT_TWO_STATE)
        model.set_use_plifs(True)
        # regularization is scaled by the number of training examples
        # available in this fold ((K-1) folds of num_fold_examples each)
        sosvm = DualLibQPBMSOSVM(model, loss, labels_no_fold[k],
                                 reg * (K - 1) * num_fold_examples)
        sosvm.set_verbose(True)
        print '\ton fold %d' % k
        t0 = time.time()
        sosvm.train()
        print '\t\tElapsed: training took ' + str(time.time() - t0)
        # keep the learned weight vector for later inspection
        w.append(sosvm.get_w())
        t1 = time.time()
        # predict on the held-out fold
        prediction = sosvm.apply(models[k].get_features())
        print '\t\tElapsed: prediction took ' + str(time.time() - t1)
        accuracy = evaluator.evaluate(prediction, models[k].get_labels())
        print str(accuracy * 100) + '%'
        statistics = utils.get_statistics(models[k].get_labels(), prediction)
        # fraction of correctly labelled positions over the whole held-out fold
        custom_accuracy = (100. * statistics['success_count']) / (
            num_fold_examples * example_len)
        print '\t\t%.2f\t1s: (%5d, %5d)\t0s: (%5d, %5d)' % (
            custom_accuracy, statistics['true_1_count'],
            statistics['pred_1_count'], statistics['true_0_count'],
            statistics['pred_0_count'])
        accuracies.append(accuracy)
    print '\toverall success rate of ' + str(
        numpy.mean(accuracies) * 100) + '%'
    W.append(w)

# NOTE(review): the file handle passed to pickle.dump is never closed
# explicitly; a with-statement would be safer (left unchanged here).
pickle.dump(W, open('W_DualLibQPBMSOSVM_%s.p' % data_file, 'wb'))
Esempio n. 6
0
# Sweep over regularization values; for each, run K-fold cross-validation
# with a PrimalMosekSOSVM and collect per-fold weights and accuracies.
for reg in regularizations:
	print 'training SO-SVM with regularization %.2f' % reg
	accuracies = []
	w = []
	for k in xrange(K):
		# two-state hidden Markov SVM with piecewise-linear feature maps (PLiFs)
		model = HMSVMModel(features_no_fold[k], labels_no_fold[k], SMT_TWO_STATE)
		model.set_use_plifs(True)
		sosvm = PrimalMosekSOSVM(model, loss, labels_no_fold[k])
		sosvm.set_regularization(reg)
		print '\ton fold %d' % k,
		sosvm.io.set_loglevel(MSG_DEBUG)
		t0 = time.time()
		sosvm.train()
		print 'Elapsed: training took ' + str(time.time()-t0)
		# keep the learned weight vector for later inspection
		w.append(sosvm.get_w())
		t1 = time.time()
		# predict on the held-out fold
		prediction = sosvm.apply(models[k].get_features())
		print 'Elapsed: prediction took ' + str(time.time()-t1)
		accuracy = evaluator.evaluate(prediction, models[k].get_labels())
		print str(accuracy*100) + '%'
		statistics = utils.get_statistics(models[k].get_labels(), prediction)
		# fraction of correctly labelled positions over the whole held-out fold
		custom_accuracy = (100.*statistics['success_count'])/(num_fold_examples*example_len)
		print '\t\t%.2f\t1s: (%5d, %5d)\t0s: (%5d, %5d)' % (custom_accuracy,
				statistics['true_1_count'], statistics['pred_1_count'],
				statistics['true_0_count'], statistics['pred_0_count'])
		accuracies.append(accuracy)
	print '\toverall success rate of ' + str(numpy.mean(accuracies)*100) + '%'
	W.append(w)

# NOTE(review): the file handle passed to pickle.dump is never closed
# explicitly; a with-statement would be safer (left unchanged here).
pickle.dump(W, open('W_PrimalMosekSOSVM_%s.p' % data_file, 'wb'))
def analysis():
    """

    :return:
    """

    y_ground_truth = np.float64(np.genfromtxt('csv_prediction/prediction_ground_truth.csv', delimiter=','))

    y_gcn_predictions = np.float64(np.genfromtxt('csv_prediction/prediction_gcn.csv', delimiter=','))
    y_toronto_predictions = np.float64(np.genfromtxt('csv_prediction/prediction_toronto.csv', delimiter=','))
    y_zca_predictions = np.float64(np.genfromtxt('csv_prediction/prediction_zca.csv', delimiter=','))
    y_multi_gcn_tor = np.float64(np.genfromtxt('csv_prediction/prediction_multicolumn_gcn_toronto.csv', delimiter=','))
    y_multi_gcn_zca = np.float64(np.genfromtxt('csv_prediction/prediction_multicolumn_gcn_zca.csv', delimiter=','))
    y_multi_zca_tor = np.float64(np.genfromtxt('csv_prediction/prediction_multicolumn_toronto_zca.csv', delimiter=','))
    y_multi_gcn_tor_zca = np.float64(np.genfromtxt('csv_prediction/prediction_multicolumn_gcn_toronto_zca.csv', delimiter=','))
    y_multi_naive_gcn_tor = np.float64(np.genfromtxt('csv_prediction/prediction_multicolumn_naive_toronto_gcn.csv', delimiter=','))
    y_multi_naive_gcn_zca = np.float64(np.genfromtxt('csv_prediction/prediction_multicolumn_naive_zca_gcn.csv', delimiter=','))
    y_multi_naive_zca_tor = np.float64(np.genfromtxt('csv_prediction/prediction_multicolumn_naive_toronto_zca.csv', delimiter=','))
    y_multi_naive_gcn_tor_zca = np.float64(np.genfromtxt('csv_prediction/prediction_multicolumn_naive_toronto_zca_gcn.csv', delimiter=','))

    print "_____Results __________\t______________________"
    print "_______METHOD__________\t_____MEAN____VAR______"
    print "Single GCN             \t ", get_statistics(y_ground_truth, y_gcn_predictions)
    print "Single TOR             \t ", get_statistics(y_ground_truth, y_toronto_predictions)
    print "Single ZCA             \t ", get_statistics(y_ground_truth, y_zca_predictions)
    print "------------------------------------------------------------------"
    print "Multi GCN_TOR          \t ", get_statistics(y_ground_truth, y_multi_gcn_tor)
    print "Multi GCN_ZCA          \t ", get_statistics(y_ground_truth, y_multi_gcn_zca)
    print "Multi ZCA_TOR          \t ", get_statistics(y_ground_truth, y_multi_zca_tor)
    print "Multi GCN_TOR_ZCA      \t ", get_statistics(y_ground_truth, y_multi_gcn_tor_zca)
    print "-------------------------------------------------------------------"
    print "Multi-Naive GCN_TOR    \t ", get_statistics(y_ground_truth, y_multi_naive_gcn_tor)
    print "Multi-Naive GCN_ZCA    \t ", get_statistics(y_ground_truth, y_multi_naive_gcn_zca)
    print "Multi-Naive ZCA_TOR    \t ", get_statistics(y_ground_truth, y_multi_naive_zca_tor)
    print "Multi-Naive GCN_TOR_ZCA\t ", get_statistics(y_ground_truth, y_multi_naive_gcn_tor_zca)
    print "_______________________________________"

    plot_single_cm(y_ground_truth, y_gcn_predictions, "Single GCN")
    plot_single_cm(y_ground_truth, y_toronto_predictions, "Single Toronto")
    plot_single_cm(y_ground_truth, y_zca_predictions, "Single ZCA")

    plot_single_cm(y_ground_truth, y_multi_gcn_tor, "Multi GCN_TOR")
    plot_single_cm(y_ground_truth, y_multi_gcn_zca, "Multi GCN_ZCA")
    plot_single_cm(y_ground_truth, y_multi_zca_tor, "Multi ZCA_TOR")
    plot_single_cm(y_ground_truth, y_multi_gcn_tor_zca, "Multi GCN_TOR_ZCA")

    plot_single_cm(y_ground_truth, y_multi_naive_gcn_tor, "Multi-Naive GCN_TOR")
    plot_single_cm(y_ground_truth, y_multi_naive_gcn_zca, "Multi-Naive GCN_ZCA")
    plot_single_cm(y_ground_truth, y_multi_naive_zca_tor, "Multi-Naive ZCA_TOR")
    plot_single_cm(y_ground_truth, y_multi_naive_gcn_tor_zca, "Multi-Naive GCN_TOR_ZCA")
Esempio n. 8
0
def train():
    """Train UMX or X-UMX source-separation on MUSDB18.

    Variant of the X-UMX training loop that scales learning rate and weight
    decay with the effective batch size, obtains the computation graph via
    ``get_model`` (UMX vs X-UMX chosen by ``args.umx_train``), and saves the
    best model (lowest validation loss) as ``best_umx.h5`` or
    ``best_xumx.h5`` under ``args.output``.
    """
    # Check NNabla version
    if utils.get_nnabla_version_integer() < 11900:
        raise ValueError(
            'Please update the nnabla version to v1.19.0 or latest version since memory efficiency of core engine is improved in v1.19.0'
        )

    parser, args = get_train_args()

    # Get context.
    ctx = get_extension_context(args.context, device_id=args.device_id)
    comm = CommunicatorWrapper(ctx)
    nn.set_default_context(comm.ctx)
    ext = import_extension_module(args.context)

    # Monitors
    # setting up monitors for logging
    monitor_path = args.output
    monitor = Monitor(monitor_path)

    monitor_best_epoch = MonitorSeries('Best epoch', monitor, interval=1)
    # NOTE(review): variable name typo "traing" — renaming would be cosmetic only
    monitor_traing_loss = MonitorSeries('Training loss', monitor, interval=1)
    monitor_validation_loss = MonitorSeries('Validation loss',
                                            monitor,
                                            interval=1)
    monitor_lr = MonitorSeries('learning rate', monitor, interval=1)
    monitor_time = MonitorTimeElapsed("training time per iteration",
                                      monitor,
                                      interval=1)

    # only rank 0 creates the output directory
    if comm.rank == 0:
        if not os.path.isdir(args.output):
            os.makedirs(args.output)

    # Initialize DataIterator for MUSDB18.
    train_source, valid_source, args = load_datasources(parser, args)

    train_iter = data_iterator(
        train_source,
        args.batch_size,
        RandomState(args.seed),
        with_memory_cache=False,
    )

    # validation uses batch size 1 (whole tracks are chunked below)
    valid_iter = data_iterator(
        valid_source,
        1,
        RandomState(args.seed),
        with_memory_cache=False,
    )

    # shard both iterators across processes for multi-GPU training
    if comm.n_procs > 1:
        train_iter = train_iter.slice(rng=None,
                                      num_of_slices=comm.n_procs,
                                      slice_pos=comm.rank)

        valid_iter = valid_iter.slice(rng=None,
                                      num_of_slices=comm.n_procs,
                                      slice_pos=comm.rank)

    # Calculate maxiter per GPU device.
    # Change max_iter, learning_rate and weight_decay according no. of gpu devices for multi-gpu training.
    default_batch_size = 16
    # linear-scaling rule: scale lr/weight-decay by effective batch size
    # relative to the reference batch size of 16
    train_scale_factor = (comm.n_procs * args.batch_size) / default_batch_size
    max_iter = int((train_source._size // args.batch_size) // comm.n_procs)
    weight_decay = args.weight_decay * train_scale_factor
    args.lr = args.lr * train_scale_factor

    # Calculate the statistics (mean and variance) of the dataset
    scaler_mean, scaler_std = utils.get_statistics(args, train_source)

    # clear cache memory
    ext.clear_memory_cache()

    max_bin = utils.bandwidth_to_max_bin(train_source.sample_rate, args.nfft,
                                         args.bandwidth)

    # Get X-UMX/UMX computation graph and variables as namedtuple
    model = get_model(args, scaler_mean, scaler_std, max_bin=max_bin)

    # Create Solver and set parameters.
    solver = S.Adam(args.lr)
    solver.set_parameters(nn.get_parameters())

    # Initialize Early Stopping
    es = utils.EarlyStopping(patience=args.patience)

    # Initialize LR Scheduler (ReduceLROnPlateau)
    lr_scheduler = ReduceLROnPlateau(lr=args.lr,
                                     factor=args.lr_decay_gamma,
                                     patience=args.lr_decay_patience)
    best_epoch = 0

    # AverageMeter for mean loss calculation over the epoch
    losses = utils.AverageMeter()

    # Training loop.
    for epoch in trange(args.epochs):
        # TRAINING
        losses.reset()
        for batch in range(max_iter):
            model.mixture_audio.d, model.target_audio.d = train_iter.next()
            solver.zero_grad()
            model.loss.forward(clear_no_need_grad=True)
            if comm.n_procs > 1:
                # overlap gradient all-reduce with the backward pass
                all_reduce_callback = comm.get_all_reduce_callback()
                model.loss.backward(clear_buffer=True,
                                    communicator_callbacks=all_reduce_callback)
            else:
                model.loss.backward(clear_buffer=True)
            solver.weight_decay(weight_decay)
            solver.update()
            losses.update(model.loss.d.copy(), args.batch_size)
        training_loss = losses.get_avg()

        # clear cache memory
        ext.clear_memory_cache()

        # VALIDATION (meter is reused; reset clears the training stats)
        losses.reset()
        for batch in range(int(valid_source._size // comm.n_procs)):
            x, y = valid_iter.next()
            # evaluate each track in fixed-duration chunks to bound memory
            dur = int(valid_source.sample_rate * args.valid_dur)
            sp, cnt = 0, 0
            loss_tmp = nn.NdArray()
            loss_tmp.zero()
            while 1:
                model.vmixture_audio.d = x[Ellipsis, sp:sp + dur]
                model.vtarget_audio.d = y[Ellipsis, sp:sp + dur]
                model.vloss.forward(clear_no_need_grad=True)
                cnt += 1
                sp += dur
                loss_tmp += model.vloss.data
                # NOTE(review): stops when the next chunk would be shorter than
                # `dur` or the track length is an exact multiple of `dur` —
                # a trailing partial chunk is intentionally skipped (confirm)
                if x[Ellipsis,
                     sp:sp + dur].shape[-1] < dur or x.shape[-1] == cnt * dur:
                    break
            # average the per-chunk losses for this track
            loss_tmp = loss_tmp / cnt
            if comm.n_procs > 1:
                comm.all_reduce(loss_tmp, division=True, inplace=True)
            losses.update(loss_tmp.data.copy(), 1)
        validation_loss = losses.get_avg()

        # clear cache memory
        ext.clear_memory_cache()

        lr = lr_scheduler.update_lr(validation_loss, epoch=epoch)
        solver.set_learning_rate(lr)
        stop = es.step(validation_loss)

        if comm.rank == 0:
            monitor_best_epoch.add(epoch, best_epoch)
            monitor_traing_loss.add(epoch, training_loss)
            monitor_validation_loss.add(epoch, validation_loss)
            monitor_lr.add(epoch, lr)
            monitor_time.add(epoch)

            if validation_loss == es.best:
                best_epoch = epoch
                # save best model
                if args.umx_train:
                    nn.save_parameters(os.path.join(args.output,
                                                    'best_umx.h5'))
                else:
                    nn.save_parameters(
                        os.path.join(args.output, 'best_xumx.h5'))

        if args.umx_train:
            # Early stopping for UMX after `args.patience` (140) number of epochs
            if stop:
                print("Apply Early Stopping")
                break