Example #1
    def train(self, epochs, wd, params=None, init_epochs=0, bs=4):
        trainer = Trainer(self.net.collect_params(params), self.optimizer,
                          {'wd': wd})
        metrics = mx.metric.create(self.metrics)
        self.history = [[], []]
        iteration = 1
        val_iter = 1
        avg_mom = 0.98
        tavg_loss, vavg_loss = 0., 0.

        for epoch in range(epochs):
            for data, label in self.loader[0]:
                data = data.as_in_context(self.ctx)
                label = label.as_in_context(self.ctx)
                with autograd.record():
                    output = self.net(data)
                    loss = self.criterion(output, label)
                lr = self.scheduler(iteration)
                trainer.set_learning_rate(lr)
                loss.backward()
                trainer.step(bs)
                tavg_loss = tavg_loss * avg_mom + \
                    (1 - avg_mom) * (nd.mean(loss).asscalar())
                self.history[0].append(tavg_loss / (1 - avg_mom**iteration))
                iteration += 1

            metrics.reset()

            for data, label in self.loader[1]:
                data = data.as_in_context(self.ctx)
                label = label.as_in_context(self.ctx)
                output = self.net(data)
                loss = self.criterion(output, label)
                vavg_loss = vavg_loss * avg_mom + \
                    (1 - avg_mom) * (nd.mean(loss).asscalar())
                self.history[1].append(vavg_loss / (1 - avg_mom**val_iter))
                val_iter += 1
                metrics.update(preds=output, labels=label)
            status = [init_epochs + epoch + 1] + \
                [self.history[0][-1], self.history[1][-1]]
            if self.metrics is not None:
                status.append(metrics.get()[1])
            print('{}'.format(status))
        return self.history
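The train method above calls self.scheduler(iteration) once per batch and feeds the result to trainer.set_learning_rate, so it assumes self.scheduler is any callable mapping a 1-based iteration index to a learning rate. A minimal sketch of such a callable, assuming a cosine-annealing shape (the class name and parameters below are illustrative, not part of the original code):

import math

class CosineSchedule:
    """Map a 1-based iteration index to a learning rate (illustrative sketch)."""

    def __init__(self, max_lr, total_iterations, min_lr=0.0):
        self.max_lr = max_lr
        self.min_lr = min_lr
        self.total_iterations = total_iterations

    def __call__(self, iteration):
        # Clamp so the rate stays at min_lr if training runs past the planned horizon.
        t = min(iteration, self.total_iterations) / self.total_iterations
        return self.min_lr + 0.5 * (self.max_lr - self.min_lr) * (1 + math.cos(math.pi * t))

# e.g. learner.scheduler = CosineSchedule(max_lr=1e-3, total_iterations=epochs * len(train_loader))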
def train(net, train_dataloader, valid_dataloader, ctx_list, args):
    """Training pipline """
    # optimizer
    trainer = Trainer(net.collect_params(), 'sgd', {
        'learning_rate': args.lr,
        'wd': args.wd,
        'momentum': args.momentum
    })
    # loss
    acc_metric = AccuracyMetric()
    loss_metric = mx.metric.Loss('SoftMaxCrossEntropyLoss')
    valid_metric = ValidMetric()
    cross_entropy_loss = gloss.SoftmaxCrossEntropyLoss()

    metric1 = [loss_metric]
    metric2 = [acc_metric]

    # set up logging
    logging.basicConfig()
    # get a logger
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fd = logging.FileHandler(log_file_path)
    logger.addHandler(fd)
    logger.info(args)
    if args.verbose:
        logger.info('Trainable parameters:')
        logger.info(net.collect_params().keys())
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    logger.info('Training on {}'.format(ctx_list))
    # best accuracy so far and the learning rate decay schedule
    best_acc = [0]
    lr_steps = sorted(
        [int(step) for step in args.lr_decay_epoch.split(',') if step.strip()])
    lr_decay = float(args.lr_decay)

    for epoch in range(args.start_epoch, args.epochs):
        ttime = time.time()
        btime = time.time()

        # lr_decay
        if lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info('[Epoch {}] set learning rate to {:.4f}'.format(
                epoch, new_lr))

        acc_metric.reset()
        if args.hybrid:
            net.hybridize(static_alloc=True)
        # each batch from the dataloader is [data, label]
        for i, batch in enumerate(train_dataloader):
            batch_size = len(batch[0])
            batch = split_and_load_data(batch, ctx_list, batch_size)

            losses = []
            metrics = []
            with autograd.record():
                for data, cls_label in zip(*batch):
                    # forward
                    pred_scores = net(data)
                    # loss
                    loss = cross_entropy_loss(pred_scores, cls_label)
                    # record loss and preds
                    losses.append(loss)
                    metrics.append([[cls_label], [pred_scores]])
            # backward
            autograd.backward(losses)
            # optimizer params
            trainer.step(batch_size)
            # update metrics...
            for record in metrics:
                acc_metric.update(record[0], record[1])
            for record in losses:
                loss_metric.update(0, record)
            if args.log_interval and not (i + 1) % args.log_interval:
                # logging

                info = ','.join([
                    '{}={:.3f}'.format(*metric.get())
                    for metric in metric1 + metric2
                ])
                msg = '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'.format(
                    epoch, i,
                    args.log_interval * batch_size / (time.time() - btime),
                    info)
                logger.info(msg)
                btime = time.time()

        info = ','.join(['{}={:.3f}'.format(*loss_metric.get())])
        msg = '[Epoch {}] Training cost: {:.3f}, {}'.format(
            epoch,
            time.time() - ttime, info)
        logger.info(msg)
        if args.val_interval and not (epoch + 1) % args.val_interval:
            name, current_acc = evaluate(net, valid_dataloader, valid_metric,
                                         ctx_list, args.hybrid)
            info = '{}={:.3f}'.format(name, current_acc)
            msg = '[Epoch {}] Validation {}.'.format(epoch, info)
            logger.info(msg)
        else:
            current_acc = 0

        save_parameters(net, logger, best_acc, current_acc, epoch,
                        args.save_interval, args.save_prefix)
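The pipeline above depends on a split_and_load_data helper that is not shown; from its call site it shards a [data, label] batch across the devices in ctx_list and returns the two lists so that zip(*batch) pairs them per device. A sketch of what it could look like, built on gluon.utils.split_and_load (the helper name and exact behaviour are assumptions):

from mxnet import gluon

def split_and_load_data(batch, ctx_list, batch_size):
    # Shard inputs and labels across the available devices (illustrative sketch,
    # not the original helper); batch_size is accepted for signature compatibility.
    data = gluon.utils.split_and_load(batch[0], ctx_list=ctx_list, batch_axis=0, even_split=False)
    labels = gluon.utils.split_and_load(batch[1], ctx_list=ctx_list, batch_axis=0, even_split=False)
    return data, labels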
    def train_model_for_ml(self):
        """
        训练模型, 多标签
        """
        base_net = self.get_base_net()  # 基础网络
        train_data, len_td = self.get_train_data(self.batch_size)  # 训练数据,按批次获取
        val_data, len_vd = self.get_val_data(self.batch_size)  # 训练数据,按批次获取

        trainer = Trainer(base_net.collect_params(), 'rmsprop',
                          {'learning_rate': 1e-4})
        loss_func = SigmoidBinaryCrossEntropyLoss()

        lr_steps = [10, 20, 30, np.inf]  # gradually decay the learning rate
        lr_factor = 0.75
        lr_counter = 0

        n_batch = int(len_td / self.batch_size)

        self.print_info('Training - samples: {}, batch size: {}, batches: {}'.format(
            len_td, self.batch_size, n_batch))

        for epoch in range(self.epochs):

            if epoch == lr_steps[lr_counter]:  # decay the learning rate at the scheduled epochs
                trainer.set_learning_rate(trainer.learning_rate * lr_factor)
                lr_counter += 1

            e_loss, e_r, e_p, e_f1 = 0, 0, 0, 0  # epoch-level accumulators

            for i, batch in enumerate(train_data):

                data, labels = batch[0], batch[1].astype('float32')

                data = split_and_load(data,
                                      ctx_list=self.ctx,
                                      batch_axis=0,
                                      even_split=False)
                labels = split_and_load(labels,
                                        ctx_list=self.ctx,
                                        batch_axis=0,
                                        even_split=False)

                with autograd.record():  # record the forward pass for autograd
                    outputs = [base_net(X) for X in data]
                    bc_loss = [
                        loss_func(yhat, y) for yhat, y in zip(outputs, labels)
                    ]

                for l in bc_loss:
                    l.backward()

                trainer.step(self.batch_size)

                batch_loss = sum([l.mean().asscalar() for l in bc_loss]) / len(
                    bc_loss)  # mean loss over this batch
                e_loss += batch_loss

                br, bp, bf1 = self.get_batch_rpf(outputs, labels)

                e_r += br
                e_p += bp
                e_f1 += bf1

                self.print_info(
                    'batch: {}, loss: {:.5f}, recall: {:.2f}, precision: {:.2f}, f1: {:.2f}'
                    .format(i, batch_loss, br, bp, bf1))

                n_batch = i + 1  # number of batches processed

            e_loss /= n_batch
            e_r /= n_batch
            e_p /= n_batch
            e_f1 /= n_batch

            self.print_info(
                'epoch: {}, loss: {:.5f}, recall: {:.2f}, precision: {:.2f}, f1: {:.2f}'
                .format(epoch, e_loss, e_r, e_p, e_f1))
            e_r, e_p, e_f1 = self.val_net(base_net, val_data, len_vd)

            self.save_net_and_params(base_net, epoch, e_f1,
                                     name='multilabel')  # save the network and parameters
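The method above leans on get_batch_rpf, which is not shown; it evidently reduces the multi-label outputs and targets to batch-level recall, precision, and F1. A hedged sketch of such a helper, assuming a 0.5 decision threshold on the sigmoid of the logits and micro-averaging (neither of which is confirmed by the original code):

from mxnet import ndarray as nd

def get_batch_rpf(outputs, labels, threshold=0.5):
    # Micro-averaged recall/precision/F1 over a (possibly multi-device) batch
    # of multi-label logits; illustrative sketch only.
    tp = fp = fn = 0.0
    for out, lab in zip(outputs, labels):
        pred = (out.sigmoid() > threshold).astype('float32')
        lab = lab.astype('float32')
        tp += nd.sum(pred * lab).asscalar()
        fp += nd.sum(pred * (1 - lab)).asscalar()
        fn += nd.sum((1 - pred) * lab).asscalar()
    recall = tp / (tp + fn + 1e-8)
    precision = tp / (tp + fp + 1e-8)
    f1 = 2 * recall * precision / (recall + precision + 1e-8)
    return recall, precision, f1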
def get_manifold(X):
	from mxnet import nd, Context
	from mxnet import ndarray as F
	from mxnet.gluon import Block, nn
	from mxnet.initializer import Uniform

	class Model(Block):
		def __init__(self, num_dim, **kwargs):
			super(Model, self).__init__(**kwargs)
			wi1 = Uniform(0.25)
			wi2 = Uniform(0.1)
			with self.name_scope():
				self.encoder1 = nn.Dense(num_dim//4, in_units=num_dim, weight_initializer=wi1)
				self.encoder2 = nn.Dense(num_dim//16, in_units=num_dim//4, weight_initializer=wi1)
				self.encoder3 = nn.Dense(num_dim//64, in_units=num_dim//16, weight_initializer=wi2)
				self.encoder4 = nn.Dense(num_dim//256, in_units=num_dim//64, weight_initializer=wi2)
				self.decoder4 = nn.Dense(num_dim//64, in_units=num_dim//256, weight_initializer=wi2)
				self.decoder3 = nn.Dense(num_dim//16, in_units=num_dim//64, weight_initializer=wi2)
				self.decoder2 = nn.Dense(num_dim//4, in_units=num_dim//16, weight_initializer=wi1)
				self.decoder1 = nn.Dense(num_dim, in_units=num_dim//4, weight_initializer=wi1)
			self.layers = [(self.encoder1,self.decoder1),
						(self.encoder2,self.decoder2),
						(self.encoder3,self.decoder3),
						(self.encoder4,self.decoder4)]

			for layer in self.layers:
				self.register_child(layer[0])
				self.register_child(layer[1])
				
		def onelayer(self, x, layer):
			xx = F.tanh(layer[0](x))
			return layer[1](xx)
		
		def oneforward(self, x, layer):
			return F.tanh(layer[0](x))
		
		def forward(self, x):
			n_layer = len(self.layers)
			for i in range(n_layer):
				x = F.tanh(self.layers[i][0](x))
			for i in range(n_layer-1):
				x = F.tanh(self.layers[n_layer-i-1][1](x))
			return self.layers[0][1](x)
		
		def manifold(self, x):
			n_layer = len(self.layers)
			for i in range(n_layer-1):
				x = F.tanh(self.layers[i][0](x))
			return self.layers[n_layer-1][0](x)

	from mxnet import autograd
	from mxnet import gpu, cpu
	from mxnet.gluon import Trainer
	from mxnet.gluon.loss import L2Loss

	# Stacked AutoEncoder
	#model.initialize(ctx=[cpu(0),cpu(1),cpu(2),cpu(3)])
	#ctx = [gpu(1)]
	#ctx = [cpu(i) for i in range(16)]
	with  Context(gpu(0)) as ctx:
		model = Model(X.shape[1])
		model.initialize(ctx=ctx)#,cpu(2),cpu(3)])

		# Select training algorithm
		trainer = Trainer(model.collect_params(),'adam')
		loss_func = L2Loss()

		# Start Pretraining
		print('start pretraining of StackedAE...')
		loss_n = [] # for log

		buffer = nd.array(X.values)
		for layer_id, layer in enumerate(model.layers):
			print('layer %d of %d...'%(layer_id+1,len(model.layers)))
			trainer.set_learning_rate(0.02)
			for epoch in range(1, epochs[layer_id] + 1):
				# shuffle indices over all samples
				indexs = np.random.permutation(buffer.shape[0])
				for bs in range(0,buffer.shape[0],batch_size):
					be = min(buffer.shape[0],bs+batch_size)
					data = buffer[indexs[bs:be]]
					# forward
					with autograd.record():
						output = model.onelayer(data, layer)
						# make loss
						loss = loss_func(output, data)
						# for log
						loss_n.append(np.mean(loss.asnumpy()))
						del output
					# backward
					loss.backward()
					# optimizer step for this batch
					trainer.step(batch_size, ignore_stale_grad=True)
					del data, loss
				# show log
				print('%d/%d epoch loss=%f...'%(epoch,epochs[layer_id],np.mean(loss_n)))
				loss_n = []
				del bs, be, indexs
			buffer = model.oneforward(buffer, layer)
		del layer, loss_n, buffer

		print('start training of StackedAE...')
		loss_n = []
		buffer = nd.array(X.values)
		trainer.set_learning_rate(0.02)
		for epoch in range(1, epochs[-1] + 1):
			# shuffle indices over all samples
			indexs = np.random.permutation(buffer.shape[0])
			for bs in range(0,buffer.shape[0],batch_size):
				be = min(buffer.shape[0],bs+batch_size)
				data = buffer[indexs[bs:be]]
				# forward
				with autograd.record():
					output = model(data)
					# make loss
					loss = loss_func(output, data)
					# for log
					loss_n.append(np.mean(loss.asnumpy()))
					del output
				# backward
				loss.backward()
				# optimizer step for this batch
				trainer.step(batch_size, ignore_stale_grad=True)
				del data, loss
			# show log
			print('%d/%d epoch loss=%f...'%(epoch,epochs[-1],np.mean(loss_n)))
			loss_n = []
			del bs, be, indexs
		del trainer, loss_func, loss_n, buffer

		print('making manifold...')
		manifold_X = pd.DataFrame()
		for bs in range(0,X.shape[0],batch_size):
			be = min(X.shape[0],bs + batch_size)
			nx = nd.array(X.iloc[bs:be].values)
			df = pd.DataFrame(model.manifold(nx).asnumpy())
			manifold_X = manifold_X.append(df, ignore_index=True, sort=False)
			del be, df, nx
		del model, bs
		return manifold_X
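get_manifold reads epochs, batch_size, np, and pd from the enclosing module, so it only runs where those names are defined. A minimal usage sketch under those assumptions (the data and the schedule values below are placeholders, and a GPU is assumed because the function pins its context to gpu(0)):

import numpy as np
import pandas as pd

# Per-layer pretraining epochs plus the final fine-tuning epochs, and the
# mini-batch size, are read by get_manifold from this enclosing scope.
epochs = [20, 20, 20, 20, 40]
batch_size = 64

X = pd.DataFrame(np.random.rand(512, 1024).astype('float32'))  # placeholder data
manifold_X = get_manifold(X)  # embedding of shape (512, 1024 // 256) = (512, 4)
print(manifold_X.shape)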
    encoder.hybridize()
    decoder.hybridize()
    merger.hybridize()
    print("net has been hybridized")
    print(
        '[INFO] %s Recover complete. Current epoch #%d, Best IoU = %.4f at epoch #%d.'
        % (dt.now(), init_epoch, best_iou, best_epoch))

# Training loop
for epoch_idx in range(int(init_epoch), cfg.TRAIN.NUM_EPOCHES):
    epoch_start_time = time.time()
    # losses
    encoder_losses = utils.network_utils.AverageMeter()
    refiner_losses = utils.network_utils.AverageMeter()
    if epoch_idx % cfg.TRAIN.ENCODER_LR_MILESTONES[0] == 0:
        encoder_trainer.set_learning_rate(cfg.TRAIN.ENCODER_LEARNING_RATE *
                                          cfg.TRAIN.GAMMA)
        decoder_trainer.set_learning_rate(cfg.TRAIN.DECODER_LEARNING_RATE *
                                          cfg.TRAIN.GAMMA)
        merger_trainer.set_learning_rate(cfg.TRAIN.ENCODER_LEARNING_RATE *
                                         cfg.TRAIN.GAMMA)
        refiner_trainer.set_learning_rate(cfg.TRAIN.DECODER_LEARNING_RATE *
                                          cfg.TRAIN.GAMMA)
    n_batches = len(train_data_loader)

    for batch_idx, (idx, rendering_images,
                    ground_truth_volumes) in enumerate(train_data_loader):
        # Measure data time
        # Get data from data loader
        rendering_images = rendering_images.as_in_context(ctx)
        ground_truth_volumes = ground_truth_volumes.as_in_context(ctx)
        # Train the encoder, decoder, refiner, and merger
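The loop above accumulates per-epoch losses in utils.network_utils.AverageMeter(), which is not shown here. It is presumably the usual running-average accumulator; a sketch of that pattern (the attribute and method names are assumptions):

class AverageMeter:
    # Running average of a scalar metric (illustrative sketch of the assumed helper).
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count if self.count else 0.0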
Example #6
def train_ssd300_coco(net, train_data_loader, val_data_loader, eval_metric,
                      ctx, consts, logger):
    net.collect_params().reset_ctx(ctx)
    net_optimizer = Trainer(net.collect_params(),
                            optimizer='sgd',
                            optimizer_params={
                                'learning_rate': consts.LR,
                                'wd': consts.WD,
                                'momentum': consts.MOMENTUM
                            })

    lr_decay = float(consts.LR_DECAY)
    lr_steps = sorted(
        [float(ls) for ls in consts.LR_DECAY_EPOCH if ls.strip()])

    mbox_loss = SSDMultiBoxLoss()
    ce_metric = mx.metric.Loss('CrossEntropy')
    smoothl1_metric = mx.metric.Loss('SmoothL1')

    best_mean_avg_prec = [0]
    logger.info(consts)
    logger.info(f'Starting from [Epoch {consts.START_EPOCH}]')

    for epoch in range(consts.START_EPOCH, consts.EPOCHS):
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = net_optimizer.learning_rate * lr_decay
            lr_steps.pop(0)
            net_optimizer.set_learning_rate(new_lr)
            logger.info(f'[Epoch {epoch}] learning rate = {new_lr}')
        ce_metric.reset()
        smoothl1_metric.reset()
        epoch_tic = time.time()
        batch_tic = time.time()
        net.hybridize(static_alloc=True, static_shape=True)

        for i, batch in enumerate(train_data_loader):
            data = utils.split_and_load(batch[0], ctx_list=ctx)
            cls_targets = utils.split_and_load(batch[1], ctx_list=ctx)
            box_targets = utils.split_and_load(batch[2], ctx_list=ctx)

            with autograd.record():
                cls_predictions = []
                box_predictions = []

                for x in data:
                    cls_prediction, box_prediction, _ = net(x)
                    cls_predictions.append(cls_prediction)
                    box_predictions.append(box_prediction)

                sum_loss, cls_loss, box_loss = mbox_loss(
                    cls_predictions, box_predictions, cls_targets, box_targets)
                autograd.backward(sum_loss)

            net_optimizer.step(1)

            ce_metric.update(0, [l * consts.BATCH_SIZE for l in cls_loss])
            smoothl1_metric.update(0,
                                   [l * consts.BATCH_SIZE for l in box_loss])

            if not (i + 1) % consts.LOG_INTERVAL:
                ce_name, ce_loss = ce_metric.get()
                sl1_name, sl1_loss = smoothl1_metric.get()
                t_now = time.time()
                speed = consts.BATCH_SIZE / (t_now - batch_tic)
                logger.info(
                    f'[Epoch {epoch}][Batch {i}], Speed: {speed:.3f} samples/sec, '
                    f'{ce_name}={ce_loss:.3f}, {sl1_name}={sl1_loss:.3f}')

            batch_tic = time.time()

        ce_name, ce_loss = ce_metric.get()
        sl1_name, sl1_loss = smoothl1_metric.get()
        epoch_time = time.time() - epoch_tic
        logger.info(f'[Epoch {epoch}], epoch time: {epoch_time:.3f}, '
                    f'{ce_name}={ce_loss:.3f}, {sl1_name}={sl1_loss:.3f}')

        if not epoch % consts.VAL_INTERVAL or not epoch % consts.SAVE_INTERVAL:
            mean_avg_prec_name, mean_avg_prec = validate_ssd300_coco(
                net, val_data_loader, ctx, eval_metric)
            val_msg = '\n'.join([
                f'{k}={v}' for k, v in zip(mean_avg_prec_name, mean_avg_prec)
            ])
            logger.info(f'[Epoch {epoch}] validation: \n{val_msg}')
            curr_mean_avg_prec = float(mean_avg_prec[-1])
        else:
            curr_mean_avg_prec = 0

        save_params(net, best_mean_avg_prec, curr_mean_avg_prec, epoch,
                    consts.SAVE_INTERVAL, consts.SAVE_PREFIX)
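save_params is assumed to checkpoint the network and keep track of the best mAP seen so far (best_mean_avg_prec is passed as a one-element list so the helper can update it in place). A hedged sketch of such a helper in the style of GluonCV's SSD training script (the file naming is an assumption):

def save_params(net, best_map, current_map, epoch, save_interval, prefix):
    # Keep the best-mAP weights and periodic checkpoints (illustrative sketch).
    current_map = float(current_map)
    if current_map > best_map[0]:
        best_map[0] = current_map
        net.save_parameters('{:s}_best.params'.format(prefix))
    if save_interval and (epoch + 1) % save_interval == 0:
        net.save_parameters('{:s}_{:04d}_{:.4f}.params'.format(prefix, epoch, current_map))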
Example #7
        loss_list.append(sum(loss_list_tmp) / len(loss_list_tmp))

        test_loss_list_tmp = []
        for x, y in testing_dataloader:
            output = net(x)
            test_loss_list_tmp.append(loss(output, y).asscalar())

        test_loss_list.append(
            sum(test_loss_list_tmp) / len(test_loss_list_tmp))

        print('epoch: %s' % (epoch))
        print('current epoch is %s' % (epoch + 1))
        print('training loss(MSE):', loss_list[-1])
        print('testing loss(MSE):', test_loss_list[-1])
        print('time:', time.time() - t)
        print()

        with open('results.log', 'a') as f:
            f.write('training loss(MSE): %s' % (loss_list[-1]))
            f.write('\n')
            f.write('testing loss(MSE): %s' % (test_loss_list[-1]))
            f.write('\n\n')

        if (epoch + 1) % 5 == 0:
            filename = 'stgcn_params/stgcn.params_%s' % (epoch)
            net.save_params(filename)

        if (epoch + 1) % decay_interval == 0:
            trainer.set_learning_rate(trainer.learning_rate * decay_rate)
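Example #7 starts in the middle of its training loop, so the network, trainer, loss, data loaders, and decay settings it references are defined further up in its source. A self-contained scaffold showing the loop structure the fragment assumes, with toy stand-ins for the real STGCN model and traffic data (the optimizer, loss choice, and decay values are assumptions):

import time
from mxnet import autograd, gluon, nd

# Toy stand-ins for the pieces the fragment assumes; only the loop shape matches.
net = gluon.nn.Dense(1)
net.initialize()
loss = gluon.loss.L2Loss()                      # the fragment logs its loss as MSE
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': 1e-3})
decay_interval, decay_rate = 10, 0.7            # assumed schedule

dataset = gluon.data.ArrayDataset(nd.random.uniform(shape=(64, 8)),
                                  nd.random.uniform(shape=(64, 1)))
training_dataloader = gluon.data.DataLoader(dataset, batch_size=16)
testing_dataloader = gluon.data.DataLoader(dataset, batch_size=16)

loss_list, test_loss_list = [], []
for epoch in range(2):
    t = time.time()
    loss_list_tmp = []
    for x, y in training_dataloader:
        with autograd.record():
            l = loss(net(x), y)
        l.backward()
        trainer.step(x.shape[0])
        loss_list_tmp.append(l.mean().asscalar())
    # ...the fragment above picks up from this point (test loss, logging, checkpointing).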