def train(self, epochs, wd, params=None, init_epochs=0, bs=4):
    trainer = Trainer(self.net.collect_params(params), self.optimizer, {'wd': wd})
    metrics = mx.metric.create(self.metrics)
    self.history = [[], []]
    iteration = 1
    val_iter = 1
    avg_mom = 0.98
    tavg_loss, vavg_loss = 0., 0.
    for epoch in range(epochs):
        for data, label in self.loader[0]:
            data = data.as_in_context(self.ctx)
            label = label.as_in_context(self.ctx)
            with autograd.record():
                output = self.net(data)
                loss = self.criterion(output, label)
            lr = self.scheduler(iteration)
            trainer.set_learning_rate(lr)
            loss.backward()
            trainer.step(bs)
            # Debiased exponential moving average of the training loss
            tavg_loss = tavg_loss * avg_mom + (1 - avg_mom) * nd.mean(loss).asscalar()
            self.history[0].append(tavg_loss / (1 - avg_mom ** iteration))
            iteration += 1
        metrics.reset()
        for data, label in self.loader[1]:
            data = data.as_in_context(self.ctx)
            label = label.as_in_context(self.ctx)
            output = self.net(data)
            loss = self.criterion(output, label)
            vavg_loss = vavg_loss * avg_mom + (1 - avg_mom) * nd.mean(loss).asscalar()
            self.history[1].append(vavg_loss / (1 - avg_mom ** val_iter))
            val_iter += 1
            metrics.update(preds=output, labels=label)
        status = [init_epochs + epoch + 1, self.history[0][-1], self.history[1][-1]]
        if self.metrics is not None:
            status.append(metrics.get()[1])
        print('{}'.format(status))
    return self.history
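# The train() method above assumes self.scheduler is a callable mapping the
# 1-based iteration counter to a learning rate. A minimal sketch of such a
# callable (cosine annealing; the class name and constructor arguments are
# illustrative, not part of the original code):
import math

class CosineScheduler:
    """Anneal the learning rate from max_lr to min_lr over total_iterations."""

    def __init__(self, max_lr, total_iterations, min_lr=0.0):
        self.max_lr = max_lr
        self.min_lr = min_lr
        self.total = total_iterations

    def __call__(self, iteration):
        # Clamp so iterations beyond the schedule keep the final rate.
        t = min(iteration, self.total) / self.total
        return self.min_lr + 0.5 * (self.max_lr - self.min_lr) * (1 + math.cos(math.pi * t))

# Usage sketch: learner.scheduler = CosineScheduler(1e-3, epochs * len(train_loader))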
def train(net, train_dataloader, valid_dataloader, ctx_list, args):
    """Training pipeline."""
    # Optimizer
    trainer = Trainer(net.collect_params(), 'sgd', {
        'learning_rate': args.lr,
        'wd': args.wd,
        'momentum': args.momentum
    })
    # Loss and metrics
    acc_metric = AccuracyMetric()
    loss_metric = mx.metric.Loss('SoftMaxCrossEntropyLoss')
    valid_metric = ValidMetric()
    cross_entropy_loss = gloss.SoftmaxCrossEntropyLoss()
    metric1 = [loss_metric]
    metric2 = [acc_metric]
    # Set up logging to stdout and a file
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    if args.verbose:
        logger.info('Trainable parameters:')
        logger.info(net.collect_params().keys())
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    logger.info('Training on {}'.format(ctx_list))
    best_acc = [0]  # best validation accuracy so far
    lr_steps = sorted(
        [int(step) for step in args.lr_decay_epoch.split(',') if step.strip()])
    lr_decay = float(args.lr_decay)
    for epoch in range(args.start_epoch, args.epochs):
        ttime = time.time()
        btime = time.time()
        # Decay the learning rate at the scheduled epochs
        if lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info('[Epoch {}] set learning rate to {:.4f}'.format(
                epoch, new_lr))
        acc_metric.reset()
        if args.hybrid:
            net.hybridize(static_alloc=True)
        # Iterate over mini-batches of [data, label]
        for i, batch in enumerate(train_dataloader):
            batch_size = len(batch[0])
            batch = split_and_load_data(batch, ctx_list, batch_size)
            losses = []
            metrics = []
            with autograd.record():
                for data, cls_label in zip(*batch):
                    # Forward pass
                    pred_scores = net(data)
                    loss = cross_entropy_loss(pred_scores, cls_label)
                    # Record loss and predictions for the metric updates below
                    losses.append(loss)
                    metrics.append([[cls_label], [pred_scores]])
            # Backward pass and parameter update
            autograd.backward(losses)
            trainer.step(batch_size)
            # Update metrics
            for record in metrics:
                acc_metric.update(record[0], record[1])
            for record in losses:
                loss_metric.update(0, record)
            if args.log_interval and not (i + 1) % args.log_interval:
                info = ','.join([
                    '{}={:.3f}'.format(*metric.get())
                    for metric in metric1 + metric2
                ])
                msg = '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'.format(
                    epoch, i,
                    args.log_interval * batch_size / (time.time() - btime), info)
                logger.info(msg)
                btime = time.time()
        info = ','.join(['{}={:.3f}'.format(*loss_metric.get())])
        msg = '[Epoch {}] Training cost: {:.3f}, {}'.format(
            epoch, time.time() - ttime, info)
        logger.info(msg)
        if args.val_interval and not (epoch + 1) % args.val_interval:
            name, current_acc = evaluate(net, valid_dataloader, valid_metric,
                                         ctx_list, args.hybrid)
            info = '{}={:.3f}'.format(name, current_acc)
            msg = '[Epoch {}] Validation {}.'.format(epoch, info)
            logger.info(msg)
        else:
            current_acc = 0
        save_parameters(net, logger, best_acc, current_acc, epoch,
                        args.save_interval, args.save_prefix)
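# split_and_load_data() is referenced above but not defined here. A plausible
# sketch built on gluon.utils.split_and_load (an assumption, not the original
# helper); it returns a pair of lists so the caller can iterate via zip(*batch):
from mxnet import gluon

def split_and_load_data(batch, ctx_list, batch_size):
    """Split a (data, label) batch across contexts. batch_size is accepted
    for signature parity with the call site but is not needed here."""
    data = gluon.utils.split_and_load(batch[0], ctx_list=ctx_list, even_split=False)
    labels = gluon.utils.split_and_load(batch[1], ctx_list=ctx_list, even_split=False)
    return data, labels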
def train_model_for_ml(self):
    """Train the model (multi-label)."""
    base_net = self.get_base_net()  # base network
    train_data, len_td = self.get_train_data(self.batch_size)  # training data, fetched in batches
    val_data, len_vd = self.get_val_data(self.batch_size)  # validation data, fetched in batches
    trainer = Trainer(base_net.collect_params(), 'rmsprop',
                      {'learning_rate': 1e-4})
    loss_func = SigmoidBinaryCrossEntropyLoss()
    lr_steps = [10, 20, 30, np.inf]  # epochs at which to decay the learning rate
    lr_factor = 0.75
    lr_counter = 0
    n_batch = int(len_td / self.batch_size)
    self.print_info('Training - samples: {}, batch size: {}, batches: {}'.format(
        len_td, self.batch_size, n_batch))
    for epoch in range(self.epochs):
        if epoch == lr_steps[lr_counter]:  # decay the learning rate step by step
            trainer.set_learning_rate(trainer.learning_rate * lr_factor)
            lr_counter += 1
        e_loss, e_r, e_p, e_f1 = 0, 0, 0, 0  # per-epoch accumulators
        for i, batch in enumerate(train_data):
            data, labels = batch[0], batch[1].astype('float32')
            data = split_and_load(data, ctx_list=self.ctx, batch_axis=0,
                                  even_split=False)
            labels = split_and_load(labels, ctx_list=self.ctx, batch_axis=0,
                                    even_split=False)
            with autograd.record():  # record for gradient computation
                outputs = [base_net(X) for X in data]
                bc_loss = [loss_func(yhat, y)
                           for yhat, y in zip(outputs, labels)]
            for l in bc_loss:
                l.backward()
            trainer.step(self.batch_size)
            batch_loss = sum([l.mean().asscalar()
                              for l in bc_loss]) / len(bc_loss)  # batch loss
            e_loss += batch_loss
            br, bp, bf1 = self.get_batch_rpf(outputs, labels)
            e_r += br
            e_p += bp
            e_f1 += bf1
            self.print_info(
                'batch: {}, loss: {:.5f}, recall: {:.2f}, precision: {:.2f}, f1: {:.2f}'
                .format(i, batch_loss, br, bp, bf1))
        n_batch = i + 1  # number of batches seen
        e_loss /= n_batch
        e_r /= n_batch
        e_p /= n_batch
        e_f1 /= n_batch
        self.print_info(
            'epoch: {}, loss: {:.5f}, recall: {:.2f}, precision: {:.2f}, f1: {:.2f}'
            .format(epoch, e_loss, e_r, e_p, e_f1))
        e_r, e_p, e_f1 = self.val_net(base_net, val_data, len_vd)
        self.save_net_and_params(base_net, epoch, e_f1, name='multilabel')  # save the network
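# get_batch_rpf() is called above but not shown. A sketch of what it likely
# computes (micro-averaged recall/precision/F1 at a 0.5 threshold); it assumes
# the network emits pre-sigmoid scores, matching SigmoidBinaryCrossEntropyLoss:
from mxnet import nd

def get_batch_rpf(self, outputs, labels, threshold=0.5):
    tp = fp = fn = 0.0
    for yhat, y in zip(outputs, labels):
        pred = (yhat.sigmoid() > threshold).astype('float32')
        tp += nd.sum(pred * y).asscalar()        # true positives
        fp += nd.sum(pred * (1 - y)).asscalar()  # false positives
        fn += nd.sum((1 - pred) * y).asscalar()  # false negatives
    recall = tp / (tp + fn + 1e-8)
    precision = tp / (tp + fp + 1e-8)
    f1 = 2 * precision * recall / (precision + recall + 1e-8)
    return recall, precision, f1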
def get_manifold(X):
    # NOTE: `epochs` (a list of per-stage epoch counts) and `batch_size` are
    # expected in the enclosing scope, as are numpy (np) and pandas (pd).
    from mxnet import nd, Context
    from mxnet import ndarray as F
    from mxnet.gluon import Block, nn
    from mxnet.initializer import Uniform

    class Model(Block):
        def __init__(self, num_dim, **kwargs):
            super(Model, self).__init__(**kwargs)
            wi1 = Uniform(0.25)
            wi2 = Uniform(0.1)
            with self.name_scope():
                self.encoder1 = nn.Dense(num_dim // 4, in_units=num_dim, weight_initializer=wi1)
                self.encoder2 = nn.Dense(num_dim // 16, in_units=num_dim // 4, weight_initializer=wi1)
                self.encoder3 = nn.Dense(num_dim // 64, in_units=num_dim // 16, weight_initializer=wi2)
                self.encoder4 = nn.Dense(num_dim // 256, in_units=num_dim // 64, weight_initializer=wi2)
                self.decoder4 = nn.Dense(num_dim // 64, in_units=num_dim // 256, weight_initializer=wi2)
                self.decoder3 = nn.Dense(num_dim // 16, in_units=num_dim // 64, weight_initializer=wi2)
                self.decoder2 = nn.Dense(num_dim // 4, in_units=num_dim // 16, weight_initializer=wi1)
                self.decoder1 = nn.Dense(num_dim, in_units=num_dim // 4, weight_initializer=wi1)
            self.layers = [(self.encoder1, self.decoder1),
                           (self.encoder2, self.decoder2),
                           (self.encoder3, self.decoder3),
                           (self.encoder4, self.decoder4)]
            for layer in self.layers:
                self.register_child(layer[0])
                self.register_child(layer[1])

        def onelayer(self, x, layer):
            # Encode with one layer, then immediately decode with its pair.
            xx = F.tanh(layer[0](x))
            return layer[1](xx)

        def oneforward(self, x, layer):
            return F.tanh(layer[0](x))

        def forward(self, x):
            n_layer = len(self.layers)
            for i in range(n_layer):
                x = F.tanh(self.layers[i][0](x))
            for i in range(n_layer - 1):
                x = F.tanh(self.layers[n_layer - i - 1][1](x))
            return self.layers[0][1](x)

        def manifold(self, x):
            # Encode down to the bottleneck, without the final nonlinearity.
            n_layer = len(self.layers)
            for i in range(n_layer - 1):
                x = F.tanh(self.layers[i][0](x))
            return self.layers[n_layer - 1][0](x)

    from mxnet import autograd
    from mxnet import gpu, cpu
    from mxnet.gluon import Trainer
    from mxnet.gluon.loss import L2Loss

    # Stacked autoencoder
    with Context(gpu(0)) as ctx:
        model = Model(X.shape[1])
        model.initialize(ctx=ctx)
        # Select the training algorithm
        trainer = Trainer(model.collect_params(), 'adam')
        loss_func = L2Loss()

        # Greedy layer-wise pretraining
        print('start pretraining of StackedAE...')
        loss_n = []  # log buffer
        buffer = nd.array(X.values)
        for layer_id, layer in enumerate(model.layers):
            print('layer %d of %d...' % (layer_id + 1, len(model.layers)))
            trainer.set_learning_rate(0.02)
            for epoch in range(1, epochs[layer_id] + 1):
                # Shuffled indices over all samples
                indices = np.random.permutation(buffer.shape[0])
                for bs in range(0, buffer.shape[0], batch_size):
                    be = min(buffer.shape[0], bs + batch_size)
                    data = buffer[indices[bs:be]]
                    # Forward pass and loss (inside record() so gradients flow)
                    with autograd.record():
                        output = model.onelayer(data, layer)
                        loss = loss_func(output, data)
                    loss_n.append(np.mean(loss.asnumpy()))
                    del output
                    # Backward pass and one optimization step
                    loss.backward()
                    trainer.step(batch_size, ignore_stale_grad=True)
                    del data, loss
                print('%d/%d epoch loss=%f...' % (epoch, epochs[layer_id], np.mean(loss_n)))
                loss_n = []
                del bs, be, indices
            # Feed the encoded representation to the next layer
            buffer = model.oneforward(buffer, layer)
        del layer, loss_n, buffer

        # Fine-tune the whole stack end to end
        print('start training of StackedAE...')
        loss_n = []
        buffer = nd.array(X.values)
        trainer.set_learning_rate(0.02)
        for epoch in range(1, epochs[-1] + 1):
            indices = np.random.permutation(buffer.shape[0])
            for bs in range(0, buffer.shape[0], batch_size):
                be = min(buffer.shape[0], bs + batch_size)
                data = buffer[indices[bs:be]]
                with autograd.record():
                    output = model(data)
                    loss = loss_func(output, data)
                loss_n.append(np.mean(loss.asnumpy()))
                del output
                loss.backward()
                trainer.step(batch_size, ignore_stale_grad=True)
                del data, loss
            print('%d/%d epoch loss=%f...' % (epoch, epochs[-1], np.mean(loss_n)))
            loss_n = []
            del bs, be, indices
        del trainer, loss_func, loss_n, buffer

        # Project all samples through the encoder to get the manifold
        print('making manifold...')
        manifold_X = pd.DataFrame()
        for bs in range(0, X.shape[0], batch_size):
            be = min(X.shape[0], bs + batch_size)
            nx = nd.array(X.iloc[bs:be].values)
            df = pd.DataFrame(model.manifold(nx).asnumpy())
            # DataFrame.append is removed in pandas >= 2.0; use pd.concat instead
            manifold_X = pd.concat([manifold_X, df], ignore_index=True, sort=False)
            del be, df, nx
        del model, bs
    return manifold_X
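# Usage sketch for get_manifold(). The function reads `epochs` (one epoch count
# per autoencoder layer plus one for end-to-end fine-tuning) and `batch_size`
# from the enclosing scope; the values below are illustrative only.
import numpy as np
import pandas as pd

epochs = [20, 20, 20, 20, 30]
batch_size = 256

X = pd.DataFrame(np.random.rand(10000, 1024).astype('float32'))
manifold = get_manifold(X)  # bottleneck width is 1024 // 256 = 4 columns
print(manifold.shape)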
encoder.hybridize()
decoder.hybridize()
merger.hybridize()
print('net has been hybridized')
print('[INFO] %s Recover complete. Current epoch #%d, Best IoU = %.4f at epoch #%d.' %
      (dt.now(), init_epoch, best_iou, best_epoch))

# Training loop
for epoch_idx in range(int(init_epoch), cfg.TRAIN.NUM_EPOCHES):
    epoch_start_time = time.time()
    # Running averages of the losses
    encoder_losses = utils.network_utils.AverageMeter()
    refiner_losses = utils.network_utils.AverageMeter()
    # At each milestone interval, set the learning rates to base LR * GAMMA
    if epoch_idx % cfg.TRAIN.ENCODER_LR_MILESTONES[0] == 0:
        encoder_trainer.set_learning_rate(cfg.TRAIN.ENCODER_LEARNING_RATE * cfg.TRAIN.GAMMA)
        decoder_trainer.set_learning_rate(cfg.TRAIN.DECODER_LEARNING_RATE * cfg.TRAIN.GAMMA)
        merger_trainer.set_learning_rate(cfg.TRAIN.ENCODER_LEARNING_RATE * cfg.TRAIN.GAMMA)
        refiner_trainer.set_learning_rate(cfg.TRAIN.DECODER_LEARNING_RATE * cfg.TRAIN.GAMMA)
    n_batches = len(train_data_loader)
    for batch_idx, (idx, rendering_images, ground_truth_volumes) in enumerate(train_data_loader):
        # Get data from the data loader and move it to the target device
        rendering_images = rendering_images.as_in_context(ctx)
        ground_truth_volumes = ground_truth_volumes.as_in_context(ctx)
        # Train the encoder, decoder, refiner, and merger
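# utils.network_utils.AverageMeter is used above but not defined here; it is
# assumed to be the usual running-average tracker, sketched below:
class AverageMeter:
    """Track the latest value and running average of a scalar."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n

    @property
    def avg(self):
        return self.sum / max(self.count, 1)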
def train_ssd300_coco(net, train_data_loader, val_data_loader, eval_metric,
                      ctx, consts, logger):
    net.collect_params().reset_ctx(ctx)
    net_optimizer = Trainer(net.collect_params(),
                            optimizer='sgd',
                            optimizer_params={
                                'learning_rate': consts.LR,
                                'wd': consts.WD,
                                'momentum': consts.MOMENTUM
                            })
    lr_decay = float(consts.LR_DECAY)
    lr_steps = sorted(
        [float(ls) for ls in consts.LR_DECAY_EPOCH if ls.strip()])
    mbox_loss = SSDMultiBoxLoss()
    ce_metric = mx.metric.Loss('CrossEntropy')
    smoothl1_metric = mx.metric.Loss('SmoothL1')
    best_mean_avg_prec = [0]
    logger.info(consts)
    logger.info(f'Starting from [Epoch {consts.START_EPOCH}]')
    for epoch in range(consts.START_EPOCH, consts.EPOCHS):
        # Apply every learning-rate decay step that is due
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = net_optimizer.learning_rate * lr_decay
            lr_steps.pop(0)
            net_optimizer.set_learning_rate(new_lr)
            logger.info(f'[Epoch {epoch}] learning rate = {new_lr}')
        ce_metric.reset()
        smoothl1_metric.reset()
        epoch_tic = time.time()
        batch_tic = time.time()
        net.hybridize(static_alloc=True, static_shape=True)
        for i, batch in enumerate(train_data_loader):
            data = utils.split_and_load(batch[0], ctx_list=ctx)
            cls_targets = utils.split_and_load(batch[1], ctx_list=ctx)
            box_targets = utils.split_and_load(batch[2], ctx_list=ctx)
            with autograd.record():
                cls_predictions = []
                box_predictions = []
                for x in data:
                    cls_prediction, box_prediction, _ = net(x)
                    cls_predictions.append(cls_prediction)
                    box_predictions.append(box_prediction)
                sum_loss, cls_loss, box_loss = mbox_loss(
                    cls_predictions, box_predictions, cls_targets, box_targets)
                autograd.backward(sum_loss)
            net_optimizer.step(1)
            ce_metric.update(0, [l * consts.BATCH_SIZE for l in cls_loss])
            smoothl1_metric.update(0, [l * consts.BATCH_SIZE for l in box_loss])
            if not (i + 1) % consts.LOG_INTERVAL:
                ce_name, ce_loss = ce_metric.get()
                sl1_name, sl1_loss = smoothl1_metric.get()
                t_now = time.time()
                speed = consts.BATCH_SIZE / (t_now - batch_tic)
                logger.info(
                    f'[Epoch {epoch}][Batch {i}], Speed: {speed:.3f} samples/sec, '
                    f'{ce_name}={ce_loss:.3f}, {sl1_name}={sl1_loss:.3f}')
                batch_tic = time.time()
        ce_name, ce_loss = ce_metric.get()
        sl1_name, sl1_loss = smoothl1_metric.get()
        epoch_time = time.time() - epoch_tic
        logger.info(f'[Epoch {epoch}], epoch time: {epoch_time:.3f}, '
                    f'{ce_name}={ce_loss:.3f}, {sl1_name}={sl1_loss:.3f}')
        if not epoch % consts.VAL_INTERVAL or not epoch % consts.SAVE_INTERVAL:
            mean_avg_prec_name, mean_avg_prec = validate_ssd300_coco(
                net, val_data_loader, ctx, eval_metric)
            val_msg = '\n'.join(
                f'{k}={v}' for k, v in zip(mean_avg_prec_name, mean_avg_prec))
            logger.info(f'[Epoch {epoch}] validation: \n{val_msg}')
            curr_mean_avg_prec = float(mean_avg_prec[-1])
        else:
            curr_mean_avg_prec = 0
        save_params(net, best_mean_avg_prec, curr_mean_avg_prec, epoch,
                    consts.SAVE_INTERVAL, consts.SAVE_PREFIX)
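# save_params() is called above but not defined here. A plausible sketch,
# modeled on the checkpointing pattern common in GluonCV training scripts
# (file names and layout are assumptions). The one-element best_map list lets
# the callee update the caller's best score in place:
def save_params(net, best_map, current_map, epoch, save_interval, prefix):
    current_map = float(current_map)
    if current_map > best_map[0]:
        best_map[0] = current_map
        net.save_parameters('{:s}_best.params'.format(prefix))
    if save_interval and not (epoch + 1) % save_interval:
        net.save_parameters('{:s}_{:04d}_{:.4f}.params'.format(prefix, epoch, current_map))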
loss_list.append(sum(loss_list_tmp) / len(loss_list_tmp))
test_loss_list_tmp = []
for x, y in testing_dataloader:
    output = net(x)
    # Average over the batch before converting to a Python scalar;
    # asscalar() on a per-sample loss vector would raise for batch size > 1
    test_loss_list_tmp.append(loss(output, y).mean().asscalar())
test_loss_list.append(sum(test_loss_list_tmp) / len(test_loss_list_tmp))
print('epoch: %s' % (epoch))
print('current epoch is %s' % (epoch + 1))
print('training loss(MSE):', loss_list[-1])
print('testing loss(MSE):', test_loss_list[-1])
print('time:', time.time() - t)
print()
with open('results.log', 'a') as f:
    f.write('training loss(MSE): %s' % (loss_list[-1]))
    f.write('\n')
    f.write('testing loss(MSE): %s' % (test_loss_list[-1]))
    f.write('\n\n')
if (epoch + 1) % 5 == 0:
    filename = 'stgcn_params/stgcn.params_%s' % (epoch)
    net.save_params(filename)  # save_parameters() is the non-deprecated API in MXNet >= 1.3
if (epoch + 1) % decay_interval == 0:
    trainer.set_learning_rate(trainer.learning_rate * decay_rate)
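# The fragment above begins mid-epoch. A minimal, self-contained sketch of the
# setup and loop head it implies (names taken from the fragment where possible;
# the network, data, optimizer, and epoch count are stand-in assumptions):
import time
from mxnet import autograd, gluon, nd

net = gluon.nn.Dense(1)
net.initialize()
training_dataloader = gluon.data.DataLoader(
    gluon.data.ArrayDataset(nd.random.randn(64, 8), nd.random.randn(64, 1)),
    batch_size=16)
num_epochs = 5

loss = gluon.loss.L2Loss()  # MSE-style loss, matching the 'loss(MSE)' log labels
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': 1e-3})
loss_list, test_loss_list = [], []

for epoch in range(num_epochs):
    t = time.time()
    loss_list_tmp = []
    for x, y in training_dataloader:
        with autograd.record():
            l = loss(net(x), y)
        l.backward()
        trainer.step(x.shape[0])
        loss_list_tmp.append(l.mean().asscalar())
    # ...the fragment above continues from this point.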