def train(model, features, X, X_train, y_train, epochs):
    """Train ``model`` one labelled node at a time, snapshotting features.

    For each epoch, every index ``x`` in the flattened ``X_train`` selects one
    row of ``model(X)``; a sigmoid binary cross-entropy against the matching
    ``y_train`` entry is back-propagated per sample.  After every epoch the
    output of ``features(X)`` is appended, so the returned list contains
    ``epochs + 1`` numpy snapshots of the learned representation.

    :param model: gluon block producing per-node predictions for ``X``
    :param features: gluon block producing the representation to snapshot
    :param X: full input fed to both ``model`` and ``features``
    :param X_train: training node indices (any shape; flattened before use)
    :param y_train: targets aligned with the flattened ``X_train``
    :param epochs: number of passes over ``X_train``
    :return: list of numpy arrays (feature snapshots, one per epoch + initial)
    """
    cross_entropy = SigmoidBinaryCrossEntropyLoss(from_sigmoid=True)
    # NOTE(review): momentum=1 means past gradients never decay, which is an
    # unusual setting — kept as-is, but worth confirming it is intentional.
    trainer = Trainer(model.collect_params(), 'sgd', {
        'learning_rate': 0.001,
        'momentum': 1
    })
    feature_representations = [features(X).asnumpy()]
    # Fix: `epochs // 10` is 0 whenever epochs < 10, which made the modulo
    # below raise ZeroDivisionError; clamp the logging interval to >= 1.
    log_interval = max(1, epochs // 10)
    for e in range(1, epochs + 1):
        cum_loss = 0
        cum_preds = []
        for i, x in enumerate(X_train.flatten()):
            y = array(y_train)[i]
            with autograd.record():
                preds = model(X)[x]
                loss = cross_entropy(preds, y)
            loss.backward()
            trainer.step(1)
            cum_loss += loss.asscalar()
            cum_preds += [preds.asscalar()]
        feature_representations.append(features(X).asnumpy())
        if (e % log_interval) == 0:
            logger.debug(f"Epoch {e}/{epochs} -- Loss: {cum_loss: f}")
            logger.debug(cum_preds)
    return feature_representations
class MLPEstimator(TrainableEstimator):
    """Estimator backed by a small MLP, trained online with Adam on L2 loss."""

    def __init__(
        self,
        input_shape: Sequence[int],
        hidden_dims: Sequence[int],
        hidden_activation: Optional[str],
        lr: float,
    ):
        # Only flat (1-D) inputs are supported; unpacking enforces that.
        input_dim, = input_shape  # must be 1D
        self.model = make_mlp(input_dim, hidden_dims, 1, hidden_activation)
        self.model.initialize()
        self.trainer = Trainer(self.model.collect_params(), mx.optimizer.Adam(lr))
        self.loss_fn = loss.L2Loss()

    def batch_estimate(self, data: np.ndarray) -> np.ndarray:
        """Forward pass only; returns the model output as a numpy array."""
        predictions = self.model(mx.nd.array(data))
        return predictions.asnumpy()

    def batch_estimate_and_update(self, data: np.ndarray, targets: np.ndarray) -> np.ndarray:
        """Predict for ``data`` and take one Adam step toward ``targets``.

        Returns the predictions made *before* the parameter update.
        """
        inputs = mx.nd.array(data)
        wanted = mx.nd.array(targets)
        with mx.autograd.record():
            predictions = self.model(inputs)
            step_loss = self.loss_fn(predictions, wanted)
        step_loss.backward()
        # Gradient is normalized by the number of rows in this batch.
        self.trainer.step(inputs.shape[0])
        return predictions.asnumpy()
def main():
    """Fit a tiny regression net with mini-batch SGD, then print its params."""
    batch_size = 10
    num_epochs = 10
    features, targets = create_data()
    dataset = gdata.ArrayDataset(features, targets)  # type: ArrayDataset
    data_iter = gdata.DataLoader(dataset, batch_size=batch_size, shuffle=True)  # type: DataLoader
    net = create_net()
    net.initialize(init.Normal(sigma=0.01))
    loss = gloss.L2Loss()
    trainer = Trainer(net.collect_params(), "sgd", {"learning_rate": 0.01})
    for epoch in range(num_epochs):
        for batch_x, batch_y in data_iter:
            with autograd.record():
                l = loss(net(batch_x), batch_y)
            l.backward()
            trainer.step(batch_size)
        # Whole-dataset loss, for logging only.
        l = loss(net(features), targets)
        print("[Log] Epoch:%d Loss:%f" % (epoch, l.mean().asnumpy()))
    print(net[0].weight.data())
    print(net[0].bias.data())
def get_Eout(T, M, lr):
    """Train a 1-hidden-layer tanh network for ``T`` rounds; report test loss.

    Reads the module-level ``train_iter``, ``test_iter`` and ``batch_size``.
    Returns ``(Eout, M)`` so callers can tabulate loss per hidden width ``M``.
    """
    model = nn.Sequential()
    for units in (M, 1):
        model.add(nn.Dense(units, activation='tanh'))
    model.initialize(init.Normal(sigma=0.01))
    trainer = Trainer(model.collect_params(), 'sgd', {'learning_rate': lr})
    loss = gloss.L2Loss()
    for step in range(T):
        if step % 1000 == 0:
            print('t:', step)
        for X, y in train_iter:
            with autograd.record():
                l = loss(model(X), y)
            l.backward()
            trainer.step(batch_size)
        # Loss on the last seen batch (kept for parity; not otherwise used).
        l = loss(model(X), y)
    print('训练完成,开始检验')
    for tX, ty in test_iter:
        l = loss(model(tX), ty).mean().asscalar()
    print('Eout;', l)
    return l, M
def fit(nn: nn.Block, xs, ys, batchsize=20, draw_pars=None, drfunc=None):
    """Fit ``nn`` on ``(xs, ys)`` with RMSProp, optionally live-plotting.

    When both ``draw_pars`` and ``drfunc`` are given and the epoch loss drops
    below 10, ``drfunc`` is invoked with matplotlib in interactive mode.
    """
    loader = mxdata.DataLoader(
        mxdata.ArrayDataset(xs, ys), batch_size=batchsize, shuffle=True)
    trainer = Trainer(nn.collect_params(), optimizer="rmsprop",
                      optimizer_params={"learning_rate": 0.001})
    criterion = loss.L2Loss()
    for epoch in range(2000):
        for data, label in loader:
            with ag.record():
                prediction = nn(data)
                batch_loss = criterion(prediction, label).mean()  # type: nd.NDArray
            batch_loss.backward()
            trainer.step(batch_size=batchsize, ignore_stale_grad=True)
        print(f"Loss值:{batch_loss.asscalar()}")
        # Live drawing once the loss is small enough to be interesting.
        if draw_pars is not None and drfunc is not None and batch_loss.asscalar() < 10:
            plt.ion()
            plt.gcf().clear()
            drfunc(draw_pars[4], nn, draw_pars[0], draw_pars[1],
                   draw_pars[2], draw_pars[3])
            plt.pause(0.5)
def main():
    """Train a policy-gradient agent on Acrobot-v1, logging episode stats."""
    tensorboard_writer = TensorboardStatsWriter(save_path='save/logs')
    console_writer = ConsoleStatsWriter(save_path='save/console')
    game = gym.make('Acrobot-v1')
    game._max_episode_steps = 2.5e3
    policy = PolicyGradAgent(observation_space=game.observation_space,
                             action_space=game.action_space,
                             reward_range=game.reward_range,
                             entropy_weight=5e-3,
                             discount=0.995)
    agent = GymAdapter(policy)
    agent.initialize(init.Xavier())
    with agent, tensorboard_writer, console_writer:
        rollout = play(game, agent, render=True, blur_weight=0.99)
        trainer = Trainer(agent.collect_params(), 'adam', dict(learning_rate=5e-3))
        for obs, acts, rews, infos in episodes(rollout):
            with autograd.record():
                loss, stats = agent.loss(obs, acts, rews, infos)
            loss.backward()
            # Normalize by the number of observations in the episode.
            trainer.step(len(obs))
            console_writer.write(stats)
            tensorboard_writer.write(stats)
def train(self, source_path, target_path, lda_path, batch_size=16, epoch_num=15, optimizer='adam', learning_rate=0.01):
    """
    Train the already-defined model.

    :param source_path: directory holding the source-side files
    :param target_path: directory holding the target-side files
    :param lda_path: path of the LDA features fed as an extra model input
    :param batch_size: forced to 1 when ``self.encoder_type == 'parse'``
    :param epoch_num: number of passes over the data
    :param optimizer: optimizer, or an optimizer name (str)
    :param learning_rate: learning rate
    :raises ValueError: when source and target file counts differ
    """
    source_list, target_list = os.listdir(source_path), os.listdir(target_path)
    if len(source_list) != len(target_list):
        raise ValueError('source and target file not match')
    del source_list
    del target_list
    if self.encoder_type == 'parse':
        print('single-pass mode in parse encoder')
        batch_size = 1
    trainer = Trainer(self.model.collect_params(), optimizer,
                      {'learning_rate': learning_rate})
    print('Reading data...')
    data = self._data_generator_one_batch(source_path, target_path, lda_path)
    best_score = 9999
    for epoch in range(epoch_num):
        loss_sum = 0.0
        with tqdm(total=len(data)) as pbar:
            for x, y, lda in data:
                with autograd.record():
                    logits = self.model(x, y, lda)
                    loss = self.sequence_loss(logits, y)
                loss.backward()
                trainer.step(1)
                loss_sum += loss.asscalar()
                pbar.update(1)
                self.global_step += 1
        loss_mean = loss_sum / len(data)
        print('epoch {}, loss:{}'.format(epoch, loss_mean))
        if loss_mean < best_score:
            # Fix: best_score was never updated, so any epoch whose loss beat
            # the initial 9999 overwrote 'best.model'; now only a genuine
            # improvement over the best epoch so far triggers the save.
            best_score = loss_mean
            self.model.collect_params().save('best.model')
def train(self, epochs, wd, params=None, init_epochs=0, bs=4):
    """Run the train/validation loop, tracking smoothed losses and metrics.

    Exponentially smoothed, bias-corrected train/val losses are appended to
    ``self.history[0]`` / ``self.history[1]``; the history is returned.
    """
    trainer = Trainer(self.net.collect_params(params), self.optimizer, {'wd': wd})
    metrics = mx.metric.create(self.metrics)
    self.history = [[], []]
    smooth = 0.98  # momentum of the running loss averages
    train_step, val_step = 1, 1
    train_avg, val_avg = 0., 0.
    for epoch in range(epochs):
        # --- training pass -------------------------------------------------
        for data, label in self.loader[0]:
            data = data.as_in_context(self.ctx)
            label = label.as_in_context(self.ctx)
            with autograd.record():
                output = self.net(data)
                loss = self.criterion(output, label)
            trainer.set_learning_rate(self.scheduler(train_step))
            loss.backward()
            trainer.step(bs)
            train_avg = train_avg * smooth + (1 - smooth) * nd.mean(loss).asscalar()
            # Bias-corrected moving average (same correction as in Adam).
            self.history[0].append(train_avg / (1 - smooth ** train_step))
            train_step += 1
        # --- validation pass ----------------------------------------------
        metrics.reset()
        for data, label in self.loader[1]:
            data = data.as_in_context(self.ctx)
            label = label.as_in_context(self.ctx)
            output = self.net(data)
            loss = self.criterion(output, label)
            val_avg = val_avg * smooth + (1 - smooth) * nd.mean(loss).asscalar()
            self.history[1].append(val_avg / (1 - smooth ** val_step))
            val_step += 1
            metrics.update(preds=output, labels=label)
        status = [init_epochs + epoch + 1,
                  self.history[0][-1], self.history[1][-1]]
        if self.metrics is not None:
            status.append(metrics.get()[1])
        print('{}'.format(status))
    return self.history
def optimize(transform_net, loss_net, contents_images, style_img, weights, epochs):
    """Optimize the style-transfer ``transform_net``.

    contents_images: (num_iter, batchsize, c, h, w)
    style_img: (batchsize=1, c, h, w)
    weights: [content_weight, style_weight, tv_weight]

    Saves the network parameters whenever the per-batch total loss improves
    on the best value seen so far, and logs averaged losses every 50 epochs.
    """
    trainer = Trainer(transform_net.collect_params(), 'adam', {"learning_rate": 0.001})
    _, styles_features = loss_network.extract_features(style_img, loss_net)
    styles_features_gram = loss_network.gram(styles_features)
    # Fix: min_loss was initialized to 0.0, so positive losses could never
    # improve on it and no "best" checkpoint was ever written; start at +inf.
    min_loss = float('inf')
    for i in range(epochs):
        start = time.time()
        num = 0
        total_l_sum, contents_l_sum, styles_l_sum, tv_l_sum = 0.0, 0.0, 0.0, 0.0
        for contents_img in contents_images:
            contents_features, _ = loss_network.extract_features(
                contents_img[0], loss_net)
            with autograd.record():
                res_img = transform_net(contents_img[0])
                contents_features_h, style_features_h = loss_network.extract_features(
                    res_img, loss_net)
                total_l, contents_l, styles_l, tv_l = loss_network.compute_loss(
                    res_img, weights, contents_features_h, style_features_h,
                    contents_features, styles_features_gram)
            total_l.backward()
            trainer.step(1)
            total_l_sum += total_l
            contents_l_sum += contents_l
            styles_l_sum += styles_l
            tv_l_sum += tv_l
            num += 1
            if total_l < min_loss:  # keep whichever weights scored best so far
                min_loss = total_l
                transform_net.save_params("best_params_%d" % (i + 1))
        if (i + 1) % 50 == 0:
            print(
                'epoch %3d, total loss %.2f, content loss %.2f, style loss %.2f, TV loss %.2f, %.2f sec'
                % (i + 1, total_l_sum / num, contents_l_sum / num,
                   styles_l_sum / num, tv_l_sum / num, time.time() - start))
def train(net, train_dataloader, val_dataloader, epochs, context):
    """Train the autoencoder+classifier net, keeping the best-F1 checkpoint."""
    max_f1 = 0
    best_epoch = -1
    trainer = Trainer(net.collect_params(), 'ftml', {'learning_rate': options.lr})
    reconstruction_loss_fn = gluon.loss.L2Loss()
    classification_loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
    for e in range(epochs):
        running_loss = 0
        seen_items = 0
        for i, data in enumerate(train_dataloader):
            seen_items += data[0].shape[0]
            # Every field becomes a float32 column vector on the right device.
            for idx in range(len(data)):
                data[idx] = data[idx].astype(np.float32).reshape((-1, 1)).as_in_context(context)
            inputs, target = data[:-1], data[-1]
            with autograd.record():
                output, decoded = net(*inputs)
                reconstruction = reconstruction_loss_fn(decoded, mx.nd.concat(*inputs, dim=1))
                classification = classification_loss_fn(output, target)
                loss = reconstruction + classification
            loss.backward()
            trainer.step(1)
            running_loss += loss.mean().asscalar()
        train_f1 = evaluate(net, train_dataloader, context)
        val_f1 = evaluate(net, val_dataloader, context)
        print('Epoch [{}]: Train F1 {:.3f}, Val F1 {:.3f}. Train loss {:.6f}'.
              format(e, train_f1, val_f1, running_loss / seen_items))
        if val_f1 > max_f1:
            net.save_parameters('best_model.params')
            max_f1 = val_f1
            best_epoch = e
    print('Best model found on epoch {}, Val F1 {:.3f}'.format(best_epoch, max_f1))
def train(self, n_epoch, verbose=True):
    """Train the RFN, visiting one training city per epoch (round-robin).

    :param n_epoch: number of epochs (one full-city batch each)
    :param verbose: when True, print the mean loss every epoch
    :return: list of per-epoch mean L2 losses
    """
    print("Training Started for RFN")
    trainer = Trainer(self.params, 'adam', {'learning_rate': 0.001})
    criterion = L2Loss()
    history = []
    for epoch in range(1, n_epoch + 1):
        # Cycle through the training cities, one per epoch.
        city = self.train_cities[epoch % len(self.train_cities)]
        with autograd.record():
            prediction = self.rfn(city.X_V, city.X_E, city.X_B,
                                  city.N_node_primal, city.N_edge_primal,
                                  city.N_mask_primal, city.N_node_dual,
                                  city.N_edge_dual, city.N_common_node,
                                  city.N_mask_dual)
            epoch_loss = criterion(prediction, city.y)
        epoch_loss.backward()
        trainer.step(batch_size=len(city.y))
        mean_loss = epoch_loss.mean().asscalar()
        if verbose:
            print(f'Loss at Epoch {epoch}: {mean_loss}')
        history.append(mean_loss)
    return history
def train_model():
    """Train the base net with a joint sigmoid-BCE + triplet objective.

    Iterates the plain training loader and the triplet loader in lockstep and
    back-propagates both losses through the shared ``base_net``.
    """
    epochs = 5
    configs = get_configs()
    is_gpu = configs['is_gpu']
    batch_size = configs['batch_size']
    ctx = get_context(is_gpu)
    print("gpu: {}, batch_size: {}".format(is_gpu, batch_size))
    base_net = get_base_net(ctx=ctx)
    trainer = Trainer(base_net.collect_params(), 'rmsprop', {'learning_rate': 1e-3})
    bc_loss = SigmoidBinaryCrossEntropyLoss()
    triplet_loss = TripletLoss(margin=0)
    train_data = get_train_data(batch_size=batch_size)  # train data
    triplet_train_data = get_triplet_train_data(batch_size=batch_size)  # triplet training data
    for epoch in range(epochs):
        train_loss = 0  # running training loss (BCE part only)
        total_right, total_all = 0, 0
        for i, (batch, tp_batch) in enumerate(zip(train_data, triplet_train_data)):
            data, labels = batch[0], batch[1].astype('float32')
            tp_data, tp_labels = tp_batch[0], tp_batch[1].astype('float32')
            # print(data.shape, labels.shape)
            # print(tp_data.shape, tp_labels.shape)
            data = data.as_in_context(context=ctx)
            labels = labels.as_in_context(context=ctx)
            tp_data = tp_data.as_in_context(context=ctx)
            tp_labels = tp_labels.as_in_context(context=ctx)
            # Move the triplet axis to the front so the anchor/positive/
            # negative slices can be taken below.
            tp_data = mx.nd.transpose(tp_data, (1, 0, 2, 3, 4))
            tp_labels = mx.nd.transpose(tp_labels, (1, 0, 2))
            # print(tp_data.shape, tp_labels.shape)
            anc_ins, pos_ins, neg_ins = tp_data[0, :], tp_data[1, :], tp_data[2, :]
            # print(anc_ins.shape, pos_ins.shape, neg_ins.shape)
            with autograd.record():
                outputs = base_net(data)
                v_bc_loss = bc_loss(outputs, labels)
                inter1 = base_net(anc_ins)
                inter2 = base_net(pos_ins)
                inter3 = base_net(neg_ins)
                v_triplet_loss = triplet_loss(inter1, inter2, inter3)
            # cross-entropy and triplet losses back-propagated together
            autograd.backward([v_bc_loss, v_triplet_loss])
            trainer.step(batch_size)
            print('bc: {}, triplet: {}'.format(np.sum(v_bc_loss.asnumpy()), np.sum(v_triplet_loss.asnumpy())))
            train_loss += v_bc_loss.mean().asscalar()
            acc, nr, na = get_batch_acc(outputs, labels)
            total_right += nr
            total_all += na
            if i != 0:  # batch 0 doesn't have train_loss.
                print('batch: %s, loss: %s, acc: %s' % (i, train_loss / i, acc))
            else:
                print('batch: %s' % i)
        train_loss /= len(train_data)
        print('epoch: %s, loss: %s, acc: %s' % (epoch, train_loss, total_right / total_all))
# NOTE(review): fragment of a GAN training loop — a discriminator update
# (real minus fake critic scores) followed by a generator update.  The
# enclosing function/loop header and the tail of the final print statement
# lie outside this chunk, so the code is kept verbatim.
data = data.as_in_context(context) i += 1 # train with real batch_size = data.shape[0] with autograd.record(): errD_real = netD(data) # train with fake noise = mx.ndarray.random.normal(shape=(opt.batchSize, nz, 1, 1), ctx=context) fake = netG(noise) errD_fake = netD(fake.detach()) errD = errD_real - errD_fake errD.backward() trainerD.step(1) ############################ # (2) Update G network ########################### # in case our last batch was the tail batch of the dataloader, # make sure we feed a full batch of noise noise = mx.ndarray.random.normal(shape=(opt.batchSize, nz, 1, 1), ctx=context) with autograd.record(): fake = netG(noise) errG = netD(fake) errG.backward() trainerG.step(1) gen_iterations += 1 print('[%d/%d][%d/%d][%d] Loss_D: %f Loss_G: %f Loss_D_real: %f Loss_D_fake %f'
def train(
    backbone,
    root_dir,
    train_index_fp,
    pretrain_model,
    optimizer,
    epochs=50,
    lr=0.001,
    wd=5e-4,
    momentum=0.9,
    batch_size=4,
    ctx=mx.cpu(),
    verbose_step=5,
    output_dir='ckpt',
):
    """Train a PSENet text detector, logging progress to TensorBoard.

    :param backbone: base-net name; also the checkpoint/log sub-directory
    :param root_dir: dataset root directory
    :param train_index_fp: path of the training index file
    :param pretrain_model: optional parameter file to warm-start from
    :param optimizer: optimizer name; ``momentum`` is dropped for 'adam'
    :param epochs: number of epochs
    :param lr: base learning rate (x0.1 at 1/3 and 2/3 of the schedule)
    :param wd: weight decay
    :param momentum: SGD momentum
    :param batch_size: per-device batch size (scaled by the device count)
    :param ctx: a context or list of contexts to train on
    :param verbose_step: TensorBoard scalar logging interval, in batches
    :param output_dir: root directory for checkpoints and summaries
    """
    output_dir = os.path.join(output_dir, backbone)
    os.makedirs(output_dir, exist_ok=True)
    num_kernels = 3
    dataset = StdDataset(root_dir=root_dir, train_idx_fp=train_index_fp,
                         num_kernels=num_kernels - 1)
    if not isinstance(ctx, (list, tuple)):
        ctx = [ctx]
    batch_size = batch_size * len(ctx)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    net = PSENet(base_net_name=backbone, num_kernels=num_kernels, ctx=ctx,
                 pretrained=True)
    # initial params
    net.initialize(mx.init.Xavier(), ctx=ctx)
    # Re-initialize the non-pretrained extra/decoder layers explicitly.
    net.collect_params("extra_.*_weight|decoder_.*_weight").initialize(
        mx.init.Xavier(), ctx=ctx, force_reinit=True)
    net.collect_params("extra_.*_bias|decoder_.*_bias").initialize(
        mx.init.Zero(), ctx=ctx, force_reinit=True)
    if pretrain_model is not None:
        net.load_parameters(pretrain_model, ctx=ctx, allow_missing=True,
                            ignore_extra=True)
    # pse_loss = DiceLoss(lam=0.7, num_kernels=num_kernels)
    pse_loss = DiceLoss_with_OHEM(lam=0.7, num_kernels=num_kernels, debug=False)
    # lr_scheduler = ls.PolyScheduler(
    #     max_update=icdar_loader.length * epochs // batch_size, base_lr=lr
    # )
    max_update = len(dataset) * epochs // batch_size
    lr_scheduler = ls.MultiFactorScheduler(
        base_lr=lr, step=[max_update // 3, max_update * 2 // 3], factor=0.1)
    optimizer_params = {
        'learning_rate': lr,
        'wd': wd,
        'momentum': momentum,
        'lr_scheduler': lr_scheduler,
    }
    if optimizer.lower() == 'adam':
        # Adam takes no momentum argument.
        optimizer_params.pop('momentum')
    trainer = Trainer(net.collect_params(), optimizer=optimizer,
                      optimizer_params=optimizer_params)
    summary_writer = SummaryWriter(output_dir)
    for e in range(epochs):
        cumulative_loss = 0
        num_batches = 0
        for i, item in enumerate(loader):
            # Split every field of the batch across the available devices.
            item_ctxs = [split_and_load(field, ctx) for field in item]
            loss_list = []
            for im, gt_text, gt_kernels, training_masks, ori_img in zip(
                    *item_ctxs):
                # Ground truths subsampled x4 — presumably to match the
                # prediction resolution; confirm against PSENet's output size.
                gt_text = gt_text[:, ::4, ::4]
                gt_kernels = gt_kernels[:, :, ::4, ::4]
                training_masks = training_masks[:, ::4, ::4]
                with autograd.record():
                    kernels_pred = net(im)  # index 0 is the prediction for the complete text
                    loss = pse_loss(gt_text, gt_kernels, kernels_pred,
                                    training_masks)
                    loss_list.append(loss)
            # Backward per device, then average the scalar losses on CPU.
            mean_loss = []
            for loss in loss_list:
                loss.backward()
                mean_loss.append(mx.nd.mean(to_cpu(loss)).asscalar())
            mean_loss = np.mean(mean_loss)
            trainer.step(batch_size)
            if i % verbose_step == 0:
                global_steps = dataset.length * e + i * batch_size
                summary_writer.add_scalar('loss', mean_loss, global_steps)
                summary_writer.add_scalar(
                    'c_loss',
                    mx.nd.mean(to_cpu(pse_loss.C_loss)).asscalar(),
                    global_steps,
                )
                summary_writer.add_scalar(
                    'kernel_loss',
                    mx.nd.mean(to_cpu(pse_loss.kernel_loss)).asscalar(),
                    global_steps,
                )
                summary_writer.add_scalar('pixel_accuracy', pse_loss.pixel_acc,
                                          global_steps)
            if i % 1 == 0:
                logger.info(
                    "step: {}, lr: {}, "
                    "loss: {}, score_loss: {}, kernel_loss: {}, pixel_acc: {}, kernel_acc: {}"
                    .format(
                        i * batch_size,
                        trainer.learning_rate,
                        mean_loss,
                        mx.nd.mean(to_cpu(pse_loss.C_loss)).asscalar(),
                        mx.nd.mean(to_cpu(pse_loss.kernel_loss)).asscalar(),
                        pse_loss.pixel_acc,
                        pse_loss.kernel_acc,
                    ))
            cumulative_loss += mean_loss
            num_batches += 1
        summary_writer.add_scalar('mean_loss_per_epoch',
                                  cumulative_loss / num_batches, global_steps)
        logger.info("Epoch {}, mean loss: {}\n".format(
            e, cumulative_loss / num_batches))
        net.save_parameters(
            os.path.join(output_dir, model_fn_prefix(backbone, e)))
        # Dump GT and prediction images of the last processed shard.
        summary_writer.add_image('complete_gt', to_cpu(gt_text[0:1, :, :]),
                                 global_steps)
        summary_writer.add_image('complete_pred',
                                 to_cpu(kernels_pred[0:1, 0, :, :]),
                                 global_steps)
        summary_writer.add_images(
            'kernels_gt',
            to_cpu(gt_kernels[0:1, :, :, :]).reshape(-1, 1, 0, 0),
            global_steps,
        )
        summary_writer.add_images(
            'kernels_pred',
            to_cpu(kernels_pred[0:1, 1:, :, :]).reshape(-1, 1, 0, 0),
            global_steps,
        )
    summary_writer.close()
# NOTE(review): fragment of a point-cloud classification train/eval epoch
# (points transposed to channel-first before the classifier).  The enclosing
# epoch loop and the tail of the last accumulation expression are outside
# this chunk, so the code is kept verbatim.
correct = 0. L = 0. count = 0 L_eval = 0. correct_eval = 0. count_eval = 0 ## training for i, data in enumerate(dataloader): points, target = data points = points.transpose((0, 2, 1)) with ag.record(): pred, _ = classifier(points.as_in_context(ctx)) loss = L_loss(pred, target.as_in_context(ctx)) loss.backward() optimizer.step(batch_size=opt.batchSize) pred_choice = pred.argmax(1) correct += (target[:, 0] == pred_choice.as_in_context( mx.cpu())).sum().asscalar() L += loss.mean().asscalar() count += 1 # logger.info('[epoch: %d] train loss: %f accuracy: %f' %(epoch, L / count, correct/ float(len(dataset)))) ## evaluating for j, data in enumerate(testdataloader): points, target = data points = points.transpose((0, 2, 1)) pred, _ = classifier(points.as_in_context(ctx)) loss = L_loss(pred, target.as_in_context(ctx)) pred_choice = pred.argmax(1) correct_eval += (target[:, 0] == pred_choice.as_in_context(
def get_manifold(X):
    """Reduce DataFrame ``X`` to a low-dimensional manifold via a stacked AE.

    Builds a 4-stage tanh autoencoder, greedily pretrains it stage by stage,
    fine-tunes the whole stack, and returns the innermost-encoder activations
    for every row of ``X`` as a pandas DataFrame.

    NOTE(review): relies on module-level ``epochs`` (a per-stage sequence) and
    ``batch_size`` — confirm these are defined by the enclosing module.
    """
    from mxnet import nd, Context
    from mxnet import ndarray as F
    from mxnet.gluon import Block, nn
    from mxnet.initializer import Uniform

    class Model(Block):
        # Symmetric stacked autoencoder; each encoder compresses x4.
        def __init__(self, num_dim, **kwargs):
            super(Model, self).__init__(**kwargs)
            wi1 = Uniform(0.25)
            wi2 = Uniform(0.1)
            with self.name_scope():
                self.encoder1 = nn.Dense(num_dim//4, in_units=num_dim, weight_initializer=wi1)
                self.encoder2 = nn.Dense(num_dim//16, in_units=num_dim//4, weight_initializer=wi1)
                self.encoder3 = nn.Dense(num_dim//64, in_units=num_dim//16, weight_initializer=wi2)
                self.encoder4 = nn.Dense(num_dim//256, in_units=num_dim//64, weight_initializer=wi2)
                self.decoder4 = nn.Dense(num_dim//64, in_units=num_dim//256, weight_initializer=wi2)
                self.decoder3 = nn.Dense(num_dim//16, in_units=num_dim//64, weight_initializer=wi2)
                self.decoder2 = nn.Dense(num_dim//4, in_units=num_dim//16, weight_initializer=wi1)
                self.decoder1 = nn.Dense(num_dim, in_units=num_dim//4, weight_initializer=wi1)
            # (encoder, decoder) pairs, ordered outermost to innermost.
            self.layers = [(self.encoder1,self.decoder1),
                           (self.encoder2,self.decoder2),
                           (self.encoder3,self.decoder3),
                           (self.encoder4,self.decoder4)]
            for layer in self.layers:
                self.register_child(layer[0])
                self.register_child(layer[1])

        def onelayer(self, x, layer):
            # One encode/decode round trip, used during greedy pretraining.
            xx = F.tanh(layer[0](x))
            #xx = nn.HybridLambda('tanh')(layer[0](x))
            return layer[1](xx)

        def oneforward(self, x, layer):
            # Encode only: produces the input of the next pretraining stage.
            return F.tanh(layer[0](x))

        def forward(self, x):
            # Full pass: encode through all stages, then decode back out.
            n_layer = len(self.layers)
            for i in range(n_layer):
                x = F.tanh(self.layers[i][0](x))
            for i in range(n_layer-1):
                x = F.tanh(self.layers[n_layer-i-1][1](x))
            return self.layers[0][1](x)

        def manifold(self, x):
            # Innermost representation (no tanh after the last encoder).
            n_layer = len(self.layers)
            for i in range(n_layer-1):
                x = F.tanh(self.layers[i][0](x))
            return self.layers[n_layer-1][0](x)

    from mxnet import autograd
    from mxnet import gpu, cpu
    from mxnet.gluon import Trainer
    from mxnet.gluon.loss import L2Loss
    # Stacked AutoEncoder
    #model.initialize(ctx=[cpu(0),cpu(1),cpu(2),cpu(3)])
    #ctx = [gpu(1)]
    #ctx = [cpu(i) for i in range(16)]
    with Context(gpu(0)) as ctx:
        model = Model(X.shape[1])
        model.initialize(ctx=ctx)#,cpu(2),cpu(3)])
        # Select Trainign Algorism
        trainer = Trainer(model.collect_params(),'adam')
        loss_func = L2Loss()
        # Start Pretraining: train each (encoder, decoder) pair on the
        # representation produced by the stages before it.
        print('start pretraining of StackedAE...')
        loss_n = [] # for log
        buffer = nd.array(X.values)
        for layer_id, layer in enumerate(model.layers):
            print('layer %d of %d...'%(layer_id+1,len(model.layers)))
            trainer.set_learning_rate(0.02)
            for epoch in range(1, epochs[layer_id] + 1):
                # random indexs for all datas
                indexs = np.random.permutation(buffer.shape[0])
                for bs in range(0,buffer.shape[0],batch_size):
                    be = min(buffer.shape[0],bs+batch_size)
                    data = buffer[indexs[bs:be]]
                    # forward
                    with autograd.record():
                        output = model.onelayer(data, layer)
                        # make loss
                        loss = loss_func(output, data)
                        # for log
                        loss_n.append(np.mean(loss.asnumpy()))
                        del output
                    # backward
                    loss.backward()
                    # step training to one batch
                    trainer.step(batch_size, ignore_stale_grad=True)
                    del data, loss
                # show log
                print('%d/%d epoch loss=%f...'%(epoch,epochs[layer_id],np.mean(loss_n)))
                loss_n = []
                del bs, be, indexs
            # Feed the next stage with this stage's encoded output.
            buffer = model.oneforward(buffer, layer)
        del layer, loss_n, buffer
        # Fine-tune the whole stack end to end.
        print('start training of StackedAE...')
        loss_n = []
        buffer = nd.array(X.values)
        trainer.set_learning_rate(0.02)
        for epoch in range(1, epochs[-1] + 1):
            # random indexs for all datas
            indexs = np.random.permutation(buffer.shape[0])
            for bs in range(0,buffer.shape[0],batch_size):
                be = min(buffer.shape[0],bs+batch_size)
                data = buffer[indexs[bs:be]]
                # forward
                with autograd.record():
                    output = model(data)
                    # make loss
                    loss = loss_func(output, data)
                    # for log
                    loss_n.append(np.mean(loss.asnumpy()))
                    del output
                # backward
                loss.backward()
                # step training to one batch
                trainer.step(batch_size, ignore_stale_grad=True)
                del data, loss
            # show log
            print('%d/%d epoch loss=%f...'%(epoch,epochs[-1],np.mean(loss_n)))
            loss_n = []
            del bs, be, indexs
        del trainer, loss_func, loss_n, buffer
        # Project every row of X through the trained encoders, in batches.
        print('making manifold...')
        manifold_X = pd.DataFrame()
        for bs in range(0,X.shape[0],batch_size):
            be = min(X.shape[0],bs + batch_size)
            nx = nd.array(X.iloc[bs:be].values)
            df = pd.DataFrame(model.manifold(nx).asnumpy())
            manifold_X = manifold_X.append(df, ignore_index=True, sort=False)
            del be, df, nx
        del model, bs
    return manifold_X
# NOTE(review): fragment of a Faster R-CNN knowledge-distillation training
# step — combined hard/soft loss backward, two optimizer steps, periodic
# checkpointing and TensorBoard logging.  It starts mid-expression (the
# enclosing loop and several loss definitions are outside this chunk), so it
# is kept verbatim.
rcnn_loss2_soft = (rcnn_box_loss(box_preds_soft, box_targets_soft, box_masks_soft) * box_preds_soft.size / box_preds_soft.shape[0] / num_rcnn_pos_soft) + teacher_bounded_regression_loss(box_targets, box_targets_soft, box_preds_soft) # compute backward gradient autograd.backward([rpn_loss1, rpn_loss2, rcnn_loss1, rcnn_loss2, rpn_loss1_hard, rpn_loss2_hard, rcnn_loss1_hard, rcnn_loss1_soft, rcnn_loss2_soft]) mu = 0.5 # balancing coefficient loss.append([rpn_loss1.asnumpy().item(), rpn_loss2.asnumpy().item(), rcnn_loss1.asnumpy().item(), rcnn_loss2.asnumpy().item(), rpn_loss1_hard.asnumpy().item(), rpn_loss2_hard.asnumpy().item(), mu*rcnn_loss1_hard.asnumpy().item() + (1-mu)*rcnn_loss1_soft.asnumpy().item(), rcnn_loss2_soft.asnumpy().item()]) # make an optimization step trainer.step(batch_size=1) distil_trainer.step(batch_size=1) end = time.time() # save the models each 500 image if ((batch_idx+1) % 500) == 0: student.save_parameters(f'params/model_{batch_idx}.params') distil_student.save_parameters(f'params/model_distil_{batch_idx}.params') # display the loss metrics loss = np.mean(loss, axis=0).tolist() print(f'batch: {batch_idx} | loss no distil: {sum(loss[:4])} | loss distil: {sum(loss[4:])} | time: {end-start}') # add the loss metrics to tensorboard writer.add_scalar('RPN/Classification loss no distil', loss[0], batch_idx) writer.add_scalar('RPN/Classification loss distil', loss[4], batch_idx)
class CGANTrainer:
    """Trainer wiring a conditional-GAN generator/discriminator pair."""

    def __init__(self, opt, train_dataset, **networks):
        """
        :param opt: global options
        :param train_dataset: dataset for training GAN G and D
        :param networks: GAN G and D
        """
        self.opt = opt
        self.ctx = try_gpu()  #try_gpus(self.opt.gpus)
        self.iter = 0  # global batch counter across epochs
        self.dataloader = train_dataset
        self.netG = networks['netG']
        self.netD = networks['netD']

    def _init_networks(self):
        # Xavier-init both nets, then run one dummy forward pass so the
        # deferred-initialization shapes get resolved before training.
        # self.netG.initialize(init.Orthogonal(scale=self.opt.init_gain), ctx=self.ctx)
        # self.netD.initialize(init.Orthogonal(scale=self.opt.init_gain), ctx=self.ctx)
        # init_z = nd.ones(shape=(self.opt.batch_size, self.opt.z_dim), ctx=self.ctx)
        # init_label = nd.ones(shape=(self.opt.batch_size, 1), ctx=self.ctx)
        self.netG.initialize(init.Xavier(factor_type='in', magnitude=0.01), ctx=self.ctx)
        self.netD.initialize(init.Xavier(factor_type='in', magnitude=0.01), ctx=self.ctx)
        init_z = nd.random.uniform(-1, 1, shape=(self.opt.batch_size, self.opt.z_dim), ctx=self.ctx)
        init_label = nd.random.uniform(-1, 1, shape=(self.opt.batch_size, 1), ctx=self.ctx)
        gen_img = self.netG(init_z, init_label)
        _ = self.netD(gen_img, init_label)

    def _define_loss(self):
        # Non-saturating GAN loss on sigmoid logits.
        self.loss_f = loss.SigmoidBinaryCrossEntropyLoss()

    def _define_optimizers(self):
        # Separate Adam optimizers for G and D, sharing hyper-parameters.
        self.trainerG = Trainer(self.netG.collect_params(),
                                optimizer='adam',
                                optimizer_params={
                                    'learning_rate': self.opt.lr,
                                    'beta1': self.opt.beta1,
                                    'beta2': self.opt.beta2
                                })
        self.trainerD = Trainer(self.netD.collect_params(),
                                optimizer='adam',
                                optimizer_params={
                                    'learning_rate': self.opt.lr,
                                    'beta1': self.opt.beta1,
                                    'beta2': self.opt.beta2
                                })

    def train(self):
        """Entry of Training process."""
        print("Random Seed: ", self.opt.manualSeed)
        random.seed(self.opt.manualSeed)
        mx.random.seed(self.opt.manualSeed)
        # initialize netGs, netDs
        self._init_networks()
        # define loss functions
        self._define_loss()
        # optimizers
        self._define_optimizers()
        print("Start training ...")
        # Discriminator targets: 1 for real samples, 0 for fakes.
        self.real_mask = nd.ones(shape=(self.opt.batch_size, ), ctx=self.ctx)
        self.fake_mask = nd.zeros(shape=(self.opt.batch_size, ), ctx=self.ctx)
        for epoch in range(self.opt.num_epochs):
            self._train_on_epoch(epoch)
            self._do_checkpoints(epoch)

    def _train_on_epoch(self, epoch):
        """
        train on one epoch (and do some checkpoint operations)
        :param epoch:
        :return:
        """
        for i, (real, label) in enumerate(self.dataloader):
            self.real_img = real.as_in_context(self.ctx)
            self.label = label.as_in_context(self.ctx)
            batch_start = time.time()
            self._train_on_batch()
            batch_time = time.time() - batch_start
            self._monitor_on_batch(batch_time)
            self.iter += 1

    @ex.capture
    def _monitor_on_batch(self, batch_time, _log, _run):
        # Per-batch loss logging (sacred captures _log and _run).
        _log.info(f"loss D: {self.loss_D.asnumpy()[0]:.4f}\t"
                  f"loss G: {self.loss_G.asnumpy()[0]:.4f}\t"
                  f"time: {batch_time:.2f}s")
        _run.log_scalar("loss D", self.loss_D.asnumpy()[0], self.iter)
        _run.log_scalar("loss G", self.loss_G.asnumpy()[0], self.iter)

    def _do_checkpoints(self, epoch):
        # do checkpoints
        # self.netG.save_parameters('{0}/netG_epoch_{1}.param'.format(self.opt.experiment, epoch))
        # self.netD.save_parameters('{0}/netD_epoch_{1}.param'.format(self.opt.experiment, epoch))
        if self.iter % 100 == 0:
            save_images(
                self.real_img.asnumpy().transpose(0, 2, 3, 1),
                '{0}/real_samples_{1}.png'.format(self.opt.experiment, epoch))
            save_images(
                self.gen_img.asnumpy().transpose(0, 2, 3, 1),
                '{0}/fake_samples_{1}.png'.format(self.opt.experiment, epoch))

    def _train_on_batch(self):
        """Calculate losses, gradients, and update network weights; called in every training iteration.

        1. Forward pass: Cal predictions and losses
        2. Backward pass: Cal gradients
        3. Update parameters
        """
        ############################
        # Update D network
        #
        # From D perspective, the goal is to:
        # maximize log(D(real_image)) + log(1 - D(fake_image))
        ############################
        with autograd.record():
            z = nd.random.normal(0, 1, shape=(self.opt.batch_size, self.opt.z_dim), ctx=self.ctx)
            gen_img = self.netG(z, self.label)
            # detach() keeps D's gradients from flowing into G here.
            fake_pred = self.netD(gen_img.detach(), self.label)  # negative samples for D
            real_pred = self.netD(self.real_img, self.label)  # positive samples for D
            loss_D_real = self.loss_f(real_pred, self.real_mask)
            loss_D_fake = self.loss_f(fake_pred, self.fake_mask)
            self.loss_D = 0.5 * (loss_D_real + loss_D_fake)
        self.loss_D.backward()
        self.trainerD.step(1)
        ############################
        # Update G network
        #
        # From G perspective, the goal is to:
        # maximize log(D(fake_image))
        ############################
        with autograd.record():
            z = nd.random.normal(0, 1, shape=(self.opt.batch_size, self.opt.z_dim), ctx=self.ctx)
            labels = nd.random.randint(0, self.opt.num_classes, (self.opt.batch_size, ), ctx=self.ctx)
            self.gen_img = self.netG(z, labels)
            fake_pred = self.netD(self.gen_img, labels)
            self.loss_G = self.loss_f(fake_pred, self.real_mask)
        self.loss_G.backward()
        self.trainerG.step(1)
def train(netG, netD, dataloader, opt):
    '''
    Entry of Training process (WGAN-style: the discriminator's weights are
    clamped to a cube and it is trained several iterations per G update).
    :return:
    '''
    print("Random Seed: ", opt.manualSeed)
    random.seed(opt.manualSeed)
    mx.random.seed(opt.manualSeed)
    ctx = try_gpu()
    print("ctx: ", ctx)
    # initialize netG, netD
    netG.initialize(init.Xavier(factor_type='in', magnitude=0.01), ctx=ctx)
    custom_init_weights(netG.base)
    if opt.netG != '':  # load checkpoint if needed
        netG.load_parameters(opt.netG)
    print(netG)
    netD.initialize(mx.init.Xavier(factor_type='in', magnitude=0.01), ctx=ctx)
    if opt.netD != '':
        netD.load_parameters(opt.netD)
    print(netD)
    # A pass forward to initialize netG, netD (because of defered initialization)
    init_x = nd.array(np.ones(shape=(opt.batchSize, opt.nz, 1, 1)), ctx=ctx)  # batchsize=8, nz=100
    init_x = netG(init_x)
    _ = netD(init_x)
    # optimizer settings
    trainer_G = Trainer(netG.collect_params(),
                        optimizer='adam',
                        optimizer_params={
                            'learning_rate': opt.lrG,
                            'beta1': opt.beta1,
                            'beta2': 0.999
                        })
    trainer_D = Trainer(netD.collect_params(),
                        optimizer='adam',
                        optimizer_params={
                            'learning_rate': opt.lrD,
                            'beta1': opt.beta1,
                            'beta2': 0.999
                        })
    print("Start training ...")
    #input = mx.nd.zeros((opt.batchSize, 3, opt.imageSize, opt.imageSize))
    #noise = mx.nd.zeros((opt.batchSize, opt.nz, 1, 1))
    # Fixed noise vector so the periodic sample images are comparable.
    fixed_noise = mx.ndarray.random.normal(shape=(opt.batchSize, opt.nz, 1, 1))
    sw = SummaryWriter(logdir='./logs', flush_secs=5)
    gen_iterations = 0
    for epoch in range(opt.num_iter):
        data_iter = iter(dataloader)
        i = 0
        while i < len(dataloader):
            start_time = time.time()
            ############################
            # (1) Update D network
            ###########################
            # train the discriminator Diters times
            if gen_iterations < 25 or gen_iterations % 500 == 0:
                # Extra D iterations early on and periodically thereafter.
                Diters = 100
            else:
                Diters = opt.Diters
            j = 0
            while j < Diters and i < len(dataloader):
                j += 1
                # clamp parameters to a cube
                for p in netD.collect_params():
                    param = netD.collect_params(p)[p]
                    param.set_data(
                        mx.nd.clip(param.data(), opt.clamp_lower, opt.clamp_upper))
                data = next(data_iter)[0]
                data = data.as_in_context(ctx)
                i += 1
                with autograd.record():
                    # train with real
                    errD_real = netD(data)
                    # train with fake
                    noise = mx.ndarray.random.normal(
                        shape=(opt.batchSize, opt.nz, 1, 1), ctx=ctx)
                    fake = netG(noise)
                    errD_fake = netD(fake.detach())
                    errD = errD_real - errD_fake
                errD.backward()
                trainer_D.step(1)
            ############################
            # (2) Update G network
            ###########################
            # in case our last batch was the tail batch of the dataloader,
            # make sure we feed a full batch of noise
            noise = mx.ndarray.random.normal(
                shape=(opt.batchSize, opt.nz, 1, 1), ctx=ctx)
            with autograd.record():
                fake = netG(noise)
                errG = netD(fake)
            errG.backward()
            trainer_G.step(1)
            gen_iterations += 1
            print(
                '[%d/%d][%d/%d][%d] Loss_D: %f Loss_G: %f Loss_D_real: %f Loss_D_fake %f, time:[%f]'
                % (epoch, opt.num_iter, i, len(dataloader), gen_iterations,
                   errD.asnumpy()[0], errG.asnumpy()[0],
                   errD_real.asnumpy()[0], errD_fake.asnumpy()[0],
                   time.time() - start_time))
            sw.add_scalar(tag='loss_D', value=-errD.asnumpy()[0],
                          global_step=gen_iterations)
            if gen_iterations % 500 == 0:
                real_imgs = data * 0.5 + 0.5  # data are normalized (mean=0.5, std=0.5)in Dataset.Transforms
                save_images(
                    real_imgs.asnumpy().transpose(0, 2, 3, 1),
                    '{0}/real_samples{1}.png'.format(opt.experiment,
                                                     gen_iterations))
                fake = netG(fixed_noise.as_in_context(ctx))
                fake = fake * 0.5 + 0.5
                save_images(
                    fake.asnumpy().transpose(0, 2, 3, 1),
                    '{0}/fake_samples_{1}.png'.format(opt.experiment,
                                                      gen_iterations))
        # do checkpointing
        netG.save_parameters('{0}/netG_epoch_{1}.param'.format(
            opt.experiment, epoch))
        netD.save_parameters('{0}/netD_epoch_{1}.param'.format(
            opt.experiment, epoch))
def train_model_for_tl(self):
    """
    Train the Triplet Loss model.

    Attaches a ``hash_num``-dim Dense head to the base network, then trains
    with TripletLoss over (anchor, positive, negative) triplets; each batch
    element X is assumed to stack the three instances along dim 0 of b_X
    (b_X[0]=anchor, b_X[1]=positive, b_X[2]=negative) — TODO confirm against
    get_tl_train_data().

    :return: None (network/params are saved per epoch via save_net_and_params)
    """
    net_path = os.path.join(
        DATA_DIR, 'model', 'epoch-24-0.54-20180920182658.params-symbol.json')
    params_path = os.path.join(
        DATA_DIR, 'model', 'epoch-24-0.54-20180920182658.params-0024.params')
    hash_num = 128  # embedding dimensionality of the new output head

    # base_net = gluon.nn.SymbolBlock.imports(net_path, ['data'], params_path)
    base_net = self.get_base_net()
    with base_net.name_scope():
        base_net.output = Dense(units=hash_num)  # fully-connected output layer
    base_net.output.initialize(Xavier(), ctx=self.ctx)  # init only the new head
    base_net.collect_params().reset_ctx(self.ctx)
    base_net.hybridize()

    train_data, train_len = self.get_tl_train_data(self.batch_size)
    val_data, val_len = self.get_tl_val_data(self.batch_size)
    self.print_info("Triplet Loss 训练样本数: {}".format(train_len))
    self.print_info("Triplet Loss 验证样本数: {}".format(val_len))

    triplet_loss = gluon.loss.TripletLoss(margin=10.0)
    trainer = Trainer(base_net.collect_params(), 'rmsprop',
                      {'learning_rate': 1e-4})

    for epoch in range(self.epochs):
        e_loss, final_i = 0, 0  # epoch loss accumulator and batch counter
        for i, batch in enumerate(train_data):
            data, labels = batch[0], batch[1].astype('float32')
            # Split across devices; labels are split the same way but are not
            # used by the triplet objective below.
            data = split_and_load(data,
                                  ctx_list=self.ctx,
                                  batch_axis=0,
                                  even_split=False)
            labels = split_and_load(labels,
                                    ctx_list=self.ctx,
                                    batch_axis=0,
                                    even_split=False)
            data_loss = []
            with autograd.record():  # record gradients
                for X in data:
                    # Re-pack the per-sample triplets into three batched tensors.
                    anchor_ins, pos_ins, neg_ins = [], [], []
                    for b_X in X:
                        anchor_ins.append(nd.expand_dims(b_X[0], axis=0))
                        pos_ins.append(nd.expand_dims(b_X[1], axis=0))
                        neg_ins.append(nd.expand_dims(b_X[2], axis=0))
                    anchor_ins = nd.concatenate(anchor_ins, axis=0)
                    pos_ins = nd.concatenate(pos_ins, axis=0)
                    neg_ins = nd.concatenate(neg_ins, axis=0)
                    # Embed all three with the shared base network.
                    inter1 = base_net(anchor_ins)
                    inter2 = base_net(pos_ins)
                    inter3 = base_net(neg_ins)
                    loss = triplet_loss(inter1, inter2, inter3)  # TripletLoss
                    data_loss.append(loss)
            # One backward per device slice.
            for l in data_loss:
                l.backward()
            curr_loss = np.mean(
                [mx.nd.mean(loss).asscalar() for loss in data_loss])
            self.print_info("batch: {}, loss: {}".format(i, curr_loss))
            e_loss += curr_loss
            final_i = i + 1
            trainer.step(self.batch_size)
        self.print_info("epoch: {}, loss: {}".format(
            epoch, safe_div(e_loss, final_i)))
        dist_acc = self.evaluate_net(base_net, val_data)  # evaluate this epoch
        self.save_net_and_params(base_net, epoch, dist_acc,
                                 name='tripletloss')  # persist the network
def train(hyperparameters, channel_input_dirs, num_gpus, hosts):
    """SageMaker-style training entry point for a small CNN classifier.

    Loads pickled train/validation datasets from the 'training' channel,
    builds a 2-conv-layer network, and trains it with Adam + softmax
    cross-entropy, printing per-epoch smoothed loss and accuracies.

    :param hyperparameters: dict; honors 'batch_size' (default 64) and
        'epochs' (default 3)
    :param channel_input_dirs: dict mapping channel name -> directory;
        must contain 'training'
    :param num_gpus: number of GPUs available (0 -> CPU)
    :param hosts: list of training hosts; >1 selects a distributed kvstore
    :return: the trained network
    """
    batch_size = hyperparameters.get("batch_size", 64)
    epochs = hyperparameters.get("epochs", 3)
    mx.random.seed(42)
    training_dir = channel_input_dirs['training']
    # Use a neutral handle name instead of shadowing the `pickle` module.
    with open("{}/train/data.p".format(training_dir), "rb") as f:
        train_nd = load(f)
    with open("{}/validation/data.p".format(training_dir), "rb") as f:
        validation_nd = load(f)
    train_data = gluon.data.DataLoader(train_nd, batch_size, shuffle=True)
    validation_data = gluon.data.DataLoader(validation_nd,
                                            batch_size,
                                            shuffle=True)
    net = Sequential()
    # http: // gluon.mxnet.io / chapter03_deep - neural - networks / plumbing.html
    # What's-the-deal-with-name_scope()?
    with net.name_scope():
        net.add(
            Conv2D(channels=32,
                   kernel_size=(3, 3),
                   padding=0,
                   activation="relu"))
        net.add(
            Conv2D(channels=32,
                   kernel_size=(3, 3),
                   padding=0,
                   activation="relu"))
        net.add(MaxPool2D(pool_size=(2, 2)))
        net.add(Dropout(.25))
        net.add(Flatten())
        net.add(Dense(8))
    ctx = mx.gpu() if num_gpus > 0 else mx.cpu()
    # Also known as Glorot
    net.collect_params().initialize(Xavier(magnitude=2.24), ctx=ctx)
    loss = SoftmaxCrossEntropyLoss()
    # kvstore type for multi - gpu and distributed training.
    if len(hosts) == 1:
        kvstore = "device" if num_gpus > 0 else "local"
    else:
        # Fixed: the string previously contained a stray trailing apostrophe
        # ("dist_device_sync'"), which is not a valid MXNet kvstore type.
        kvstore = "dist_device_sync" if num_gpus > 0 else "dist_sync"
    trainer = Trainer(net.collect_params(), optimizer="adam", kvstore=kvstore)
    smoothing_constant = .01
    for e in range(epochs):
        moving_loss = 0
        for i, (data, label) in enumerate(train_data):
            data = data.as_in_context(ctx)
            label = label.as_in_context(ctx)
            with autograd.record():
                output = net(data)
                loss_result = loss(output, label)
            loss_result.backward()
            trainer.step(batch_size)
            curr_loss = nd.mean(loss_result).asscalar()
            # Exponential moving average of the batch loss; seeded with the
            # very first batch's loss to avoid a cold-start bias.
            moving_loss = (curr_loss if ((i == 0) and (e == 0)) else
                           (1 - smoothing_constant) * moving_loss +
                           smoothing_constant * curr_loss)
        validation_accuracy = measure_performance(net, ctx, validation_data)
        train_accuracy = measure_performance(net, ctx, train_data)
        print("Epoch %s. Loss: %s, Train_acc %s, Test_acc %s" %
              (e, moving_loss, train_accuracy, validation_accuracy))
    return net
class Train(object):
    """Training/inference harness around `model()` for a segmentation-style
    network that returns ``(logits, mid_image)``; logs to MXBoard."""

    def __init__(self):
        object.__init__(self)
        self.model = model()          # network under training
        self.loss = None              # set in model_compile()
        self.trainer = None           # set in model_compile()
        self.learning_rate = 0.003
        # NOTE(review): doubled slashes in these paths are harmless on POSIX
        # but look unintentional; 'fill_it' placeholder is replaced via
        # set_weight_name().
        self.weight_url = '/home//jim//Dress//Data//weights.params'
        self.logdir_url = '/home//jim//Dress//log'
        self.sw = None                # MXBoard SummaryWriter, set in setup_debug()

    def set_weight_name(self, name):
        # Substitute the 'fill_it' placeholder in the weights path.
        self.weight_url = self.weight_url.replace('fill_it', name)

    def load_weights(self, weight_url):
        # NOTE(review): the argument is ignored; self.weight_url is used instead.
        if os.path.exists(self.weight_url):
            self.model.load_parameters(self.weight_url)

    def init_model(self):
        # Initialize a fresh model, or resume from an existing weights file.
        if os.path.exists(self.weight_url):
            print(u'已含有旧权重文件,正在载入继续训练并更新')
            self.model.load_parameters(self.weight_url,
                                       allow_missing=True,
                                       ignore_extra=True)
        else:
            #self.model.collect_params("conv*|dense*").initialize(init.Xavier())
            #self.model.initialize(init=init.Xavier())
            self.model.initialize(init=init.MSRAPrelu())
        # add the model graph (disabled)
        #y_hat, image = self.model(nd.random.uniform(shape=(8,3,512,512)))
        #self.model.hybridize()
        #y_hat, image = self.model(nd.random.uniform(shape=(8,3,512,512)))
        #self.sw.add_graph(self.model)
        # display the model structure
        print(u'model construct display:')
        print(self.model)

    def model_compile(self):
        # "Compile" the model: create loss, LR schedule and trainer.
        self.loss = gloss.SoftmaxCrossEntropyLoss(axis=1)
        lr_sch = lr_scheduler.FactorScheduler(step=100, factor=0.9)
        optimizer_params = {
            'learning_rate': self.learning_rate,
            'lr_scheduler': lr_sch
        }
        self.trainer = Trainer(self.model.collect_params(),
                               optimizer='adam',
                               optimizer_params=optimizer_params)
        #self.trainer.set_learning_rate = self.learning_rate

    def predict(self, x):
        # x: single HWC image; add batch dim and convert to NCHW.
        x = nd.expand_dims(x, axis=0)
        x = nd.transpose(x, axes=(0, 3, 1, 2))
        #print(u'单个样本预测输入的形状:{}'.format(x.shape))
        # initialize and compile the model before inference
        self.init_model()
        self.model_compile()
        #self.setup_debug()
        # run prediction with autograd in inference mode
        with autograd.predict_mode():
            #print(u'训练模式:{}'.format(autograd.is_training()))
            output = self.model(x)
        return output

    def acc(self, output, label):
        # output: (batch, num_output) float32 ndarray
        # label: (batch, label) float32 ndarray
        return (nd.argmax_channel(output) == label).mean().asscalar()

    def fit(self, dataset_train, dataset_val, epochs=1000):
        """Train for `epochs` epochs, validating and checkpointing per epoch."""
        # initialize and compile the model before training
        self.setup_debug()
        self.init_model()
        self.model_compile()
        # enter the training loop
        train_data, valid_data = dataset_train, dataset_val
        batch_size = 2
        global_step = 0
        for epoch in nd.arange(epochs):
            train_loss, train_acc, val_acc = 0., 0., 0.
            tic = time.time()
            train_step = 0
            for data, label_image, label in train_data:
                # HWC -> CHW for both input and the image-shaped label
                data = nd.transpose(data, axes=(0, 3, 1, 2))
                label_image = nd.transpose(label_image, axes=(0, 3, 1, 2))
                # show img as y
                #show_data(data, label)
                # forward + backward
                with autograd.record():
                    # print(u'训练模式:{}'.format(autograd.is_training()))
                    '''
                    print(u'输入数据形态{}, type = {}'.format(data.shape, data.dtype))
                    print('input max:{},min:{}'.format(data.max(),data.min()))
                    '''
                    # model returns (logits, intermediate image)
                    output, mid_image = self.model(data)
                    '''
                    print(u'输出y_hat数据形态{}, type = {}'.format(output.shape, output.dtype))
                    print(u'输出label数据形态{}, type = {}'.format(label.shape, label.dtype))
                    print('output max:{},min:{}'.format(output.max().asscalar(), output.min().asscalar()))
                    print('label max:{},min:{}'.format(label.max().asscalar(), label.min().asscalar()))
                    '''
                    # softmax cross-entropy against the class-index label
                    loss = self.loss(output, label)
                    #print('loss shape : {}'.format(loss.shape))
                    #print('loss value: {}'.format(loss))
                # backprop
                loss.backward()
                # update parameters
                self.trainer.step(batch_size)
                # calculate training metrics
                train_loss += np.mean(loss.asnumpy())
                #print('train_loss shape:{}'.format(train_loss.shape))
                #print('train_loss:{}'.format(train_loss))
                train_acc += self.acc(output, label)
                #print('train_acc:{}'.format(train_acc))
                # caculate countor
                train_step += 1
                # check_bug
                #print(u'train set check:')
                #self.check_bug(label, output)
                # callback (per-batch logging; val_* fields left empty here)
                train_loss_mean = train_loss / train_step
                train_acc_mean = train_acc / train_step
                # log training-side images/curves
                self.call_back(global_step = global_step, \
                               train_step = train_step, \
                               train_loss_mean = train_loss_mean, \
                               train_acc_mean = train_acc_mean, \
                               train_data = data, \
                               train_mid_image = mid_image[:,0:3,:,:], \
                               train_output = None, \
                               train_label = label_image, \
                               val_step = 0, \
                               val_acc_mean = None, \
                               val_data = None, \
                               val_mid_image = None, \
                               val_output = None, \
                               val_label = None \
                               )
                #break
            # calculate validation accuracy
            val_step = 0
            for val_data, val_label_image, val_label in valid_data:
                val_data = nd.transpose(val_data, axes=(0, 3, 1, 2))
                val_label_image = nd.transpose(val_label_image,
                                               axes=(0, 3, 1, 2))
                with autograd.predict_mode():
                    print(u'训练模式:{}'.format(autograd.is_training()))
                    val_output, val_mid_image = self.model(val_data)
                # accumulate validation accuracy
                val_acc += self.acc(val_output, val_label)
                #print(u'val set check:')
                #self.check_bug(label, output)
                val_step += 1
            val_acc_mean = val_acc / val_step
            # log validation-side images/curves
            self.call_back(global_step = global_step, \
                           train_step = train_step, \
                           train_loss_mean = train_loss_mean, \
                           train_acc_mean = train_acc_mean, \
                           train_data = data, \
                           train_mid_image = mid_image[:,0:3,:,:], \
                           train_output = None, \
                           train_label = label_image, \
                           val_step = val_step, \
                           val_acc_mean = val_acc_mean, \
                           val_data = val_data, \
                           val_mid_image = val_mid_image[:,0:3,:,:], \
                           val_output = None, \
                           val_label = val_label_image \
                           )
            #break
            # display rel
            print("Epoch {}: loss {}, train acc {}, test acc {}, using {} sec".
                  format(epoch, train_loss / train_step,
                         train_acc / train_step, val_acc / val_step,
                         time.time() - tic))
            # save weights
            self.model.save_parameters(self.weight_url)
            # advance the global step counter
            global_step += 1

    def check_bug(self, label, output):
        # Per-class true/false positive-negative counts for classes 0..2.
        np_label = label.asnumpy()
        output = output.argmax(axis=1)
        np_output = output.asnumpy()
        for i in [0, 1, 2]:
            index = np.ix_(np_label == i)[0]
            np_label_class_number = index.shape[0]
            post_in_output = np_output[index]
            #print(post_in_output)
            for p in [0, 1, 2]:
                np_output_class_number = post_in_output[post_in_output ==
                                                        p].shape[0]
                if i == p:
                    print(u'action = {}, real have {} predict have {}'.format(
                        i, np_label_class_number, np_output_class_number))
        print('---onetime check over---')

    def setup_debug(self):
        # MXBoard writer used by call_back().
        self.sw = SummaryWriter(logdir=self.logdir_url, flush_secs=10)

    def image_255_to_01(self, img):
        # NCHW -> NHWC, then clip into [0,1] and convert back to NCHW.
        # NOTE(review): the per-image normalization result is never written
        # back into `img`, so only the clip has effect — confirm intent.
        img = nd.transpose(img, axes=(0, 2, 3, 1))
        for i in nd.arange(img.shape[0]):
            image = img[i, :, :, :]
            rgb_mean = nd.array([0.485, 0.456, 0.406])
            rgb_std = nd.array([0.229, 0.224, 0.225])
            image = (image.astype('float32') / 255.0 - rgb_mean) / rgb_std
        img = img.clip(0.0, 1.0)
        img = nd.transpose(img, axes=(0, 3, 1, 2))
        return img

    def y_hat_to_image(self, y_hat):
        # Map per-pixel class predictions to RGB using the VOC colormap.
        y_hat = nd.argmax_channel(y_hat)
        print('y_hat shape = {}'.format(y_hat.shape))
        rel = nd.ones((y_hat.shape[0], 3, y_hat.shape[1], y_hat.shape[2]))
        for b in nd.arange(rel.shape[0]):
            for x in nd.arange(rel.shape[2]):
                for y in nd.arange(rel.shape[3]):
                    rel[b, :, x, y] = d2l.VOC_COLORMAP[int(y_hat[b, x, y])]
        return rel

    def call_back(
            self,
            global_step=0,
            train_step=0,
            train_loss_mean=None,
            train_acc_mean=None,
            train_data=None,
            train_mid_image=None,
            train_output=None,
            train_label=None,
            val_step=0,
            val_acc_mean=None,
            val_data=None,
            val_mid_image=None,
            val_output=None,
            val_label=None,
    ):
        """Write images, scalar curves and gradient histograms to MXBoard."""
        # add the model graph (disabled)
        #if global_step == 0:
        #self.model.hybridize()
        #self.sw.add_graph(self.model)
        # add images of intermediate layer outputs
        if train_data is not None:
            #train_output = self.image_255_to_01(train_output)
            train_mid_image = self.image_255_to_01(train_mid_image)
            train_label = self.image_255_to_01(train_label)
            # training-side images
            self.sw.add_image('train_input_image', train_data, train_step)
            self.sw.add_image('train_mid_image', train_mid_image, train_step)
            #self.sw.add_image('train_output', train_output, train_step)
            self.sw.add_image('train_label', train_label, train_step)
        if val_data is not None:
            val_data = self.image_255_to_01(val_data)
            val_mid_image = self.image_255_to_01(val_mid_image)
            #val_output = self.image_255_to_01(val_output)
            val_label = self.image_255_to_01(val_label)
            # validation-side images
            self.sw.add_image('val_input_image', val_data, val_step)
            self.sw.add_image('val_mid_image', val_mid_image, val_step)
            #self.sw.add_image('val_output', val_output, val_step)
            self.sw.add_image('val_label', val_label, val_step)
        # loss / accuracy curves over steps
        if train_loss_mean is not None:
            self.sw.add_scalar(tag = 'Loss_and_acc', \
                value = {'train_loss': train_loss_mean, 'train_acc': train_acc_mean}, \
                global_step = train_step)
        if val_acc_mean is not None:
            self.sw.add_scalar(tag = 'val_acc', \
                value = {'val_acc' : val_acc_mean}, \
                global_step = val_step)
        # gradient-distribution histograms per parameter
        grads = [
            i.grad()
            for i in self.model.collect_params('.*weight|.*bias').values()
        ]
        param_names = [
            name
            for name in self.model.collect_params('.*weight|.*bias').keys()
        ]
        assert len(grads) == len(param_names)
        # logging the gradients of parameters for checking convergence
        for i, name in enumerate(param_names):
            self.sw.add_histogram(tag=name,
                                  values=grads[i],
                                  global_step=train_step,
                                  bins=20)
data = split_and_load(data, ctx_list=ctx, batch_axis=0) label = split_and_load(label, ctx_list=ctx, batch_axis=0) output = [] losses = [] with ag.record(): for x, y in zip(data, label): z = model(x) # computes softmax cross entropy loss l = loss_fn(z, y) output.append(z) losses.append(l) # backpropagate the error for one iteration. for l in losses: l.backward() # Update network weights trainer.step(BATCH_SIZE) # Update metric metric.update(label, output) str1 = 'Epoch [{}], Accuracy {:.4f}'.format(epoch, metric.get()[1]) str2 = '~Samples/Sec {:.4f}'.format(BATCH_SIZE * (i + 1) / (time.time() - tick_0)) print('%s %s' % (str1, str2)) metric.reset() elapsed = time.perf_counter() - start print('elapsed: {:0.3f}'.format(elapsed)) # use Accuracy as the evaluation metric metric = Accuracy() for data, label in test_data: data = split_and_load(data, ctx_list=ctx, batch_axis=0)
def train(cfg, ctx_lst, project_name, log_interval=5, no_val=False, lr=None, wd=None):
    """Train a semantic-segmentation model with configuration tracked by wandb.

    :param cfg: base config dict handed to ``wandb.init``
    :param ctx_lst: list of device specifiers resolved via my_tools.get_contexts
    :param project_name: wandb project name
    :param log_interval: batches between per-batch wandb loss logs
    :param no_val: skip the validation loop entirely when True
    :param lr, wd: optional overrides of the configured learning rate / weight decay
    """
    wandb.init(job_type='train',
               dir=my_tools.root_dir(),
               config=cfg,
               project=project_name)
    # CLI overrides take precedence over the config file.
    if lr and wd:
        wandb.config.lr = lr
        wandb.config.wd = wd

    ctx = my_tools.get_contexts(ctx_lst)
    wandb.config.ctx = ctx

    data_factory = DataFactory(wandb.config.data_name)
    model_factory = ModelFactory(wandb.config.model_name)

    norm_layer, norm_kwargs = my_tools.get_norm_layer(wandb.config.norm,
                                                      len(ctx))
    model_kwargs = {
        'nclass': data_factory.num_class,
        'backbone': wandb.config.backbone,
        # backbone pretrained on classification only when manner == 'cls'
        'pretrained_base': wandb.config.backbone_init.get('manner') == 'cls',
        'aux': wandb.config.aux,
        'crop_size': wandb.config.crop_size,
        'base_size': wandb.config.base_size,
        'dilate': wandb.config.dilate,
        'norm_layer': norm_layer,
        'norm_kwargs': norm_kwargs,
    }
    net = model_factory.get_model(
        model_kwargs,
        resume=wandb.config.resume,
        lr_mult=wandb.config.lr_mult,
        backbone_init_manner=wandb.config.backbone_init.get('manner'),
        backbone_ckpt=wandb.config.backbone_init.get('backbone_ckpt'),
        prior_classes=wandb.config.backbone_init.get('prior_classes'),
        ctx=ctx)
    if net.symbolize:
        net.hybridize()

    # DataLoader workers are unreliable on Windows, so fall back to 0 there.
    num_worker = 0 if platform.system() == 'Windows' else 16
    train_set = data_factory.seg_dataset(
        split='train',  # sometimes would be 'trainval'
        mode='train',
        transform=my_tools.image_transform(),
        base_size=wandb.config.base_size,
        crop_size=wandb.config.crop_size)
    train_iter = DataLoader(train_set,
                            wandb.config.bs_train,
                            shuffle=True,
                            last_batch='discard',
                            num_workers=num_worker)
    val_set = data_factory.seg_dataset(split='val',
                                       mode='val',
                                       transform=my_tools.image_transform(),
                                       base_size=wandb.config.base_size,
                                       crop_size=wandb.config.crop_size)
    val_iter = DataLoader(val_set,
                          wandb.config.bs_val,
                          shuffle=False,
                          last_batch='keep',
                          num_workers=num_worker)
    wandb.config.num_train = len(train_set)
    wandb.config.num_valid = len(val_set)

    criterion = _get_criterion(wandb.config.aux, wandb.config.aux_weight)
    criterion.initialize(ctx=ctx)
    wandb.config.criterion = type(criterion)

    # Adam uses a flat LR; sgd/nag get an LR scheduler instead.
    if wandb.config.optimizer == 'adam':
        trainer = Trainer(net.collect_params(),
                          'adam',
                          optimizer_params={
                              'learning_rate': wandb.config.lr,
                              'wd': wandb.config.wd,
                              'beta1': wandb.config.adam.get('adam_beta1'),
                              'beta2': wandb.config.adam.get('adam_beta2')
                          })
    elif wandb.config.optimizer in ('sgd', 'nag'):
        scheduler = _lr_scheduler(
            mode=wandb.config.lr_scheduler,
            base_lr=wandb.config.lr,
            target_lr=wandb.config.target_lr,
            nepochs=wandb.config.epochs,
            iters_per_epoch=len(train_iter),
            step_epoch=wandb.config.step.get('step_epoch'),
            step_factor=wandb.config.step.get('step_factor'),
            power=wandb.config.poly.get('power'))
        trainer = Trainer(net.collect_params(),
                          wandb.config.optimizer,
                          optimizer_params={
                              'lr_scheduler': scheduler,
                              'wd': wandb.config.wd,
                              'momentum': wandb.config.momentum,
                              'multi_precision': True
                          })
    else:
        raise RuntimeError(f"Unknown optimizer: {wandb.config.optimizer}")

    metric = SegmentationMetric(data_factory.num_class)

    logger = get_logger(name='train', level=10)  # level 10 == DEBUG
    t_start = my_tools.get_strftime()
    logger.info(f'Training start: {t_start}')
    for k, v in wandb.config.items():
        logger.info(f'{k}: {v}')
    logger.info('-----> end hyper-parameters <-----')
    wandb.config.start_time = t_start

    best_score = .0
    best_epoch = 0
    for epoch in range(wandb.config.epochs):
        train_loss = .0
        tbar = tqdm(train_iter)
        for i, (data, target) in enumerate(tbar):
            # one slice per device
            gpu_datas = split_and_load(data, ctx_list=ctx)
            gpu_targets = split_and_load(target, ctx_list=ctx)
            with autograd.record():
                loss_gpus = [
                    criterion(*net(gpu_data), gpu_target)
                    for gpu_data, gpu_target in zip(gpu_datas, gpu_targets)
                ]
                for loss in loss_gpus:
                    autograd.backward(loss)
            trainer.step(wandb.config.bs_train)
            nd.waitall()  # sync before reading the losses back
            train_loss += sum([loss.mean().asscalar()
                               for loss in loss_gpus]) / len(loss_gpus)
            tbar.set_description(
                'Epoch-%d [training], loss %.5f, %s' %
                (epoch, train_loss / (i + 1),
                 my_tools.get_strftime('%Y-%m-%d %H:%M:%S')))
            if (i % log_interval == 0) or (i + 1 == len(train_iter)):
                wandb.log({
                    f'train_loss_batch, interval={log_interval}':
                    train_loss / (i + 1)
                })
        wandb.log({
            'train_loss_epoch': train_loss / (len(train_iter)),
            'custom_step': epoch
        })

        if not no_val:
            val_loss = .0
            vbar = tqdm(val_iter)
            for i, (data, target) in enumerate(vbar):
                # even_split=False: the 'keep' last batch may be smaller
                gpu_datas = split_and_load(data=data,
                                           ctx_list=ctx,
                                           even_split=False)
                gpu_targets = split_and_load(data=target,
                                             ctx_list=ctx,
                                             even_split=False)
                loss_gpus = []
                for gpu_data, gpu_target in zip(gpu_datas, gpu_targets):
                    gpu_output = net(gpu_data)
                    loss_gpus.append(criterion(*gpu_output, gpu_target))
                    # gpu_output[0] is the main head (aux heads follow)
                    metric.update(gpu_target, gpu_output[0])
                val_loss += sum([loss.mean().asscalar()
                                 for loss in loss_gpus]) / len(loss_gpus)
                vbar.set_description(
                    'Epoch-%d [validation], PA %.4f, mIoU %.4f' %
                    (epoch, metric.get()[0], metric.get()[1]))
            nd.waitall()
            pix_acc, mean_iou = metric.get()
            wandb.log({
                'val_PA': pix_acc,
                'val_mIoU': mean_iou,
                'val_loss': val_loss / len(val_iter),
                'custom_step': epoch
            })
            metric.reset()
            # checkpoint whenever validation mIoU improves
            if mean_iou > best_score:
                my_tools.save_checkpoint(
                    model=net,
                    model_name=wandb.config.model_name.lower(),
                    backbone=wandb.config.backbone.lower(),
                    data_name=wandb.config.data_name.lower(),
                    time_stamp=wandb.config.start_time,
                    is_best=True)
                best_score = mean_iou
                best_epoch = epoch

    logger.info(
        f'Best val mIoU={round(best_score * 100, 2)} at epoch: {best_epoch}')
    wandb.config.best_epoch = best_epoch
    # always save the final weights as well
    my_tools.save_checkpoint(model=net,
                             model_name=wandb.config.model_name.lower(),
                             backbone=wandb.config.backbone.lower(),
                             data_name=wandb.config.data_name.lower(),
                             time_stamp=wandb.config.start_time,
                             is_best=False)
class Model:
    """Keras-like wrapper bundling a Gluon network with its trainer,
    loss, metric and a plotted training history."""

    __slots__ = ('net', 'ctx', 'trainer', 'loss_fun', 'metric', 'history')

    def __init__(self, net, ctx):
        self.net = net
        self.ctx = ctx
        # Default trainer; typically replaced via compile().
        self.trainer = Trainer(net.collect_params(), 'adam',
                               {'learning_rate': 0.01})
        self.loss_fun = None   # set in compile()
        self.metric = None     # set in compile()
        self.history = plot_history.TrainingHistory(['acc'])

    def compile(self, trainer, loss_fun, metric):
        # Swap in the user-chosen optimizer, loss and metric.
        self.trainer = trainer
        self.loss_fun = loss_fun
        self.metric = metric

    def test(self, dataloader):
        """Evaluate on `dataloader`; returns (last-batch mean loss, metric.get()).

        NOTE(review): resets the shared metric, so calling this mid-epoch
        (as fit() does) clobbers the running training accuracy.
        """
        self.metric.reset()
        for data, label in dataloader:
            if self.ctx != mx.cpu():
                data = data.copyto(self.ctx)
                label = label.copyto(self.ctx)
            outputs = self.net(data)
            loss = self.loss_fun(outputs, label)
            self.metric.update(labels=label, preds=outputs)
        # Only the final batch's loss is reported.
        loss = loss.asnumpy().mean()
        return loss, self.metric.get()

    def summary(self):
        # Print the network structure.
        print(self.net)

    def fit(self, train_data, epochs, val_data=None):
        """Train for `epochs` epochs; validates after every step if
        `val_data` is given. Returns the accumulated TrainingHistory."""
        if val_data:
            # Track both curves when validation data is supplied.
            self.history = plot_history.TrainingHistory(
                ['acc_train', 'acc_val'])
        length = len(train_data)
        for epoch in range(1, 1 + epochs):
            self.metric.reset()
            tic = time.time()
            for step, (data, label) in enumerate(train_data):
                if self.ctx != mx.cpu():
                    data = data.copyto(self.ctx)
                    label = label.copyto(self.ctx)
                batch_size = data.shape[0]
                with autograd.record():
                    outputs = self.net(data)
                    loss = self.loss_fun(outputs, label)
                loss.backward()
                self.trainer.step(batch_size)
                self.metric.update(labels=label, preds=outputs)
                loss = loss.asnumpy().mean()
                _, acc = self.metric.get()
                t = time.time()
                step += 1  # 1-based step for the progress printout
                if val_data:
                    # NOTE(review): `loss` is overwritten with the validation
                    # loss here, so the printed loss is validation-side.
                    loss, (_, val_acc) = self.test(val_data)
                    print(
                        f'Epoch:{epoch}/{epochs} step:{step}/{length} acc:{acc} val_acc:{val_acc} loss:{loss} time:{t - tic}'
                    )
                    self.history.update([acc, val_acc])
                else:
                    print(
                        f'Epoch:{epoch}/{epochs} step:{step}/{length} acc:{acc} loss:{loss} time:{t - tic}'
                    )
                    self.history.update([acc])
        return self.history
def main():
    """Train the Go move-prediction network on SGF data across GPUs,
    profiling the first epoch, logging to MXBoard, then evaluating on the
    held-out test iterator."""
    data_p = Path('/storage/data/').resolve()
    checkpoint_p = Path('./checkpoints/').resolve()
    checkpoint_p.mkdir(parents=True, exist_ok=True)
    logs_p = Path('./logs/').resolve()
    # Start every run with a clean TensorBoard log directory.
    shutil.rmtree(logs_p, ignore_errors=True)

    encoder = SevenPlaneEncoder((19, 19))
    builder = SGFDatasetBuilder(data_p, encoder=encoder)
    builder.download_and_prepare()
    train_itr = builder.train_dataset(batch_size=BATCH_SIZE,
                                      max_worker=cpu_count(),
                                      factor=FACTOR)
    test_itr = builder.test_dataset(batch_size=BATCH_SIZE,
                                    max_worker=cpu_count(),
                                    factor=FACTOR)
    # build model
    betago = Model()
    # convert to half-precision floating point FP16
    # NOTE: all NVIDIA GPUs with compute capability 6.1 have a low-rate FP16
    # performance == FP16 is not the fast path on these GPUs;
    # data passed to split_and_load() must be float16 too
    #betago.cast('float16')
    # hybridize for speed
    betago.hybridize(static_alloc=True, static_shape=True)
    # print graph
    shape = (1, ) + encoder.shape()
    mx.viz.print_summary(betago(mx.sym.var('data')), shape={'data': shape})
    # pin GPUs
    ctx = [mx.gpu(i) for i in range(GPU_COUNT)]
    # optimizer
    opt_params = {
        'learning_rate': 0.001,
        'beta1': 0.9,
        'beta2': 0.999,
        'epsilon': 1e-08
    }
    opt = mx.optimizer.create('adam', **opt_params)
    # initialize parameters
    # MXNet initializes the weight matrices uniformly by drawing from
    # [−0.07,0.07], bias parameters are all set to 0.
    # 'Xavier': initializer is designed to keep the scale of gradients
    # roughly the same in all layers
    betago.initialize(mx.init.Xavier(magnitude=2.3), ctx=ctx,
                      force_reinit=True)
    # fetch and broadcast parameters
    params = betago.collect_params()
    # trainer
    trainer = Trainer(params=params, optimizer=opt, kvstore='device')
    # loss function
    loss_fn = SoftmaxCrossEntropyLoss()
    # use accuracy as the evaluation metric
    metric = Accuracy()

    with mxb.SummaryWriter(logdir='./logs') as sw:
        # add graph to MXBoard (disabled)
        #betago.forward(mx.nd.ones(shape, ctx=ctx[0]))
        #betago.forward(mx.nd.ones(shape, ctx=ctx[1]))
        #sw.add_graph(betago)
        # profile only the first epoch (set_state toggled below)
        profiler.set_config(profile_all=True,
                            aggregate_stats=True,
                            continuous_dump=True,
                            filename='profile_output.json')
        start = time.perf_counter()
        # train
        for e in range(EPOCHS):
            if 0 == e:
                profiler.set_state('run')
            tick = time.time()
            # reset the train data iterator.
            train_itr.reset()
            # loop over the train data iterator
            for i, batch in enumerate(train_itr):
                if 0 == i:
                    tick_0 = time.time()
                # splits train data into multiple slices along batch_axis
                # copy each slice into a context
                data = split_and_load(batch.data[0],
                                      ctx_list=ctx,
                                      batch_axis=0,
                                      even_split=False)
                # splits train label into multiple slices along batch_axis
                # copy each slice into a context
                label = split_and_load(batch.label[0],
                                       ctx_list=ctx,
                                       batch_axis=0,
                                       even_split=False)
                outputs = []
                losses = []
                # inside training scope
                with ag.record():
                    for x, y in zip(data, label):
                        z = betago(x)
                        # computes softmax cross entropy loss
                        l = loss_fn(z, y)
                        outputs.append(z)
                        losses.append(l)
                # backpropagate the error for one iteration
                for l in losses:
                    l.backward()
                # make one step of parameter update.
                # trainer needs to know the batch size of data
                # to normalize the gradient by 1/batch_size
                trainer.step(BATCH_SIZE)
                # updates internal evaluation
                metric.update(label, outputs)
                # Print batch metrics
                if 0 == i % PRINT_N and 0 < i:
                    # checkpointing
                    betago.save_parameters(
                        str(checkpoint_p.joinpath(
                            'betago-{}.params'.format(e))))
                    sw.add_scalar(tag='Accuracy',
                                  value={'naive': metric.get()[1]},
                                  global_step=i - PRINT_N)
                    sw.add_scalar(tag='Speed',
                                  value={
                                      'naive':
                                      BATCH_SIZE * (PRINT_N) /
                                      (time.time() - tick)
                                  },
                                  global_step=i - PRINT_N)
                    print(
                        'epoch[{}] batch [{}], accuracy {:.4f}, samples/sec: {:.4f}'
                        .format(e, i,
                                metric.get()[1],
                                BATCH_SIZE * (PRINT_N) /
                                (time.time() - tick)))
                    tick = time.time()
            if 0 == e:
                profiler.set_state('stop')
                profiler.dump()
            # gets the evaluation result
            print('epoch [{}], accuracy {:.4f}, samples/sec: {:.4f}'.format(
                e,
                metric.get()[1],
                BATCH_SIZE * (i + 1) / (time.time() - tick_0)))
            # reset evaluation result to initial state
            metric.reset()
        elapsed = time.perf_counter() - start
        print('elapsed: {:0.3f}'.format(elapsed))

    # use Accuracy as the evaluation metric for the final test pass
    metric = Accuracy()
    for batch in test_itr:
        data = split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
        label = split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
        outputs = []
        for x in data:
            outputs.append(betago(x))
        metric.update(label, outputs)
    print('validation %s=%f' % metric.get())
def train(net, train_dataloader, valid_dataloader, ctx_list, args):
    """Training pipeline.

    Trains `net` with SGD + softmax cross-entropy, step-decaying the learning
    rate at the epochs in ``args.lr_decay_epoch``, logging batch/epoch stats
    to ``<save_prefix>_train.log``, periodically validating, and saving
    parameters via ``save_parameters``.

    :param net: Gluon network to train
    :param train_dataloader: training batches; batch = [data, label]
    :param valid_dataloader: validation batches for evaluate()
    :param ctx_list: devices to split each batch across
    :param args: namespace with lr, wd, momentum, start_epoch, epochs,
        lr_decay, lr_decay_epoch, hybrid, verbose, log_interval,
        val_interval, save_interval, save_prefix
    """
    # optimizer
    trainer = Trainer(net.collect_params(), 'sgd', {
        'learning_rate': args.lr,
        'wd': args.wd,
        'momentum': args.momentum
    })
    # loss and metrics
    acc_metric = AccuracyMetric()
    loss_metric = mx.metric.Loss('SoftMaxCrossEntropyLoss')
    valid_metric = ValidMetric()
    cross_entropy_loss = gloss.SoftmaxCrossEntropyLoss()
    metric1 = [loss_metric]
    metric2 = [acc_metric]
    # create a logging setup: console + file handler next to save_prefix
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fd = logging.FileHandler(log_file_path)
    logger.addHandler(fd)
    logger.info(args)
    if args.verbose:
        # fixed typos: was 'Trainabel paramters:'
        logger.info('Trainable parameters:')
        logger.info(net.collect_params().keys())
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    # fixed typo: was 'Traing on {}'
    logger.info('Training on {}'.format(ctx_list))
    # best accuracy is kept in a list so save_parameters can mutate it
    best_acc = [0]
    lr_steps = sorted(
        [int(step) for step in args.lr_decay_epoch.split(',') if step.strip()])
    lr_decay = float(args.lr_decay)
    for epoch in range(args.start_epoch, args.epochs):
        ttime = time.time()  # epoch timer
        btime = time.time()  # batch-window timer for the speed log
        # lr decay at the next scheduled epoch
        if lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info('[Epoch {}] set learning rate to {:.4f}'.format(
                epoch, new_lr))
        acc_metric.reset()
        if args.hybrid:
            net.hybridize(static_alloc=True)
        # get mini-batch data; batch = [data, label]
        for i, batch in enumerate(train_dataloader):
            batch_size = len(batch[0])
            batch = split_and_load_data(batch, ctx_list, batch_size)
            losses = []
            metrics = []
            with autograd.record():
                for data, cls_label in zip(*batch):
                    # forward
                    pred_scores = net(data)
                    # loss
                    loss = cross_entropy_loss(pred_scores, cls_label)
                    # record loss and preds
                    losses.append(loss)
                    metrics.append([[cls_label], [pred_scores]])
                # backward
                autograd.backward(losses)
            # optimizer step normalized by the batch size
            trainer.step(batch_size)
            # update metrics
            for record in metrics:
                acc_metric.update(record[0], record[1])
            for record in losses:
                loss_metric.update(0, record)
            if args.log_interval and not (i + 1) % args.log_interval:
                # periodic progress logging
                info = ','.join([
                    '{}={:.3f}'.format(*metric.get())
                    for metric in metric1 + metric2
                ])
                msg = '[Epoch {}][Batch {}],Speed: {:.3f} samples/sec,{}'.format(
                    epoch, i,
                    args.log_interval * batch_size / (time.time() - btime),
                    info)
                logger.info(msg)
                btime = time.time()
        info = ','.join(['{}={:.3f}'.format(*loss_metric.get())])
        # fixed typo: was 'Traning cost'
        msg = '[Epoch {}] Training cost : {:.3f},{}'.format(
            epoch, time.time() - ttime, info)
        logger.info(msg)
        if args.val_interval and not (epoch + 1) % args.val_interval:
            name, current_acc = evaluate(net, valid_dataloader, valid_metric,
                                         ctx_list, args.hybrid)
            info = '{}={:.3f}'.format(name, current_acc)
            msg = '[Epoch {}] Validation {}.'.format(epoch, info)
            logger.info(msg)
        else:
            current_acc = 0
        save_parameters(net, logger, best_acc, current_acc, epoch,
                        args.save_interval, args.save_prefix)
class AnswerVerifyThreshold(object):
    """Decides whether a SQuAD 2.0-style question is answerable.

    Two strategies are selected by ``self.option``:
      * option 1 — compare the null-vs-best score difference against a
        scalar threshold fitted from collected statistics.
      * option 2 — a small 3-hidden-layer MLP classifier trained on
        ``[score_diff, non_empty_top]`` feature pairs with a sigmoid BCE loss.
    ``self.option`` is hard-coded to 2, so option-1 branches are dormant.
    """

    def __init__(self,
                 tokenizer=nlp.data.BERTBasicTokenizer(lower=True),
                 max_answer_length=30,
                 n_best_size=20,
                 max_len=384,
                 version_2=True,
                 ctx=mx.cpu()):
        # NOTE(review): the default tokenizer is instantiated once at class
        # definition time and shared by all instances — confirm this is intended.
        self.tokenizer = tokenizer
        self.max_answer_length = max_answer_length
        self.n_best_size = n_best_size
        # max_len is accepted but never stored or used in this class
        self.version_2 = version_2
        self.ctx = ctx
        # accumulated [score_diff, non_empty_top, label] training rows
        self.data = list()
        # strategy selector; hard-coded to the MLP classifier
        self.option = 2
        if self.option == 1:
            self.null_score_diff_threshold = 0.0  # normally between -5 and -1
            # TODO: consider cleverer ways such as svm etc.
        elif self.option == 2:
            # sigmoid outputs above this value are classed as "answerable"
            self.threshold = 0.45  # 0.5
            self.batch_size = 1024
            self.classifier = nn.HybridSequential()
            with self.classifier.name_scope():
                self.classifier.add(nn.Dense(units=10,
                                             activation='relu'))  # input layer
                self.classifier.add(nn.Dense(
                    units=10, activation='relu'))  # inner layer 1
                self.classifier.add(nn.Dense(
                    units=10, activation='relu'))  # inner layer 2
                self.classifier.add(
                    nn.Dense(units=1)
                )  # output layer: notice, it must have only 1 neuron for regression
            self.classifier.initialize(init=mx.init.Xavier(), ctx=ctx)
            self.loss = gluon.loss.SigmoidBinaryCrossEntropyLoss()
            self.trainer = Trainer(params=self.classifier.collect_params(),
                                   optimizer='sgd',
                                   optimizer_params={'learning_rate':
                                                     0.1})  # 0.01, 0.1

    def train(self,
              train_features,
              example_ids,
              out,
              token_types=None,
              bert_out=None,
              num_epochs=1,
              verbose=False):
        """Collect training rows from one batch of model outputs.

        Despite the name, this only accumulates rows into ``self.data``;
        the actual optimization happens in :meth:`update`.  ``bert_out``,
        ``num_epochs`` and ``verbose`` are accepted but unused here.
        No-op unless ``version_2`` (SQuAD 2.0 with unanswerables) is set.
        """
        if not self.version_2:
            return
        raw_data = self.get_training_data(train_features,
                                          example_ids,
                                          out,
                                          token_types=token_types)
        self.data.extend(raw_data)

    def evaluate(self, score_diff, best_pred):
        """Return 1. if the question is judged answerable, else 0.

        ``score_diff`` is the null-vs-best score gap; ``best_pred`` is the
        non-empty-top-prediction indicator produced by get_training_data.
        """
        # asserted that prediction is not null
        if self.option == 1:
            # large score_diff favors the null answer -> not answerable
            if score_diff > self.null_score_diff_threshold:
                answerable = 0.
            else:
                answerable = 1.
        elif self.option == 2:
            data = mx.nd.array([[score_diff,
                                 best_pred]]).as_in_context(self.ctx)
            # Do forward pass on a batch of validation data
            output = self.classifier(data)
            # getting prediction as a sigmoid
            prediction = output.sigmoid()
            # Converting neuron outputs to classes: ceil(p - t) is 1 iff p > t
            predicted_classes = mx.nd.ceil(prediction - self.threshold)
            # calculate probabilities of belonging to different classes. F1 metric works only with this notation
            # prediction = prediction.reshape(-1)
            answerable = predicted_classes[0].asscalar()
            # print(score_diff, best_pred, "answerable:", answerable)
        # reset the data
        return answerable

    def update(self, epochs=100):
        """Fit the verifier from the rows accumulated by :meth:`train`.

        Option 1 re-estimates the scalar threshold; option 2 trains the MLP
        classifier for ``epochs`` passes.  ``self.data`` is cleared afterwards.
        """
        if self.option == 1:
            data_numpy = np.array(self.data)
            # X/y are computed but unused in this branch — only the
            # score_diff column (0) feeds the threshold estimate
            X = np.array(data_numpy[:, :-1])
            y = np.array(data_numpy[:, -1])
            # np.mean()
            # self.null_score_diff_threshold = np.median(data_numpy[:,0])
            self.null_score_diff_threshold = np.mean(
                data_numpy[:, 0]) + np.std(data_numpy[:, 0]) * .05
        elif self.option == 2:
            data_numpy = np.array(self.data)
            # features = all columns but the last; label = last column
            X = nd.array(data_numpy[:, :-1])
            y = nd.array(data_numpy[:, -1])
            train_dataset = ArrayDataset(X, y)
            train_dataloader = DataLoader(train_dataset,
                                          batch_size=self.batch_size,
                                          shuffle=True)
            for e in range(epochs):
                cumulative_train_loss = 0  # accumulated but never reported
                for i, (data, label) in enumerate(train_dataloader):
                    data = data.as_in_context(self.ctx)
                    label = label.as_in_context(self.ctx)
                    with autograd.record():
                        # Do forward pass on a batch of training data
                        output = self.classifier(data)
                        # Calculate loss for the training data batch
                        loss_result = self.loss(output, label)
                    # Calculate gradients
                    loss_result.backward()
                    # Update parameters of the network
                    self.trainer.step(len(data))
        # discard consumed rows so the next round starts fresh
        self.data = list()

    def get_training_data(self,
                          train_features,
                          example_ids,
                          out,
                          token_types=None):
        """Turn raw span-prediction outputs into verifier training rows.

        Splits ``out`` into start/end score tensors, runs the external
        ``predict`` helper per example, and emits one
        ``[score_diff, non_empty_top, label]`` row per example, where label
        is 0 for impossible questions and 1 otherwise.
        ``token_types`` is accepted but unused.
        """
        # out last axis holds (start, end) scores — split into two tensors
        output = mx.nd.split(out, axis=2, num_outputs=2)
        example_ids = example_ids.asnumpy().tolist()
        pred_start = output[0].reshape((0, -3)).asnumpy()
        pred_end = output[1].reshape((0, -3)).asnumpy()
        raw_data = []
        for example_id, start, end in zip(example_ids, pred_start, pred_end):
            results = [PredResult(start=start, end=end)]
            features = train_features[example_id]
            label = 0 if features[0].is_impossible else 1
            prediction, score_diff, top_predict = predict(
                features=features,
                results=results,
                tokenizer=self.tokenizer,
                max_answer_length=self.max_answer_length,
                n_best_size=self.n_best_size,
                version_2=self.version_2)
            # 1. when the top prediction is non-empty, else 0.
            non_empty_top = 1. if top_predict else 0.
            # print(prediction, "," , top_predict, ",", features[0].orig_answer_text)
            raw_data.append([score_diff, non_empty_top, label])
        return raw_data
# print(data.shape)
# break

# Build the network and place it on the available device.
net = get_net()
ctx = utils.getCtx()
net.initialize(ctx=ctx, init=init.Xavier())

softmax_loss = loss.SoftmaxCrossEntropyLoss()
epochs = 5
trainer = Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.5})

for epoch in range(epochs):
    # Running sums across this epoch's batches.
    loss_sum, acc_sum = 0.0, 0.0
    for features, targets in train_iter:
        with autograd.record():
            logits = net(features)
            batch_loss = softmax_loss(logits, targets)
        batch_loss.backward()
        trainer.step(batch_size)
        loss_sum += nd.mean(batch_loss).asscalar()
        acc_sum += utils.accuracy(logits, targets)
    # Report epoch averages plus held-out accuracy.
    n_batches = len(train_iter)
    test_acc = utils.evaluate_accuracy(test_iter, net)
    print('Epoch %d, Train loss: %f, Train acc: %f, Test acc: %f' % (
        epoch,
        loss_sum / n_batches,
        acc_sum / n_batches,
        test_acc
    ))
def train_model_for_ml(self):
    """
    Train the model for multi-label classification.

    Uses RMSProp with step-wise learning-rate decay, splitting each batch
    across ``self.ctx`` devices, optimizing a sigmoid binary cross-entropy
    loss, logging per-batch/per-epoch recall/precision/F1, and saving a
    checkpoint after every epoch scored by validation F1.
    """
    base_net = self.get_base_net()  # base network to fine-tune
    train_data, len_td = self.get_train_data(self.batch_size)  # training data, fetched in batches
    val_data, len_vd = self.get_val_data(self.batch_size)  # validation data, fetched in batches
    trainer = Trainer(base_net.collect_params(), 'rmsprop',
                      {'learning_rate': 1e-4})
    loss_func = SigmoidBinaryCrossEntropyLoss()
    # epochs at which lr is multiplied by lr_factor; np.inf sentinel keeps
    # lr_counter indexing safe after the last real step
    lr_steps = [10, 20, 30, np.inf]
    lr_factor = 0.75
    lr_counter = 0
    n_batch = int(len_td / self.batch_size)
    self.print_info('训练 - 样本数:{}, 批次样本: {}, 批次数: {}'.format(
        len_td, self.batch_size, n_batch))
    for epoch in range(self.epochs):
        if epoch == lr_steps[lr_counter]:  # step-wise learning-rate decay
            trainer.set_learning_rate(trainer.learning_rate * lr_factor)
            lr_counter += 1
        # per-epoch accumulators: loss, recall, precision, f1
        e_loss, e_r, e_p, e_f1 = 0, 0, 0, 0
        for i, batch in enumerate(train_data):
            # labels cast to float32 for the sigmoid BCE loss
            data, labels = batch[0], batch[1].astype('float32')
            # shard the batch across all configured devices
            data = split_and_load(data,
                                  ctx_list=self.ctx,
                                  batch_axis=0,
                                  even_split=False)
            labels = split_and_load(labels,
                                    ctx_list=self.ctx,
                                    batch_axis=0,
                                    even_split=False)
            with autograd.record():  # record graph for autograd
                outputs = [base_net(X) for X in data]
                bc_loss = [
                    loss_func(yhat, y) for yhat, y in zip(outputs, labels)
                ]
            # backward on each device's loss, then one step over the full batch
            for l in bc_loss:
                l.backward()
            trainer.step(self.batch_size)
            # mean loss across device shards for this batch
            batch_loss = sum([l.mean().asscalar() for l in bc_loss]) / len(
                bc_loss)
            e_loss += batch_loss
            br, bp, bf1 = self.get_batch_rpf(outputs, labels)
            e_r += br
            e_p += bp
            e_f1 += bf1
            self.print_info(
                'batch: {}, loss: {:.5f}, recall: {:.2f}, precision: {:.2f}, f1: {:.2f}'
                .format(i, batch_loss, br, bp, bf1))
        n_batch = i + 1  # actual number of batches seen this epoch
        e_loss /= n_batch
        e_r /= n_batch
        e_p /= n_batch
        e_f1 /= n_batch
        self.print_info(
            'epoch: {}, loss: {:.5f}, recall: {:.2f}, precision: {:.2f}, f1: {:.2f}'
            .format(epoch, e_loss, e_r, e_p, e_f1))
        # validation metrics overwrite the training ones; the validation
        # f1 is what scores the checkpoint below
        e_r, e_p, e_f1 = self.val_net(base_net, val_data, len_vd)
        self.save_net_and_params(base_net, epoch, e_f1,
                                 name='multilabel')  # save checkpoint