data = data.as_in_context(data_ctx) label = label.as_in_context(data_ctx) output = net(data) prediction = nd.dot(output, scale) acc.update(preds=prediction, labels=label) return acc.get()[1] if __name__ == '__main__': net.output.collect_params().initialize( init=mx.init.Uniform(scale=1 / math.sqrt(2048)), ctx=model_ctx, force_reinit=False) # net.collect_params().initialize(init=mx.init.Xavier(), ctx=model_ctx) # net.load_params('/home/gdshen/datasets/mxnet_checkpoint/checkpoint-imdb-15.params', ctx=model_ctx) net.collect_params().reset_ctx(ctx=model_ctx) # net.hybridize() trainer = gluon.Trainer(net.collect_params(), 'sgd', { 'learning_rate': lr, 'momentum': mom, 'wd': weight_decay }) # trainer = gluon.Trainer(net.collect_params(), 'Adam', {'learning_rate': lr}) data_iter = gluon.data.DataLoader(training_datasets, 10, shuffle=True, num_workers=8, last_batch='discard') eval_iter = gluon.data.DataLoader(test_datasets, 10,
def train():
    """Fine-tune the classification head for one task and return the trained net.

    Relies on module-level configuration and helpers visible elsewhere in the
    file: `task`, `task_num_class`, `ctx`, `batch_size`, `num_workers`, `lr`,
    `momentum`, `wd`, `epochs`, `lr_steps`, `lr_factor`, plus `Net`,
    `transform_train`, `transform_val`, `validate`, `calculate_ap`,
    `progressbar`.

    Returns:
        The fine-tuned (hybridized) Gluon network.
    """
    logging.info('Start Training for Task: %s\n' % (task))

    # Initialize the net from the pretrained model wrapper; only its output
    # head is trained here.
    finetune_net = Net(ctx, task_num_class).output
    finetune_net.collect_params().reset_ctx(ctx)  # move parameters onto the target device(s)
    finetune_net.hybridize()  # fuse the graph for faster execution

    # DataLoaders for the per-task train/val image folders.
    train_data = gluon.data.DataLoader(
        gluon.data.vision.ImageFolderDataset(
            os.path.join('data/train_valid', task, 'train'),
            transform=transform_train),
        batch_size=batch_size, shuffle=True, num_workers=num_workers,
        last_batch='discard')

    val_data = gluon.data.DataLoader(
        gluon.data.vision.ImageFolderDataset(
            os.path.join('data/train_valid', task, 'val'),
            transform=transform_val),
        batch_size=batch_size, shuffle=False, num_workers=num_workers)

    # SGD trainer over the head's parameters.
    trainer = gluon.Trainer(finetune_net.collect_params(), 'sgd', {
        'learning_rate': lr, 'momentum': momentum, 'wd': wd})

    metric = mx.metric.Accuracy()
    L = gluon.loss.SoftmaxCrossEntropyLoss()  # loss function
    lr_counter = 0
    num_batch = len(train_data)  # number of mini-batches per epoch

    # Start Training
    for epoch in range(epochs):
        # Learning-rate decay: shrink by lr_factor at each scheduled epoch.
        # FIX: bounds-guard lr_counter — the original `epoch == lr_steps[lr_counter]`
        # raises IndexError on the first epoch after the last scheduled step
        # (lr_counter == len(lr_steps)).
        if lr_counter < len(lr_steps) and epoch == lr_steps[lr_counter]:
            trainer.set_learning_rate(trainer.learning_rate * lr_factor)
            lr_counter += 1

        tic = time.time()
        train_loss = 0
        metric.reset()
        AP = 0.
        AP_cnt = 0

        # Train on batch_size samples at a time, split across devices.
        for i, batch in enumerate(train_data):
            data = gluon.utils.split_and_load(
                batch[0], ctx_list=ctx, batch_axis=0, even_split=False)
            label = gluon.utils.split_and_load(
                batch[1], ctx_list=ctx, batch_axis=0, even_split=False)
            with ag.record():
                outputs = [finetune_net(X) for X in data]  # forward pass per device slice
                loss = [L(yhat, y) for yhat, y in zip(outputs, label)]  # per-slice loss
            for l in loss:
                l.backward()  # accumulate gradients

            trainer.step(batch_size)  # one optimizer step over batch_size samples
            train_loss += sum([l.mean().asscalar() for l in loss]) / len(loss)

            metric.update(label, outputs)
            ap, cnt = calculate_ap(label, outputs)
            AP += ap
            AP_cnt += cnt
            progressbar(i, num_batch - 1)  # training progress bar

        train_map = AP / AP_cnt
        _, train_acc = metric.get()
        train_loss /= num_batch

        # Validation accuracy / mAP / loss on the held-out split.
        val_acc, val_map, val_loss = validate(finetune_net, val_data, ctx)

        logging.info('[Epoch %d] Train-acc: %.3f, mAP: %.3f, loss: %.3f | Val-acc: %.3f, mAP: %.3f, loss: %.3f | time: %.1f' %
                     (epoch, train_acc, train_map, train_loss, val_acc, val_map, val_loss, time.time() - tic))
        logging.info('\n')

    return (finetune_net)