def train_main():
    """Train `net`, checkpoint the best-validation epoch, then fine-tune.

    Relies on module-level state: `net`, `params_path`, `start_epoch`,
    `epochs`, `fine_tune_epochs`, `learning_rate`, `DEVICE`, the data
    loaders, `_max`/`_min`, `compute_val_loss` and `predict_main`.

    Phase 1: teacher-forced training for `epochs` epochs with L1 loss.
    Phase 2: autoregressive fine-tuning for `fine_tune_epochs` more epochs
    at 1/10 of the learning rate.  The best checkpoint (lowest validation
    loss) is finally evaluated on the test set.

    Raises:
        SystemExit: if `start_epoch > 0` but `params_path` does not exist.
    """
    if (start_epoch == 0) and (not os.path.exists(params_path)):
        # training from scratch: create a fresh params directory
        os.makedirs(params_path)
        print('create params directory %s' % (params_path), flush=True)
    elif (start_epoch == 0) and (os.path.exists(params_path)):
        # training from scratch but the directory exists: wipe it first
        shutil.rmtree(params_path)
        os.makedirs(params_path)
        print('delete the old one and create params directory %s' % (params_path), flush=True)
    elif (start_epoch > 0) and (os.path.exists(params_path)):
        # resuming mid-training: the original directory must already exist
        print('train from params directory %s' % (params_path), flush=True)
    else:
        raise SystemExit('Wrong type of model!')

    criterion = nn.L1Loss().to(DEVICE)  # define the loss function (MAE)
    optimizer = optim.Adam(net.parameters(), lr=learning_rate)  # optimizer over all network parameters
    sw = SummaryWriter(logdir=params_path, flush_secs=5)

    # log every parameter tensor and the total parameter count
    total_param = 0
    print('Net\'s state_dict:', flush=True)
    for param_tensor in net.state_dict():
        print(param_tensor, '\t', net.state_dict()[param_tensor].size(), flush=True)
        total_param += np.prod(net.state_dict()[param_tensor].size())
    print('Net\'s total params:', total_param, flush=True)

    print('Optimizer\'s state_dict:')
    for var_name in optimizer.state_dict():
        print(var_name, '\t', optimizer.state_dict()[var_name], flush=True)

    global_step = 0        # shared across both training phases for TensorBoard
    best_epoch = 0
    best_val_loss = np.inf

    # train model
    if start_epoch > 0:
        # resume: load the checkpoint saved for `start_epoch`
        params_filename = os.path.join(params_path, 'epoch_%s.params' % start_epoch)
        net.load_state_dict(torch.load(params_filename))
        print('start epoch:', start_epoch, flush=True)
        print('load weight from: ', params_filename, flush=True)

    start_time = time()

    for epoch in range(start_epoch, epochs):
        params_filename = os.path.join(params_path, 'epoch_%s.params' % epoch)

        # apply model on the validation data set
        # NOTE(review): validation (and the possible checkpoint save) happens
        # BEFORE this epoch's training pass -- the saved weights are the state
        # entering the epoch, not leaving it.  Preserved as-is.
        val_loss = compute_val_loss(net, val_loader, criterion, sw, epoch)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_epoch = epoch
            torch.save(net.state_dict(), params_filename)
            print('save parameters to file: %s' % params_filename, flush=True)

        net.train()  # ensure dropout layers are in train mode
        train_start_time = time()
        for batch_index, batch_data in enumerate(train_loader):
            encoder_inputs, decoder_inputs, labels = batch_data
            encoder_inputs = encoder_inputs.transpose(-1, -2)  # (B, N, T, F)
            decoder_inputs = decoder_inputs.unsqueeze(-1)  # (B, N, T, 1)
            labels = labels.unsqueeze(-1)
            optimizer.zero_grad()
            # teacher forcing: full decoder input sequence is fed at once
            outputs = net(encoder_inputs, decoder_inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            training_loss = loss.item()
            global_step += 1
            sw.add_scalar('training_loss', training_loss, global_step)

        print('epoch: %s, train time every whole data:%.2fs' % (epoch, time() - train_start_time), flush=True)
        print('epoch: %s, total time:%.2fs' % (epoch, time() - start_time), flush=True)

    print('best epoch:', best_epoch, flush=True)
    print('apply the best val model on the test data set ...', flush=True)
    predict_main(best_epoch, test_loader, test_target_tensor, _max, _min, 'test')

    # fine tune the model: fresh Adam at a 10x smaller learning rate,
    # switching from teacher forcing to step-by-step (autoregressive) decoding
    optimizer = optim.Adam(net.parameters(), lr=learning_rate * 0.1)

    print('fine tune the model ... ', flush=True)
    for epoch in range(epochs, epochs + fine_tune_epochs):
        params_filename = os.path.join(params_path, 'epoch_%s.params' % epoch)
        net.train()  # ensure dropout layers are in train mode
        train_start_time = time()
        for batch_index, batch_data in enumerate(train_loader):
            encoder_inputs, decoder_inputs, labels = batch_data
            encoder_inputs = encoder_inputs.transpose(-1, -2)  # (B, N, T, F)
            decoder_inputs = decoder_inputs.unsqueeze(-1)  # (B, N, T, 1)
            labels = labels.unsqueeze(-1)
            predict_length = labels.shape[2]  # T
            optimizer.zero_grad()
            encoder_output = net.encode(encoder_inputs)

            # decode autoregressively: start from the first decoder step only,
            # then repeatedly feed [start, predictions-so-far] back in.
            # NOTE: `decoder_inputs` is deliberately rebound inside this loop,
            # discarding the teacher-forcing inputs for this batch.
            decoder_start_inputs = decoder_inputs[:, :, :1, :]
            decoder_input_list = [decoder_start_inputs]
            for step in range(predict_length):
                decoder_inputs = torch.cat(decoder_input_list, dim=2)
                predict_output = net.decode(decoder_inputs, encoder_output)
                decoder_input_list = [decoder_start_inputs, predict_output]

            # loss on the final full-length prediction
            loss = criterion(predict_output, labels)
            loss.backward()
            optimizer.step()
            training_loss = loss.item()
            global_step += 1  # continues the counter from phase 1
            sw.add_scalar('training_loss', training_loss, global_step)

        print('epoch: %s, train time every whole data:%.2fs' % (epoch, time() - train_start_time), flush=True)
        print('epoch: %s, total time:%.2fs' % (epoch, time() - start_time), flush=True)

        # apply model on the validation data set (after training, this phase)
        val_loss = compute_val_loss(net, val_loader, criterion, sw, epoch)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_epoch = epoch
            torch.save(net.state_dict(), params_filename)
            print('save parameters to file: %s' % params_filename, flush=True)

    print('best epoch:', best_epoch, flush=True)
    print('apply the best val model on the test data set ...', flush=True)
    predict_main(best_epoch, test_loader, test_target_tensor, _max, _min, 'test')
# --- script fragment: MXNet/Gluon training driver ---
# NOTE(review): this chunk is truncated mid training loop -- whatever must
# follow `l = loss_function(...)` inside `autograd.record()` (backward,
# trainer.step, logging) lies outside this view.  Names such as `model`,
# `num_for_predict`, `all_backbones`, `ctx`, `MyInit`, `optimizer`,
# `learning_rate`, `params_path` and the loaders are defined elsewhere.
net = model(num_for_predict, all_backbones)
net.initialize(ctx = ctx)
# One forward pass on a single validation batch, then re-initialize with the
# custom initializer -- presumably to let parameter shapes be inferred before
# MyInit runs (deferred initialization); TODO confirm against MyInit.
for val_w, val_d, val_r, val_t in val_loader:
    net([val_w, val_d, val_r])
    break
net.initialize(ctx = ctx, init = MyInit(), force_reinit = True)

# initialize a trainer to train model
trainer = gluon.Trainer(net.collect_params(), optimizer,
                        {'learning_rate': learning_rate})

# initialize a SummaryWriter to write information into logs dir
sw = SummaryWriter(logdir = params_path, flush_secs = 5)

# compute validation loss before training
compute_val_loss(net, val_loader, loss_function, sw, 0)

# compute testing set MAE, RMSE, MAPE before training
evaluate(net, test_loader, true_value, num_of_vertices, sw, 0)

# train model
global_step = 1
for epoch in range(1, epochs + 1):
    for train_w, train_d, train_r, train_t in train_loader:
        start_time = time()
        with autograd.record():  # record the forward pass for autograd
            output = net([train_w, train_d, train_r])
            l = loss_function(output, train_t)
            # (fragment ends here -- backward/step are outside this view)
# NOTE(review): chunk truncated at both ends -- the first code line below
# closes a model-constructor call whose opening is above this view, and the
# final loop body continues past this view (forward/backward/step missing).
        num_nodes=num_nodes, week=24, day=12, recent=24, K=3, Kt=3)
net.to(device)  # to cuda
optimizer = optim.Adam(net.parameters(), lr=learning_rate, weight_decay=wdecay)
# exponential per-epoch learning-rate decay with factor `decay`
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, decay)
# alternative schedule kept for reference:
#scheduler = optim.lr_scheduler.MultiStepLR(optimizer, [20,30], gamma=0.7, last_epoch=-1)

# calculate origin loss in epoch 0 (baseline before any training)
compute_val_loss(net, val_loader, loss_function, supports, device, epoch=0)

# compute testing set MAE, RMSE, MAPE before training
evaluate(net, test_loader, true_value, supports, device, epoch=0)

clip = 5         # NOTE(review): presumably a gradient-clipping bound used later -- outside this view
his_loss = []    # loss history, filled past this view
train_time = []  # per-epoch timing, filled past this view
for epoch in range(1, epochs + 1):
    train_l = []
    start_time_train = time()
    for train_w, train_d, train_r, train_t in train_loader:
        # move week/day/recent inputs and the target onto the training device
        train_w = train_w.to(device)
        train_d = train_d.to(device)
        train_r = train_r.to(device)
        train_t = train_t.to(device)
        # (fragment ends here -- rest of the batch loop is outside this view)
# NOTE(review): chunk truncated at both ends -- the first lines continue a
# `for ... in val_loader` loop whose header is above this view, and the
# epoch loop's body continues past this view.
        val_r = val_r.as_in_context(ctx)
        val_t = val_t.as_in_context(ctx)
        # single dry-run forward pass on one batch, then leave the loop
        net([val_w, val_d, val_r])
        break
# force re-initialization with the custom initializer -- presumably after
# the dry run has fixed parameter shapes; TODO confirm against MyInit
net.initialize(ctx=ctx, init=MyInit(), force_reinit=True)

# initialize a trainer to train model
trainer = gluon.Trainer(net.collect_params(), optimizer,
                        {'learning_rate': learning_rate})

# initialize a SummaryWriter to write information into logs dir
sw = SummaryWriter(logdir=params_path, flush_secs=5)

# compute validation loss before training
compute_val_loss(net, val_loader, loss_function, sw, epoch=0, ctx=ctx)

# compute testing set MAE, RMSE, MAPE before training
evaluate(net, test_loader, true_value, num_of_vertices, sw, epoch=0, ctx=ctx)

# train model
global_step = 1
for epoch in range(1, epochs + 1):
    for train_w, train_d, train_r, train_t in train_loader:
        # running on single gpu
        # (fragment ends here -- batch-loop body is outside this view)
time: {batch_end_time - batch_start_time:.2f}'
            )
            # NOTE(review): chunk truncated -- the string above completes a
            # backslash-continued f-string print opened outside this view;
            # the surrounding `if`/loop headers (`epoch`, `i`, `group_num`,
            # the running_loss_* accumulators) are also outside this view.
            # Indentation here is reconstructed -- verify against the file.
            print(
                f's:[{epoch:d}, {i + 1:5d}] loss: {running_loss_s / group_num:.2f}, \
time: {batch_end_time - batch_start_time:.2f}'
            )
            print(
                '--------------------------------------------------------------------'
            )
            # reset the per-group running-loss accumulators
            running_loss = 0.0
            running_loss_f = 0.0
            running_loss_o = 0.0
            running_loss_s = 0.0
            batch_start_time = batch_end_time
    epoch_end_time = time.perf_counter()
    print(f'Epoch cost {epoch_end_time - epoch_start_time:.2f} seconds')

    # probably not need to run this after every epoch
    with torch.no_grad():
        # compute validation loss
        compute_val_loss(net, val_loader, loss_function, None, epoch, device,
                         all_data['stats']['stats'])
        # testing
        evaluate(net, test_loader, true_value, num_of_vertices, None, epoch,
                 device, all_data['stats']['stats'])
end_time = time.perf_counter()
print(f'Total running time is {end_time - start_time:.2f} seconds.')
# NOTE(review): chunk truncated -- this continues the batch loop of a
# training epoch; the loop headers defining `epoch` and `i` (presumably from
# `enumerate(train_loader)` -- confirm) are above this view, as are
# `group_num`, `batch_start_time` and the initial `running_loss`.
        train_t = train_t.to(device)
        outputs = net([train_w, train_d, train_r])
        loss = loss_function(outputs, train_t)  # loss is a tensor on the same device as outputs and train_t
        loss.backward()
        optimizer.step()
        running_loss += loss.item()  # running_loss is a float; loss.item() is a Python float on CPU
        # every `group_num` batches, report mean loss and elapsed wall time
        if i % group_num == group_num - 1:
            batch_end_time = time.perf_counter()
            print(f'[{epoch:d}, {i + 1:5d}] loss: {running_loss / group_num:.2f}, \
time: {batch_end_time - batch_start_time:.2f}')
            running_loss = 0.0
            batch_start_time = batch_end_time
    epoch_end_time = time.perf_counter()
    print(f'Epoch cost {epoch_end_time - epoch_start_time:.2f} seconds')

    # probably not need to run this after every epoch
    with torch.no_grad():
        # compute validation loss
        compute_val_loss(net, val_loader, loss_function, None, epoch, device)
        # testing
        evaluate(net, test_loader, true_value, num_of_vertices, None, epoch, device)
end_time = time.perf_counter()
print(f'Total running time is {end_time - start_time:.2f} seconds.')