def train(model, train_loader, valid_loader, config):
    model.cuda()
    optimizer = optim.Adam(model.parameters(), lr=config.train_lr,
                           weight_decay=config.weight_decay)

    checkpoint_path = os.path.join(config.log_path, 'checkpoint.pth')
    resume = os.path.isfile(checkpoint_path)
    if resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        checkpoint = torch.load(checkpoint_path)
        best_acc = checkpoint['best_acc']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger_train = Logger(os.path.join(config.log_path, 'log_train.txt'), resume=True)
        logger_valid = Logger(os.path.join(config.log_path, 'log_valid.txt'), resume=True)
    else:
        best_acc = -1
        start_epoch = 0
        logger_train = Logger(os.path.join(config.log_path, 'log_train.txt'))
        logger_train.set_names(['Learning Rate', 'Essential Loss', 'Classfi Loss', 'Detector loss'])
        logger_valid = Logger(os.path.join(config.log_path, 'log_valid.txt'))
        logger_valid.set_names(['Valid Acc', 'Essential Loss', 'Clasfi Loss'])

    train_loader_iter = iter(train_loader)
    for step in trange(start_epoch, config.train_iter, ncols=config.tqdm_width):
        try:
            train_data = next(train_loader_iter)
        except StopIteration:
            train_loader_iter = iter(train_loader)
            train_data = next(train_loader_iter)
        train_data = tocuda(train_data)

        # run training
        cur_lr = adjust_learning_rate(optimizer, step, config)
        loss_val = train_step(step, optimizer, model, train_data, config)
        logger_train.append([cur_lr] + list(loss_val))

        # Check if we want to write validation
        b_save = ((step + 1) % config.save_intv) == 0
        b_validate = ((step + 1) % config.val_intv) == 0
        if b_validate:
            va_res, loss1, loss2 = valid(valid_loader, step, config)
            logger_valid.append([va_res, loss1, loss2])
            if va_res > best_acc:
                print("Saving best model with va_res = {}".format(va_res))
                best_acc = va_res
                torch.save({
                    'epoch': step + 1,
                    'state_dict': model.state_dict(),
                    'best_acc': best_acc,
                    'optimizer': optimizer.state_dict(),
                }, os.path.join(config.log_path, 'model_best.pth'))

        if b_save:
            torch.save({
                'epoch': step + 1,
                'state_dict': model.state_dict(),
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
            }, checkpoint_path)
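# adjust_learning_rate() is called above but not defined in this listing.
# A minimal sketch, assuming a simple step decay; config.lr_decay_step and
# config.lr_decay_rate are hypothetical placeholders, not fields from the original code.
def adjust_learning_rate(optimizer, step, config):
    lr = config.train_lr * (config.lr_decay_rate ** (step // config.lr_decay_step))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr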
def train(model, train_loader, valid_loader, config):
    model.cuda()
    optimizer = optim.Adam(model.parameters(), lr=config.train_lr,
                           weight_decay=config.weight_decay)
    match_loss = MatchLoss(config)

    checkpoint_path = os.path.join(config.log_path, 'checkpoint.pth')
    config.resume = os.path.isfile(checkpoint_path)
    if config.resume:
        print('==> Resuming from checkpoint..')
        checkpoint = torch.load(checkpoint_path)
        best_acc = checkpoint['best_acc']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger_train = Logger(os.path.join(config.log_path, 'log_train.txt'), title='oan', resume=True)
        logger_valid = Logger(os.path.join(config.log_path, 'log_valid.txt'), title='oan', resume=True)
    else:
        best_acc = -1
        start_epoch = 0
        logger_train = Logger(os.path.join(config.log_path, 'log_train.txt'), title='oan')
        logger_train.set_names(['Learning Rate'] + ['Geo Loss', 'Classfi Loss', 'L2 Loss'] * (config.iter_num + 1))
        logger_valid = Logger(os.path.join(config.log_path, 'log_valid.txt'), title='oan')
        logger_valid.set_names(['Valid Acc'] + ['Geo Loss', 'Clasfi Loss', 'L2 Loss'])

    train_loader_iter = iter(train_loader)
    for step in trange(start_epoch, config.train_iter, ncols=config.tqdm_width):
        try:
            train_data = next(train_loader_iter)
        except StopIteration:
            train_loader_iter = iter(train_loader)
            train_data = next(train_loader_iter)
        train_data = tocuda(train_data)

        # run training
        cur_lr = optimizer.param_groups[0]['lr']
        loss_vals = train_step(step, optimizer, model, match_loss, train_data)
        logger_train.append([cur_lr] + loss_vals)

        # Check if we want to write validation
        b_save = ((step + 1) % config.save_intv) == 0
        b_validate = ((step + 1) % config.val_intv) == 0
        if b_validate:
            va_res, geo_loss, cla_loss, l2_loss, _, _, _ = valid(valid_loader, model, step, config)
            logger_valid.append([va_res, geo_loss, cla_loss, l2_loss])
            if va_res > best_acc:
                print("Saving best model with va_res = {}".format(va_res))
                best_acc = va_res
                torch.save({
                    'epoch': step + 1,
                    'state_dict': model.state_dict(),
                    'best_acc': best_acc,
                    'optimizer': optimizer.state_dict(),
                }, os.path.join(config.log_path, 'model_best.pth'))

        if b_save:
            torch.save({
                'epoch': step + 1,
                'state_dict': model.state_dict(),
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
            }, checkpoint_path)
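# Both variants above also assume a tocuda() helper. A minimal sketch, assuming the
# batch is a dict whose tensor values should be moved to the GPU and whose other
# entries are left untouched.
import torch

def tocuda(data):
    for key, val in data.items():
        if isinstance(val, torch.Tensor):
            data[key] = val.cuda()
    return data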
def mutation(m, mutationP, mutationFunction, mutationConfig, validate):

    def trivalMutation(m, mutationP, validate):
        # Flip each wall cell independently with probability mutationP.
        mutationMatrix = np.random.rand(m.wall.shape[0], m.wall.shape[1]) < mutationP
        return m.wall ^ mutationMatrix

    if isinstance(m, frame.maze):
        newMazeList = []
        count = 0
        if m.teleported:
            newMazeList.append(m.teleported)
            count = count + 1
        # NOTE: self.size is undefined in this standalone function; the error message
        # below suggests the code was lifted from localSearch.neighbor.__call__(),
        # where self.size is the number of neighbours to generate.
        while count < self.size:
            if mutationFunction is None:
                wall = trivalMutation(m, mutationP, validate)
            else:
                # TODO: other mutationFunction
                pass
            newMaze = frame.maze(m.rows, m.cols, m.p, m.rootNum)
            newMaze.build(initFunction=frame.setWall, initConfig={'wall': wall})
            if validate and not test.valid(newMaze):
                continue
            newMazeList.append(newMaze)
            count = count + 1
        return newMazeList
    else:
        print('E: localSearch.neighbor.__call__(), not a maze input')
        exit()
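# A quick, self-contained illustration of the XOR wall mutation used by
# trivalMutation above: each cell of a boolean grid flips independently with
# probability mutationP (here 0.1 on a toy 4x4 grid).
import numpy as np

wall = np.random.rand(4, 4) < 0.5        # toy boolean wall grid
flip_mask = np.random.rand(4, 4) < 0.1   # mutationP = 0.1
mutated_wall = wall ^ flip_mask          # XOR flips exactly the masked cells
print((wall != mutated_wall).sum(), 'cells flipped')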
def main(args):
    cuda = True
    cudnn.benchmark = True
    # data_root = '/home/weiyuhua/Challenge2020/Data/DG'
    data_root = '/home/yin/code/weiyuhua/Challenge2020/Data/DG'
    model_root = args.model_root
    logs = args.logs
    lr = args.lr
    batch_size = args.batch_size
    n_epoch = args.n_epoch
    unseen_index = args.unseen_index
    val_split = args.val_split

    manual_seed = random.randint(1, 10000)
    random.seed(manual_seed)
    torch.manual_seed(manual_seed)

    tb_dir = os.path.join(logs, 'tb_dir')
    if not os.path.exists(logs):
        os.makedirs(logs)
    if not os.path.exists(model_root):
        os.makedirs(model_root)
    if not os.path.exists(tb_dir):
        os.makedirs(tb_dir)

    # Tensorboard
    train_writer = SummaryWriter(tb_dir + '/train')
    val_writer = SummaryWriter(tb_dir + '/valid')
    test_writer = SummaryWriter(tb_dir + '/test')

    # get train, val and test datasets
    D = GetDataset(data_root, unseen_index, val_split)
    train_datasets, val_datasets, test_dataset = D.get_datasets()

    # get dataloaders
    train_dataloaders = []
    for train_dataset in train_datasets:
        train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=8)
        train_dataloaders.append(train_dataloader)

    val_dataloaders = []
    for val_dataset in val_datasets:
        val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False, num_workers=8)
        val_dataloaders.append(val_dataloader)

    test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=8)

    # load model
    my_net = CNNModel()

    # setup optimizer
    optimizer = optim.Adam(my_net.parameters(), lr=lr)

    loss_class = torch.nn.NLLLoss()
    loss_domain = torch.nn.NLLLoss()

    if cuda:
        my_net = my_net.cuda()
        loss_class = loss_class.cuda()
        loss_domain = loss_domain.cuda()

    for p in my_net.parameters():
        p.requires_grad = True

    # training
    best_accu_val = 0.0
    for epoch in range(n_epoch):
        len_dataloader = np.min(
            np.array([len(train_dataloaders[i]) for i in range(len(train_dataloaders))]))
        data_train_iters = []
        for train_dataloader in train_dataloaders:
            data_train_iter = iter(train_dataloader)
            data_train_iters.append(data_train_iter)

        for i in range(len_dataloader):
            p = float(i + epoch * len_dataloader) / n_epoch / len_dataloader
            alpha = 2. / (1. + np.exp(-10 * p)) - 1

            err_label_s = []
            err_domain_s = []
            # err_label_all = torch.tensor(0.0)
            # err_domain_all = torch.tensor(0.0)
            err_label_all = 0
            err_domain_all = 0

            # training model using multi-source data
            for j, data_train_iter in enumerate(data_train_iters):
                data_train = next(data_train_iter)  # Python 3 iterator protocol (was .next())
                s_ecg, s_label = data_train

                my_net.zero_grad()
                batch_size = len(s_label)

                domain_label = (torch.ones(batch_size) * j).long()

                if cuda:
                    s_ecg = s_ecg.cuda()
                    s_label = s_label.cuda()
                    domain_label = domain_label.cuda()

                class_output, domain_output = my_net(input_data=s_ecg, alpha=alpha)
                err_label = loss_class(class_output, s_label)
                err_domain = loss_domain(domain_output, domain_label)

                err_label_s.append(err_label.data.cpu().numpy())
                err_domain_s.append(err_domain.data.cpu().numpy())

                err_label_all += err_label
                err_domain_all += err_domain

            # err = err_domain_all + err_label_all
            err = err_label_all
            err.backward()
            optimizer.step()

            print('\n')
            for j in range(len(train_dataloaders)):
                print('\r epoch: %d, [iter: %d / all %d], domain: %d, err_label: %f, err_domain: %f'
                      % (epoch, i + 1, len_dataloader, j + 1, err_label_s[j], err_domain_s[j]))
                # tb training
                train_writer.add_scalar('err_label_%d' % (j), err_label_s[j])
                train_writer.add_scalar('err_domain_%d' % (j), err_domain_s[j])

            torch.save(my_net, '{0}/model_epoch_current.pth'.format(model_root))

        print('\n')

        ## validation
        val_accus, best_accu_val, val_err_label_s, val_err_domain_s = valid(
            val_dataloaders, model_root, best_accu_val)
        for i in range(len(val_dataloaders)):
            print('\r epoch: %d, Validation, domain: %d, accu: %f' % (epoch, i + 1, val_accus[i]))
            # tb validation
            val_writer.add_scalar('err_label_%d' % (i), val_err_label_s[i])
            val_writer.add_scalar('err_domain_%d' % (i), val_err_domain_s[i])
            val_writer.add_scalar('accu_%d' % (i), val_accus[i])

        ## test
        test_accu, test_err_label = test(test_dataloader, model_root, model_best=False)
        test_writer.add_scalar('accu', test_accu)
        test_writer.add_scalar('err_label', test_err_label)

    result_path = os.path.join(logs, 'results.txt')
    print('============ Summary ============= \n')
    for i, train_dataloader in enumerate(train_dataloaders):
        train_accu, train_err_label = test(train_dataloader, model_root)
        write_log(
            'Accuracy of the train dataset %d : %f err_label : %f'
            % (i + 1, train_accu, train_err_label), result_path)
    for i, val_dataloader in enumerate(val_dataloaders):
        val_accu, val_err_label = test(val_dataloader, model_root)
        write_log(
            'Accuracy of the val dataset %d : %f err_label : %f'
            % (i + 1, val_accu, val_err_label), result_path)
    test_accu, test_err_label = test(test_dataloader, model_root)
    write_log(
        'Accuracy of the test dataset %d : %f err_label : %f'
        % (i + 1, test_accu, test_err_label), result_path)
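# The alpha = 2 / (1 + exp(-10 * p)) - 1 schedule and the my_net(input_data, alpha)
# call follow the DANN recipe, which passes features through a gradient reversal
# layer before the domain classifier. A minimal sketch of such a layer (an
# illustration, not necessarily how CNNModel implements it): identity in the
# forward pass, negated and alpha-scaled gradient in the backward pass.
from torch.autograd import Function

class ReverseLayerF(Function):
    @staticmethod
    def forward(ctx, x, alpha):
        ctx.alpha = alpha
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output.neg() * ctx.alpha, None

# Usage inside a forward pass: reversed_feat = ReverseLayerF.apply(feat, alpha)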
def main():
    """Create the model and start the training."""
    cycle_n = 0
    start_epoch = args.start_epoch
    writer = SummaryWriter(osp.join(args.snapshot_dir, TIMESTAMP))

    if not os.path.exists(args.snapshot_dir):
        os.makedirs(args.snapshot_dir)

    h, w = map(int, args.input_size.split(','))
    input_size = [h, w]
    best_f1 = 0

    torch.cuda.set_device(args.local_rank)
    try:
        world_size = int(os.environ['WORLD_SIZE'])
        distributed = world_size > 1
    except:
        distributed = False
        world_size = 1
    if distributed:
        dist.init_process_group(backend=args.dist_backend, init_method='env://')

    rank = 0 if not distributed else dist.get_rank()

    log_file = args.snapshot_dir + '/' + TIMESTAMP + 'output.log'
    logger = get_root_logger(log_file=log_file, log_level='INFO')
    logger.info(f'Distributed training: {distributed}')

    cudnn.enabled = True
    cudnn.benchmark = True
    torch.backends.cudnn.deterministic = False
    torch.backends.cudnn.enabled = True

    if distributed:
        model = dml_csr.DML_CSR(args.num_classes)
        schp_model = dml_csr.DML_CSR(args.num_classes)
    else:
        model = dml_csr.DML_CSR(args.num_classes, InPlaceABN)
        schp_model = dml_csr.DML_CSR(args.num_classes, InPlaceABN)

    if args.restore_from is not None:
        print('Resume training from {}'.format(args.restore_from))
        model.load_state_dict(torch.load(args.restore_from), True)
        start_epoch = int(float(args.restore_from.split('.')[0].split('_')[-1])) + 1
    else:
        resnet_params = torch.load(RESTORE_FROM)
        new_params = model.state_dict().copy()
        for i in resnet_params:
            i_parts = i.split('.')
            if not i_parts[0] == 'fc':
                new_params['.'.join(i_parts[0:])] = resnet_params[i]
        model.load_state_dict(new_params)
    model.cuda()

    args.schp_restore = osp.join(args.snapshot_dir, TIMESTAMP, 'best.pth')
    if os.path.exists(args.schp_restore):
        print('Resume schp checkpoint from {}'.format(args.schp_restore))
        schp_model.load_state_dict(torch.load(args.schp_restore), True)
    else:
        schp_resnet_params = torch.load(RESTORE_FROM)
        schp_new_params = schp_model.state_dict().copy()
        for i in schp_resnet_params:
            i_parts = i.split('.')
            if not i_parts[0] == 'fc':
                schp_new_params['.'.join(i_parts[0:])] = schp_resnet_params[i]
        schp_model.load_state_dict(schp_new_params)
    schp_model.cuda()

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)
        schp_model = torch.nn.parallel.DistributedDataParallel(
            schp_model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)
    else:
        model = SingleGPU(model)
        schp_model = SingleGPU(schp_model)

    criterion = Criterion(loss_weight=[1, 1, 1, 4, 1],
                          lambda_1=args.lambda_s,
                          lambda_2=args.lambda_e,
                          lambda_3=args.lambda_c,
                          num_classes=args.num_classes)
    criterion.cuda()

    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    transform = transforms.Compose([transforms.ToTensor(), normalize])

    train_dataset = FaceDataSet(args.data_dir, args.train_dataset,
                                crop_size=input_size, transform=transform)
    if distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None
    trainloader = data.DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=2,
                                  pin_memory=True,
                                  drop_last=True,
                                  sampler=train_sampler)

    val_dataset = datasets[str(args.model_type)](args.data_dir, args.valid_dataset,
                                                 crop_size=input_size, transform=transform)
    num_samples = len(val_dataset)
    valloader = data.DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                shuffle=False,
                                pin_memory=True,
                                drop_last=False)

    # Optimizer initialization
    optimizer = optim.SGD(model.parameters(),
                          lr=args.learning_rate,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    lr_scheduler = SGDRScheduler(optimizer,
                                 total_epoch=args.epochs,
                                 eta_min=args.learning_rate / 100,
                                 warmup_epoch=10,
                                 start_cyclical=args.schp_start,
                                 cyclical_base_lr=args.learning_rate / 2,
                                 cyclical_epoch=args.cycle_epochs)
    optimizer.zero_grad()

    total_iters = args.epochs * len(trainloader)
    start = timeit.default_timer()
    for epoch in range(start_epoch, args.epochs):
        model.train()
        if distributed:
            train_sampler.set_epoch(epoch)

        for i_iter, batch in enumerate(trainloader):
            i_iter += len(trainloader) * epoch

            if epoch < args.schp_start:
                lr = adjust_learning_rate(optimizer, i_iter, total_iters)
            else:
                lr = lr_scheduler.get_lr()[0]

            images, labels, edges, semantic_edges, _ = batch
            labels = labels.long().cuda(non_blocking=True)
            edges = edges.long().cuda(non_blocking=True)
            semantic_edges = semantic_edges.long().cuda(non_blocking=True)

            preds = model(images)
            if cycle_n >= 1:
                with torch.no_grad():
                    soft_preds, soft_edges, soft_semantic_edges = schp_model(images)
            else:
                soft_preds = None
                soft_edges = None
                soft_semantic_edges = None

            loss = criterion(preds, [labels, edges, semantic_edges,
                                     soft_preds, soft_edges, soft_semantic_edges], cycle_n)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()

            with torch.no_grad():
                loss = loss.detach() * labels.shape[0]
                count = labels.new_tensor([labels.shape[0]], dtype=torch.long)
                if dist.is_initialized():
                    dist.all_reduce(count, dist.ReduceOp.SUM)
                    dist.all_reduce(loss, dist.ReduceOp.SUM)
                loss /= count.item()

            if not dist.is_initialized() or dist.get_rank() == 0:
                if i_iter % 50 == 0:
                    writer.add_scalar('learning_rate', lr, i_iter)
                    writer.add_scalar('loss', loss.data.cpu().numpy(), i_iter)
                if i_iter % 500 == 0:
                    images_inv = inv_preprocess(images, args.save_num_images)
                    labels_colors = decode_parsing(labels, args.save_num_images,
                                                   args.num_classes, is_pred=False)
                    edges_colors = decode_parsing(edges, args.save_num_images, 2, is_pred=False)
                    semantic_edges_colors = decode_parsing(semantic_edges, args.save_num_images,
                                                           args.num_classes, is_pred=False)

                    if isinstance(preds, list):
                        preds = preds[0]
                    preds_colors = decode_parsing(preds[0], args.save_num_images,
                                                  args.num_classes, is_pred=True)
                    pred_edges = decode_parsing(preds[1], args.save_num_images, 2, is_pred=True)
                    pred_semantic_edges_colors = decode_parsing(preds[2], args.save_num_images,
                                                                args.num_classes, is_pred=True)

                    img = vutils.make_grid(images_inv, normalize=False, scale_each=True)
                    lab = vutils.make_grid(labels_colors, normalize=False, scale_each=True)
                    pred = vutils.make_grid(preds_colors, normalize=False, scale_each=True)
                    edge = vutils.make_grid(edges_colors, normalize=False, scale_each=True)
                    pred_edge = vutils.make_grid(pred_edges, normalize=False, scale_each=True)
                    pred_semantic_edges = vutils.make_grid(pred_semantic_edges_colors,
                                                           normalize=False, scale_each=True)

                    writer.add_image('Images/', img, i_iter)
                    writer.add_image('Labels/', lab, i_iter)
                    writer.add_image('Preds/', pred, i_iter)
                    writer.add_image('Edge/', edge, i_iter)
                    writer.add_image('Pred_edge/', pred_edge, i_iter)

                cur_loss = loss.data.cpu().numpy()
                logger.info(
                    f'iter = {i_iter} of {total_iters} completed, loss = {cur_loss}, lr = {lr}')

        if (epoch + 1) % (args.eval_epochs) == 0:
            parsing_preds, scales, centers = valid(model, valloader, input_size, num_samples)
            mIoU, f1 = compute_mean_ioU(parsing_preds, scales, centers, args.num_classes,
                                        args.data_dir, input_size, args.valid_dataset, True)
            if not dist.is_initialized() or dist.get_rank() == 0:
                torch.save(model.module.state_dict(),
                           osp.join(args.snapshot_dir, TIMESTAMP,
                                    'checkpoint_{}.pth'.format(epoch + 1)))
                if 'Helen' in args.data_dir:
                    if f1['overall'] > best_f1:
                        torch.save(model.module.state_dict(),
                                   osp.join(args.snapshot_dir, TIMESTAMP, 'best.pth'))
                        best_f1 = f1['overall']
                else:
                    if f1['Mean_F1'] > best_f1:
                        torch.save(model.module.state_dict(),
                                   osp.join(args.snapshot_dir, TIMESTAMP, 'best.pth'))
                        best_f1 = f1['Mean_F1']
                writer.add_scalars('mIoU', mIoU, epoch)
                writer.add_scalars('f1', f1, epoch)
                logger.info(
                    f'mIoU = {mIoU}, and f1 = {f1} of epoch = {epoch}, until now, best_f1 = {best_f1}')

        if (epoch + 1) >= args.schp_start and (epoch + 1 - args.schp_start) % args.cycle_epochs == 0:
            logger.info(f'Self-correction cycle number {cycle_n}')
            schp.moving_average(schp_model, model, 1.0 / (cycle_n + 1))
            cycle_n += 1
            schp.bn_re_estimate(trainloader, schp_model)
            parsing_preds, scales, centers = valid(schp_model, valloader, input_size, num_samples)
            mIoU, f1 = compute_mean_ioU(parsing_preds, scales, centers, args.num_classes,
                                        args.data_dir, input_size, args.valid_dataset, True)
            if not dist.is_initialized() or dist.get_rank() == 0:
                torch.save(schp_model.module.state_dict(),
                           osp.join(args.snapshot_dir, TIMESTAMP,
                                    'schp_{}_checkpoint.pth'.format(cycle_n)))
                if 'Helen' in args.data_dir:
                    if f1['overall'] > best_f1:
                        torch.save(schp_model.module.state_dict(),
                                   osp.join(args.snapshot_dir, TIMESTAMP, 'best.pth'))
                        best_f1 = f1['overall']
                else:
                    if f1['Mean_F1'] > best_f1:
                        torch.save(schp_model.module.state_dict(),
                                   osp.join(args.snapshot_dir, TIMESTAMP, 'best.pth'))
                        best_f1 = f1['Mean_F1']
                writer.add_scalars('mIoU', mIoU, epoch)
                writer.add_scalars('f1', f1, epoch)
                logger.info(
                    f'mIoU = {mIoU}, and f1 = {f1} of epoch = {epoch}, until now, best_f1 = {best_f1}')

        torch.cuda.empty_cache()
        end = timeit.default_timer()
        print('epoch = {} of {} completed using {} s'.format(
            epoch, args.epochs, (end - start) / (epoch - start_epoch + 1)))

    end = timeit.default_timer()
    print(end - start, 'seconds')
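# schp.moving_average() and schp.bn_re_estimate() above are external helpers from the
# self-correction (SCHP) scheme. A minimal sketch of the parameter-averaging step only,
# assuming both models expose identically ordered parameters and buffers; this is an
# illustration, not the schp module itself.
import torch

def moving_average(avg_model, model, alpha=1.0):
    with torch.no_grad():
        for p_avg, p in zip(avg_model.parameters(), model.parameters()):
            p_avg.mul_(1.0 - alpha).add_(p, alpha=alpha)
        for b_avg, b in zip(avg_model.buffers(), model.buffers()):
            b_avg.copy_(b)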
def train(train_samples, valid_samples, word2num, lr=0.001, epoch=5, use_cuda=False):
    print('Training...')

    # Prepare training data
    print(' Preparing training data...')
    statement_word2num = word2num[0]
    subject_word2num = word2num[1]
    speaker_word2num = word2num[2]
    speaker_pos_word2num = word2num[3]
    state_word2num = word2num[4]
    party_word2num = word2num[5]
    context_word2num = word2num[6]

    train_data = train_samples
    dataset_to_variable(train_data, use_cuda)
    valid_data = valid_samples
    dataset_to_variable(valid_data, use_cuda)

    # Construct model instance
    print(' Constructing network model...')
    model = Net(len(statement_word2num),
                len(subject_word2num),
                len(speaker_word2num),
                len(speaker_pos_word2num),
                len(state_word2num),
                len(party_word2num),
                len(context_word2num))
    if use_cuda:
        model.cuda()

    # Start training
    print(' Start training')
    optimizer = optim.Adam(model.parameters(), lr=lr)
    model.train()

    step = 0
    display_interval = 2000
    for epoch_ in range(epoch):
        print(' ==> Epoch ' + str(epoch_) + ' started.')
        random.shuffle(train_data)
        total_loss = 0
        for sample in train_data:
            optimizer.zero_grad()
            prediction = model(sample)
            label = Variable(torch.LongTensor([sample.label]))
            loss = F.cross_entropy(prediction, label)
            loss.backward()
            optimizer.step()
            step += 1
            if step % display_interval == 0:
                print(' ==> Iter: ' + str(step) + ' Loss: ' + str(loss))
            total_loss += loss.data.numpy()
        print(' ==> Epoch ' + str(epoch_) + ' finished. Avg Loss: ' + str(total_loss / len(train_data)))
        valid(valid_data, word2num, model)

    return model
def train(train_samples, valid_samples, word2num, max_len_statement, max_len_subject,
          max_len_speaker_pos, max_len_context, lr=0.001, epoch=1, use_cuda=False,
          batch_size=20, batch_size_val=5, model_path='models'):
    print('Training...')

    # Prepare training data
    print(' Preparing training data...')
    statement_word2num = word2num[0]
    subject_word2num = word2num[1]
    speaker_word2num = word2num[2]
    speaker_pos_word2num = word2num[3]
    state_word2num = word2num[4]
    party_word2num = word2num[5]
    context_word2num = word2num[6]

    # train_data = train_samples
    train_data = CustomDataset(train_samples, max_len_statement, max_len_subject,
                               max_len_speaker_pos, max_len_context)
    train_loader = DataLoader(train_data, batch_size=batch_size, collate_fn=collate_fn)
    # dataset_to_variable(train_data, use_cuda)
    valid_data = valid_samples
    valid_samples = CustomDataset(valid_samples, max_len_statement, max_len_subject,
                                  max_len_speaker_pos, max_len_context)
    valid_loader = DataLoader(valid_samples, batch_size=batch_size_val, collate_fn=collate_fn)
    # dataset_to_variable(valid_data, use_cuda)

    # Construct model instance
    print(' Constructing network model...')
    model = Net(len(statement_word2num),
                len(subject_word2num),
                len(speaker_word2num),
                len(speaker_pos_word2num),
                len(state_word2num),
                len(party_word2num),
                len(context_word2num))
    if use_cuda:
        print('using cuda')
        model.cuda()

    # Start training
    print(' Start training')
    optimizer = optim.Adam(model.parameters(), lr=lr)
    lr_scheduler = ReduceLROnPlateau(optimizer=optimizer, mode='max', factor=0.5, patience=5)
    model.train()

    step = 0
    display_interval = 50
    optimal_val_acc = 0

    for epoch_ in range(epoch):
        print(' ==> Epoch ' + str(epoch_) + ' started.')
        # random.shuffle(train_data)
        total_loss = 0
        for (inputs_statement, inputs_subject, inputs_speaker, inputs_speaker_pos,
             inputs_state, inputs_party, inputs_context, target) in train_loader:
            # sample = [inputs_statement, inputs_subject, inputs_speaker, inputs_speaker_pos,
            #           inputs_state, inputs_party, inputs_context]
            optimizer.zero_grad()
            if use_cuda:
                # .cuda() is not in-place on tensors, so the results must be reassigned.
                inputs_statement = inputs_statement.cuda()
                inputs_subject = inputs_subject.cuda()
                inputs_speaker = inputs_speaker.cuda()
                inputs_speaker_pos = inputs_speaker_pos.cuda()
                inputs_state = inputs_state.cuda()
                inputs_party = inputs_party.cuda()
                inputs_context = inputs_context.cuda()
                # sample.cuda()
                target = target.cuda()
            prediction = model(inputs_statement, inputs_subject, inputs_speaker,
                               inputs_speaker_pos, inputs_state, inputs_party, inputs_context)
            # label = Variable(torch.LongTensor([sample.label]))
            # loss = F.cross_entropy(prediction, label)
            loss = F.cross_entropy(prediction, target)
            loss.backward()
            optimizer.step()
            step += 1
            if step % display_interval == 0:
                print(' ==> Iter: ' + str(step) + ' Loss: ' + str(loss))
            total_loss += loss.data.numpy() * len(inputs_statement)

        print(' ==> Epoch ' + str(epoch_) + ' finished. Avg Loss: ' + str(total_loss / len(train_data)))
        val_acc = valid(valid_loader, word2num, model, max_len_statement, max_len_subject,
                        max_len_speaker_pos, max_len_context, use_cuda)
        lr_scheduler.step(val_acc)
        for param_group in optimizer.param_groups:
            print("The current learning rate used by the optimizer is : {}".format(param_group['lr']))

        if val_acc > optimal_val_acc:
            optimal_val_acc = val_acc
            model_file = os.path.join(
                model_path, 'model_bs_{}_lr_{}_acc_{}.pth'.format(batch_size, lr, val_acc))
            old_models = [
                os.path.join(model_path, filename)
                for filename in os.listdir(model_path)
                if filename.startswith("model_bs_{}_lr_{}".format(batch_size, lr))
            ]
            for file_ in old_models:
                os.remove(file_)
            torch.save(model.state_dict(), model_file)

    return optimal_val_acc
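# Since only the state_dict is saved above, reloading the best checkpoint requires
# rebuilding Net with the same vocabulary sizes. A minimal sketch; word2num and
# model_file mirror the argument names used in train(), and Net is the class defined
# elsewhere in this project.
import torch

def load_best_model(word2num, model_file):
    model = Net(*[len(w2n) for w2n in word2num])
    model.load_state_dict(torch.load(model_file, map_location='cpu'))
    model.eval()
    return model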
def train(train_samples, valid_samples, test_samples, index, word2vec_preweight,
          vocabulary_dim, process_file, jpg_file, save_model_file, test_label_file, para_dict):
    # print('Training begin')
    # print('Prepare train data')
    train_loss_list = []
    valid_loss_list = []
    valid_acc_list = []
    best_valid_acc = 0
    best_test_acc = 0

    # train data
    train_statement_data = [x[1] for x in train_samples]
    train_statement_data = np.array(train_statement_data)
    train_statement_data = torch.from_numpy(train_statement_data).cuda()
    train_statement_len = [x[2] for x in train_samples]
    train_statement_len = np.array(train_statement_len)
    train_statement_len = torch.from_numpy(train_statement_len).int().cuda()
    # train_statement_len = train_statement_len.unsqueeze(1)
    train_meta_data = [x[3] for x in train_samples]
    train_meta_data = np.array(train_meta_data)
    train_meta_data = torch.from_numpy(train_meta_data).cuda()
    train_history_data = [x[4] for x in train_samples]
    train_history_data = np.array(train_history_data)
    train_history_data = torch.from_numpy(train_history_data)
    train_history_data = train_history_data.float().cuda()
    train_target = [x[0] for x in train_samples]
    train_target = np.array(train_target)
    train_target = torch.from_numpy(train_target).cuda()

    # valid data
    valid_statement_data = [x[1] for x in valid_samples]
    valid_statement_data = np.array(valid_statement_data)
    valid_statement_data = torch.from_numpy(valid_statement_data).cuda()
    valid_statement_len = [x[2] for x in valid_samples]
    valid_statement_len = np.array(valid_statement_len)
    valid_statement_len = torch.from_numpy(valid_statement_len).int().cuda()
    # valid_statement_len = valid_statement_len.unsqueeze(1)
    valid_meta_data = [x[3] for x in valid_samples]
    valid_meta_data = np.array(valid_meta_data)
    valid_meta_data = torch.from_numpy(valid_meta_data).cuda()
    valid_history_data = [x[4] for x in valid_samples]
    valid_history_data = np.array(valid_history_data)
    valid_history_data = torch.from_numpy(valid_history_data)
    valid_history_data = valid_history_data.float().cuda()
    valid_target = [x[0] for x in valid_samples]
    valid_target = np.array(valid_target)

    # test data
    test_statement_data = [x[1] for x in test_samples]
    test_statement_data = np.array(test_statement_data)
    test_statement_data = torch.from_numpy(test_statement_data).cuda()
    test_statement_len = [x[2] for x in test_samples]
    test_statement_len = np.array(test_statement_len)
    test_statement_len = torch.from_numpy(test_statement_len).int().cuda()
    # test_statement_len = test_statement_len.unsqueeze(1)
    test_meta_data = [x[3] for x in test_samples]
    test_meta_data = np.array(test_meta_data)
    test_meta_data = torch.from_numpy(test_meta_data).cuda()
    test_history_data = [x[4] for x in test_samples]
    test_history_data = np.array(test_history_data)
    test_history_data = torch.from_numpy(test_history_data)
    test_history_data = test_history_data.float().cuda()
    test_target = [x[0] for x in test_samples]
    test_target = np.array(test_target)

    print('Construct network model')
    model = Net(word2vec_preweight, vocabulary_dim, index,
                para_dict['transformer_num_layers'], para_dict['num_heads'], para_dict['dropout'])
    # print('Model Structure', model)

    # print('Start training......')
    train_dataset = CustomDataset(train_statement_data, train_statement_len,
                                  train_meta_data, train_history_data, train_target)
    train_loader = DataLoader(train_dataset, batch_size=para_dict['batch_size'],
                              shuffle=False, drop_last=True)
    optimizer = optim.Adam(model.parameters(), lr=para_dict['lr'],
                           weight_decay=para_dict['weight_decay'])
    loss_func = nn.CrossEntropyLoss()
    display_interval = 50

    model.train()
    model.cuda()
    for epoch in range(para_dict['EPOCH']):
        # print('==>EPOCH:' + str(epoch) + ' started')
        process_file.write('==>EPOCH:' + str(epoch) + ' ' + 'started' + '\n')
        for step, (batch_statement, batch_statement_len, batch_meta,
                   batch_history, batch_y) in enumerate(train_loader):
            batch_statement = Variable(batch_statement).cuda()
            batch_statement_len = Variable(batch_statement_len).cuda()
            batch_meta = Variable(batch_meta).cuda()
            batch_history = Variable(batch_history).cuda()
            batch_y = Variable(batch_y).cuda()

            output = model(batch_statement, batch_statement_len, batch_meta, batch_history)
            loss = loss_func(output, batch_y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if step % display_interval == 0:
                train_loss_list.append(loss.cpu().data.numpy())
                # print('...==>Iter:' + str(step) + ' ' + 'train_Loss=' + str(loss.cpu().data.numpy()))
                process_file.write('...==>Epoch:' + str(epoch) + ' ' + 'train_Loss='
                                   + str(loss.data.cpu().numpy()) + '\r\n')

                valid_loss, valid_acc = valid(valid_statement_data, valid_statement_len,
                                              valid_meta_data, valid_history_data,
                                              valid_target, model, loss_func)
                # ------------------------
                valid_loss_list.append(valid_loss)
                valid_acc_list.append(valid_acc)
                if best_valid_acc < valid_acc:
                    best_valid_acc = valid_acc
                    test_acc = test(test_statement_data, test_statement_len, test_meta_data,
                                    test_history_data, test_target, test_label_file, model)
                    best_test_acc = test_acc
                # print('......==>Iter:' + str(step) + ' ' + 'valid_Loss=' + str(valid_loss)
                #       + ' ' + 'valid_Acc=' + str(valid_acc))
                process_file.write('......==>Epoch:' + str(epoch) + ' ' + 'valid_Loss='
                                   + str(valid_loss) + ' ' + 'valid_Acc=' + str(valid_acc) + '\r\n')

    # Plot training curves
    x = range(para_dict['EPOCH']
              * (len(train_samples) // para_dict['batch_size'] // display_interval + 1))  # length tied to display_interval
    plt.figure(figsize=(10, 10))
    plt.subplot(211)
    plt.title('Loss vs epoch')
    plt.xlim(0, para_dict['EPOCH']
             * (len(train_samples) // para_dict['batch_size'] // display_interval + 1))
    plt.ylim(min(train_loss_list + valid_loss_list), max(train_loss_list + valid_loss_list))
    plt.ylabel('Loss')
    plt.xlabel('Iter')
    plt.plot(x, train_loss_list, label='train_loss')
    plt.plot(x, valid_loss_list, label='valid_loss')
    plt.legend(loc='best')

    plt.subplot(212)
    plt.title('train vs valid')
    plt.xlim(0, para_dict['EPOCH']
             * (len(train_samples) // para_dict['batch_size'] // display_interval + 1))
    plt.ylim(min(valid_acc_list), max(valid_acc_list))
    plt.ylabel('Acc')
    plt.xlabel('Iter')
    plt.plot(x, valid_acc_list, label='valid_acc')
    plt.legend(loc='best')
    # plt.show()
    plt.savefig(jpg_file)
    plt.close()

    # save model
    # torch.save(model, save_model_file)  # save the whole net
    return best_test_acc
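# valid() and test() are not shown in this listing; the accuracy they report can be
# computed from the model logits as below (a sketch, assuming integer class targets
# on the same device as the output tensor).
def batch_accuracy(output, target):
    preds = output.argmax(dim=1)
    return (preds == target).float().mean().item()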