n_classes = len(train_data_loader.dataset.label2id)
model = MultiInstanceLearning(n_classes=n_classes,
                              vocab_size=len(vocab),
                              device=device,
                              bert_checkpoint=bert_checkpoint,
                              dense_layer_checkpoint=dense_layer_checkpoint,
                              **cfg.config)
model.to(device)
logger.info(model)

# optimizer and criterion
param = filter(lambda p: p.requires_grad, model.parameters())
# param = [p for p in model.parameters() if p.requires_grad]
optimizer = get_optimizer(cfg.config['optimizer'], param, lr=cfg.config['lr'])
lr_scheduler = ReduceLROnPlateau(optimizer, 'max',
                                 factor=cfg.config['lr_decay'],
                                 patience=cfg.config['lr_decay_patience'])
criterion = nn.CrossEntropyLoss(
    weight=train_data_loader.dataset.weights.to(device))

# trainer
trainer = MILTrainer(model=model,
                     optimizer=optimizer,
                     criterion=criterion,
                     cfg=cfg.config,
                     logger=logger,
                     data_loader=train_data_loader,
                     valid_data_loader=valid_data_loader)
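# `get_optimizer` above is assumed to be a small dispatcher from a name string
# to a torch.optim class. A hypothetical minimal sketch (the name-to-class
# mapping below is an assumption, not this project's actual implementation):
import torch.optim as optim

def get_optimizer_sketch(name, params, lr):
    optimizers = {'adam': optim.Adam, 'sgd': optim.SGD, 'rmsprop': optim.RMSprop}
    try:
        return optimizers[name.lower()](params, lr=lr)
    except KeyError:
        raise ValueError('unsupported optimizer: %s' % name)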
def train(conf, data_category):
    print(json.dumps(conf, indent=4))
    os.environ["CUDA_VISIBLE_DEVICES"] = str(conf['device'])
    device = torch.device(0)
    model_name = conf['model']['name']
    optimizer_name = conf['optimizer']['name']
    data_set = conf['data']['dataset']
    graph = h5py.File(os.path.join('data', data_set, 'all_graph.h5'), 'r')
    scheduler_name = conf['scheduler']['name']
    loss = get_loss(**conf['loss'])
    # data_category = conf['data']['data_category']
    loss.to(device)

    encoder, decoder, support = None, None, None
    if model_name == 'Costnet':
        base_model_name = conf['Base']['name']
        encoder, decoder = preprocessing(base_model_name, conf, loss, graph,
                                         data_category, device, data_set,
                                         optimizer_name, scheduler_name)
    if model_name in ('Metricnet', 'GWNET', 'Evonet', 'STGCN', 'DCRNN',
                      'STG2Seq', 'Evonet2'):
        support = preprocessing_for_metric(
            data_category=data_category,
            dataset=conf['data']['dataset'],
            Normal_Method=conf['data']['Normal_Method'],
            _len=conf['data']['_len'],
            **conf['preprocess'])
    model, trainer = create_model(model_name, loss, conf['model'][model_name],
                                  data_category, device, graph, encoder,
                                  decoder, support)
    optimizer = get_optimizer(optimizer_name, model.parameters(),
                              conf['optimizer'][optimizer_name]['lr'])
    scheduler = get_scheduler(scheduler_name, optimizer,
                              **conf['scheduler'][scheduler_name])

    if torch.cuda.device_count() > 1:
        print("use", torch.cuda.device_count(), "GPUs")
        model = nn.DataParallel(model)
    model.to(device)  # needed in both the single- and multi-GPU case

    save_folder = os.path.join('save', conf['name'],
                               f'{data_set}_{"".join(data_category)}',
                               conf['tag'])
    run_folder = os.path.join('run', conf['name'],
                              f'{data_set}_{"".join(data_category)}',
                              conf['tag'])
    shutil.rmtree(save_folder, ignore_errors=True)
    os.makedirs(save_folder)
    shutil.rmtree(run_folder, ignore_errors=True)
    os.makedirs(run_folder)
    with open(os.path.join(save_folder, 'config.yaml'), 'w+') as _f:
        yaml.safe_dump(conf, _f)

    data_loader, normal = get_data_loader(**conf['data'],
                                          data_category=data_category,
                                          device=device,
                                          model_name=model_name)
    if len(data_category) == 2:
        train_model(model=model, dataloaders=data_loader, trainer=trainer,
                    node_num=conf['node_num'], loss_func=loss,
                    optimizer=optimizer, normal=normal, scheduler=scheduler,
                    folder=save_folder, tensorboard_folder=run_folder,
                    device=device, **conf['train'])
        # test_model(folder=save_folder)
    else:
        train_baseline(model=model, dataloaders=data_loader, trainer=trainer,
                       optimizer=optimizer, normal=normal, scheduler=scheduler,
                       folder=save_folder, tensorboard_folder=run_folder,
                       device=device, **conf['train'])
        test_baseline(folder=save_folder, trainer=trainer, model=model,
                      normal=normal, dataloaders=data_loader, device=device)
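# `get_scheduler` is likewise assumed to dispatch on a name string and forward
# its keyword arguments to the scheduler constructor; a hypothetical sketch:
from torch.optim import lr_scheduler

def get_scheduler_sketch(name, optimizer, **kwargs):
    schedulers = {'StepLR': lr_scheduler.StepLR,
                  'MultiStepLR': lr_scheduler.MultiStepLR,
                  'ReduceLROnPlateau': lr_scheduler.ReduceLROnPlateau}
    if name not in schedulers:
        raise ValueError('unsupported scheduler: %s' % name)
    return schedulers[name](optimizer, **kwargs)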
def train(label, phi, t_label, t_phi, cfg):
    # writer = SummaryWriter()
    train_label, validate_label, _, _ = train_test_split(
        label.label, test_size=cfg.tv_value, random_state=20, shuffle=True)
    train_dataset = ds.SnapshotDataset(phi, train_label)
    validate_dataset = ds.SnapshotDataset(phi, validate_label)
    t_dataset = ds.SnapshotDataset(t_phi, t_label)

    phi = phi.to(cfg.device)
    model = End2end(phi, cfg)
    print(sum(p.numel() for p in model.parameters() if p.requires_grad))
    model = model.to(cfg.device)
    optimizer = util.get_optimizer(cfg.o_name, model, cfg.learning_rate)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', 0.5,
                                               cfg.scheduler)
    loss_func = get_loss(cfg)
    # with writer as w:
    #     dummy_x = torch.zeros_like(label[0].unsqueeze(0))
    #     dummy_y = torch.zeros_like(label[0, 0].unsqueeze(0))
    #     w.add_graph(model, (dummy_x, dummy_y, phi))

    losses = []
    val_losses = []
    best_val_loss = 1
    best_psnr = 0
    accumulation_steps = cfg.poor
    train_data_loader = DataLoader(train_dataset, batch_size=cfg.batch,
                                   shuffle=True, drop_last=True)
    validate_data_loader = DataLoader(validate_dataset,
                                      batch_size=math.floor(cfg.batch / 2),
                                      shuffle=False, drop_last=True)

    for ep in range(cfg.epoch):
        optimizer.zero_grad()
        for ep_i, batch in enumerate(train_data_loader):
            label, y = batch
            initial = y.repeat(args.frame, 1, 1, 1).permute(1, 0, 2, 3).mul(
                phi.cpu()).div(phi.cpu().sum(0) + 0.0001)
            initial = initial.to(cfg.device)
            y = y.to(cfg.device)
            label = label.to(cfg.device)
            model.train()
            layers, symmetric = model(initial, y, phi)
            net_output = layers[-1]
            loss = loss_func(layers, label, symmetric)
            loss.backward()
            if (ep_i + 1) % accumulation_steps == 0:
                print("ep", ep, "ep_i", ep_i, "loss", loss.item())
                optimizer.step()
                optimizer.zero_grad()

        with torch.no_grad():
            losses.append(loss.item())
            val_loss = torch.zeros([1])
            for v_ep_i, v_batch in enumerate(validate_data_loader):
                v_label, v_y = v_batch
                v_initial = v_y.repeat(args.frame, 1, 1, 1).permute(
                    1, 0, 2, 3).mul(phi.cpu()).div(phi.cpu().sum(0) + 0.0001)
                v_initial = v_initial.to(cfg.device)
                v_y = v_y.to(cfg.device)
                v_label = v_label.to(cfg.device)
                model.eval()
                v_layers, symmetric = model(v_initial, v_y, phi)
                net_output = v_layers[-1]
                val_loss += loss_func(v_layers, v_label, symmetric)
            scheduler.step(val_loss)
            val_losses.append(val_loss.item())
            print("ep", ep, "loss", loss.item(), "val loss", val_loss,
                  "lr", optimizer.param_groups[0]['lr'], "time", time())
            if ep % cfg.store == 0:
                best_val_loss = val_loss
                best_img = np.clip(net_output.detach().cpu().numpy(),
                                   0, 1).astype(np.float64)
                best_psnr = compare_psnr(v_label.cpu().numpy(), best_img)
                print("PSNR:", np.round(best_psnr, 2))
                util.save(model, best_psnr, best_img,
                          v_label.cpu().numpy(), cfg)

    t_phi = t_phi.to(cfg.device)
    data_loader = DataLoader(t_dataset, batch_size=t_label.shape[0],
                             shuffle=False)
    label, y = next(iter(data_loader))
    initial = y.repeat(args.frame, 1, 1, 1).permute(1, 0, 2, 3).mul(
        t_phi.cpu()).div(t_phi.cpu().sum(0) + 0.0001)
    initial = initial.to(cfg.device)
    y = y.to(cfg.device)
    layers, _ = model(initial, y, t_phi)
    net_output = layers[-1].detach().cpu().numpy()
    psnr = compare_psnr(label.numpy(),
                        np.clip(net_output, 0, 1).astype(np.float64))
    return model, psnr, net_output
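# The training loop above accumulates gradients over `accumulation_steps`
# mini-batches before each optimizer step, simulating a larger effective
# batch. A self-contained sketch of the bare pattern (dividing the loss by
# accumulation_steps, which the original omits, keeps gradient magnitudes
# comparable to a single large batch):
def accumulate_sketch(model, loader, optimizer, loss_fn, accumulation_steps):
    optimizer.zero_grad()
    for i, (x, y) in enumerate(loader):
        loss = loss_fn(model(x), y) / accumulation_steps
        loss.backward()  # gradients add up across iterations
        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()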
def preprocessing(base_model_name, conf, loss, graph, data_category, device,
                  data_set, optimizer_name, scheduler_name):
    if base_model_name == 'LinearDecompose':
        data_loader = get_data_loader_base(base_model_name=base_model_name,
                                           dataset=conf['data']['dataset'],
                                           batch_size=conf['batch_size_base'],
                                           _len=conf['data']['_len'],
                                           data_category=data_category,
                                           device=device)
        model, trainer = create_model(base_model_name, loss,
                                      conf['Base'][base_model_name],
                                      data_category, device, graph)
        save_folder = os.path.join('saves',
                                   f"{conf['name']}_{base_model_name}",
                                   f'{data_set}_{"".join(data_category)}')
        run_folder = os.path.join('run',
                                  f"{conf['name']}_{base_model_name}",
                                  f'{data_set}_{"".join(data_category)}')
        optimizer = get_optimizer(optimizer_name, model.parameters(),
                                  conf['optimizerbase'][optimizer_name]['lr'])
        scheduler = get_scheduler(scheduler_name, optimizer,
                                  **conf['scheduler'][scheduler_name])
        shutil.rmtree(save_folder, ignore_errors=True)
        os.makedirs(save_folder)
        shutil.rmtree(run_folder, ignore_errors=True)
        os.makedirs(run_folder)
        model = train_decompose(model=model, dataloaders=data_loader,
                                trainer=trainer, optimizer=optimizer,
                                scheduler=scheduler, folder=save_folder,
                                tensorboard_folder=run_folder, device=device,
                                **conf['train'])
        model.load_state_dict(
            torch.load(os.path.join(save_folder,
                                    'best_model.pkl'))['model_state_dict'])
        return model.encoder, model.decoder

    if base_model_name == 'SvdDecompose':
        data = get_data_loader_base(base_model_name=base_model_name,
                                    dataset=conf['data']['dataset'],
                                    batch_size=conf['batch_size_base'],
                                    _len=conf['data']['_len'],
                                    data_category=data_category,
                                    device=device)
        data = torch.from_numpy(data).float().to(device)
        save_folder = os.path.join('saves',
                                   f"{conf['name']}_{base_model_name}",
                                   f'{data_set}_{"".join(data_category)}')
        run_folder = os.path.join('run',
                                  f"{conf['name']}_{base_model_name}",
                                  f'{data_set}_{"".join(data_category)}')
        model, trainer = create_model(base_model_name, loss,
                                      conf['Base'][base_model_name],
                                      data_category, device, graph)
        shutil.rmtree(save_folder, ignore_errors=True)
        os.makedirs(save_folder)
        shutil.rmtree(run_folder, ignore_errors=True)
        os.makedirs(run_folder)
        model.decompose(data)
        return model.encoder, model.decoder
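# `SvdDecompose.decompose` above is assumed to factor the (samples, features)
# data matrix into encoder/decoder maps. A hypothetical rank-k truncated SVD
# along those lines (function name and rank handling are illustrative only):
import torch

def svd_decompose_sketch(data, k):
    u, s, v = torch.svd(data)   # data ~ u @ torch.diag(s) @ v.t()
    encoder = v[:, :k]          # code = data @ encoder, shape (samples, k)
    decoder = v[:, :k].t()      # reconstruction = code @ decoder
    return encoder, decoder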
def train(conf, data_category):
    print(json.dumps(conf, indent=4))
    os.environ["CUDA_VISIBLE_DEVICES"] = str(conf['device'])
    device = torch.device(0)
    model_name = conf['model']['name']
    optimizer_name = conf['optimizer']['name']
    data_set = conf['data']['dataset']
    scheduler_name = conf['scheduler']['name']
    loss = get_loss(**conf['loss'])
    loss.to(device)

    support = preprocessing_for_metric(
        data_category=data_category,
        dataset=conf['data']['dataset'],
        Normal_Method=conf['data']['Normal_Method'],
        _len=conf['data']['_len'],
        **conf['preprocess'])
    model, trainer = create_model(model_name, loss, conf['model'][model_name],
                                  data_category, device, support)
    optimizer = get_optimizer(optimizer_name, model.parameters(),
                              conf['optimizer'][optimizer_name]['lr'])
    scheduler = get_scheduler(scheduler_name, optimizer,
                              **conf['scheduler'][scheduler_name])

    if torch.cuda.device_count() > 1:
        print("use", torch.cuda.device_count(), "GPUs")
        model = nn.DataParallel(model)
    model.to(device)  # needed in both the single- and multi-GPU case

    save_folder = os.path.join('save', conf['name'],
                               f'{data_set}_{"".join(data_category)}',
                               conf['tag'])
    run_folder = os.path.join('run', conf['name'],
                              f'{data_set}_{"".join(data_category)}',
                              conf['tag'])
    shutil.rmtree(save_folder, ignore_errors=True)
    os.makedirs(save_folder)
    shutil.rmtree(run_folder, ignore_errors=True)
    os.makedirs(run_folder)
    with open(os.path.join(save_folder, 'config.yaml'), 'w+') as _f:
        yaml.safe_dump(conf, _f)

    data_loader, normal = get_data_loader(**conf['data'],
                                          data_category=data_category,
                                          device=device,
                                          model_name=model_name)
    train_model(model=model, dataloaders=data_loader, trainer=trainer,
                optimizer=optimizer, normal=normal, scheduler=scheduler,
                folder=save_folder, tensorboard_folder=run_folder,
                device=device, **conf['train'])
    test_model(folder=save_folder, trainer=trainer, model=model,
               normal=normal, dataloaders=data_loader, device=device)
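# `get_loss(**conf['loss'])` presumably instantiates an nn.Module from config
# (it is later moved with loss.to(device)). A hypothetical dispatcher
# consistent with that call (the name-to-class mapping is an assumption):
import torch.nn as nn

def get_loss_sketch(name, **kwargs):
    losses = {'mse': nn.MSELoss, 'mae': nn.L1Loss,
              'smooth_l1': nn.SmoothL1Loss}
    return losses[name](**kwargs)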
def train_seg(**kwargs):
    '''Train the segmentation network.'''
    parse(kwargs)
    loss_function = getattr(Loss_, opt.seg_loss_function)
    model = getattr(models, opt.seg_model)().cuda()
    if opt.seg_model_path is not None:
        model.load(opt.seg_model_path)

    dataset = SegDataLoader()
    dataloader = t.utils.data.DataLoader(dataset, opt.batch_size,
                                         num_workers=opt.num_workers,
                                         shuffle=opt.shuffle,
                                         pin_memory=opt.pin_memory)
    pre_loss = 100
    lr = opt.lr
    optimizer = get_optimizer(model, opt.lr)
    loss_meter = tnt.meter.AverageValueMeter()

    for epoch in range(opt.max_epoch):
        loss_meter.reset()
        start = time.time()
        for ii, (input, mask) in enumerate(dataloader):
            optimizer.zero_grad()
            input = t.autograd.Variable(input).cuda()
            target = t.autograd.Variable(mask).cuda()
            output = model(input)
            loss, _ = loss_function(output, target)
            # other_info = [jj.data.cpu().tolist() for jj in other_info]
            # vis.vis.text(other_info, win='other_info')
            loss_meter.add(loss.data[0])
            loss.backward()
            optimizer.step()

            # visualization, logging, printing
            if ii % opt.plot_every == 0 and ii > opt.plot_every:
                if os.path.exists(opt.seg_debug_file):
                    import ipdb
                    ipdb.set_trace()
                vis_plots = {'loss': loss_meter.value()[0], 'ii': ii}
                vis.plot_many(vis_plots)

                # show one randomly chosen image
                k = t.randperm(input.size(0))[0]
                vis.vis.histogram(output.data[k].view(-1).cpu(),
                                  win=u'output_hist',
                                  opts=dict(title='output_hist'))
                # !TODO: tell Daicheng to use 1/3 and 1, not 1 and 3
                vis_imgs = {'input': input.data[k],
                            'mask': target.data[k],
                            'output': output.data[k]}
                vis.img_grid_many(vis_imgs)
                print("epoch:%4d, iter:%4d, time: %.8f, loss: %.8f" %
                      (epoch, ii, time.time() - start, loss_meter.value()[0]))

        model.save()
        vis.log({'epoch': epoch,
                 'loss': str(loss_meter.value()[0]),
                 'lr': lr})
        # info = time.strftime('[%m%d %H:%M] epoch') + str(epoch) + ':' + \
        #     str(loss_meter.value()[0]) + str('; lr:') + str(self.lr) + '<br>'
        # vis.vis.texts += info
        # vis.vis.text(vis.vis.texts, win=u'log')

        # learning-rate decay: rebuild the optimizer with the reduced lr
        if loss_meter.value()[0] > pre_loss:
            lr = lr * opt.lr_decay
            optimizer = get_optimizer(model, lr)
        pre_loss = loss_meter.value()[0]
        if lr < opt.min_lr:
            break
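# Rebuilding the optimizer on each decay, as train_seg does above, discards
# momentum and other optimizer state. Updating param_groups in place, as
# train_cls below does, keeps that state. A self-contained illustration:
import torch

def decay_lr_in_place(optimizer, decay):
    for param_group in optimizer.param_groups:
        param_group['lr'] *= decay

_params = [torch.nn.Parameter(torch.zeros(1))]
_opt = torch.optim.SGD(_params, lr=0.1, momentum=0.9)
decay_lr_in_place(_opt, 0.5)
print(_opt.param_groups[0]['lr'])  # 0.05, momentum buffers untouched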
def train_cls(**kwargs):
    '''Train the classification network.'''
    parse(kwargs)
    loss_function = getattr(Loss_, opt.cls_loss_function)
    model = getattr(models, opt.cls_model)().cuda()
    if opt.cls_model_path is not None:
        model.load(opt.cls_model_path)

    dataset = ClsDataset()
    dataloader = t.utils.data.DataLoader(dataset, opt.batch_size,
                                         num_workers=opt.num_workers,
                                         shuffle=opt.shuffle,
                                         pin_memory=opt.pin_memory)
    pre_loss = 100
    lr = opt.lr
    optimizer = get_optimizer(model, opt.lr, weight_decay=opt.weight_decay)
    loss_meter = tnt.meter.AverageValueMeter()
    confusem = tnt.meter.ConfusionMeter(2)

    for epoch in range(opt.max_epoch):
        loss_meter.reset()
        confusem.reset()
        start = time.time()
        for ii, (input, label) in enumerate(dataloader):
            optimizer.zero_grad()
            input = t.autograd.Variable(input).cuda()
            # !TODO: modify label
            target = label.cuda()
            # !TODO: output may be a list
            output = model(input)
            loss = loss_function(output, target)
            loss.backward()
            optimizer.step()
            # loss1, loss2, loss3 = loss_function(score1, target), loss_function(score2, target), loss_function(score3, target)
            # loss = loss1 + loss2 + loss3
            # prob1, prob2, prob3 = t.nn.functional.softmax(score1), t.nn.functional.softmax(score2), t.nn.functional.softmax(score3)
            # prob = (prob1 + prob2 + prob3) / 3.0
            confusem.add(get_pro(output).data, target)
            loss_meter.add(loss.data[0])

            if ii % opt.plot_every == 0 and ii > 0:
                vis_plots = {'loss': loss_meter.value()[0], 'ii': ii}
                vis.plot_many(vis_plots)
                vis.img_grid(label[0], input.data[0])
                vis.vis.text('cm:%s, loss:%s' % (str(confusem.value()),
                                                 loss.data[0]),
                             win=u'confusionmatrix')
                if os.path.exists(opt.cls_debug_file):
                    import ipdb
                    ipdb.set_trace()
                print("epoch:%4d, iter:%4d, time:%.8f, loss:%.8f" %
                      (epoch, ii, time.time() - start, loss_meter.value()[0]))

        model.save()
        val_cm, val_loss = val_cls(model, loss_function)
        vis.log('epoch:{epoch},loss:{loss:.4f},lr:{lr:.6f},cm:{cm},'
                'val_loss:{val_loss:.4f},val_cm:{val_cm}'.format(
                    epoch=epoch, loss=loss_meter.value()[0], lr=lr,
                    cm=str(confusem.value()), val_loss=val_loss.value()[0],
                    val_cm=str(val_cm.value())))
        vis.plot('val_loss', val_loss.value()[0])

        if loss_meter.value()[0] > pre_loss:
            lr = lr * opt.lr_decay
            # second way to lower the learning rate: updating param_groups
            # in place avoids losing momentum and other optimizer state
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
            # optimizer = get_optimizer(model, lr)
        pre_loss = loss_meter.value()[0]
        if lr < opt.min_lr:
            break
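# For reference, torchnet's ConfusionMeter takes an (N, K) score tensor (or N
# predicted class indices) plus N targets in add(), and value() returns the
# K x K count matrix (rows = ground truth, columns = predictions). A
# standalone usage sketch:
import torch
import torchnet as tnt

cm = tnt.meter.ConfusionMeter(2)
scores = torch.tensor([[0.9, 0.1], [0.2, 0.8], [0.4, 0.6]])
targets = torch.tensor([0, 1, 0])
cm.add(scores, targets)
print(cm.value())  # 2 x 2 matrix of counts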