def train(self, epoch):
    """Run one training epoch of the DPCL model.

    Args:
        epoch (int): current epoch number (used only for log messages).

    Returns:
        float: training loss averaged over all batches of the epoch.
    """
    self.logger.info('Start training from epoch: {:d}, iter: {:d}'.format(epoch, 1))
    self.dpcl.train()
    num_batchs = len(self.train_dataloader)
    total_loss = 0.0
    num_index = 1
    start_time = time.time()
    # Wrap once, outside the loop: the original re-created the DataParallel
    # wrapper on every batch, which is pure overhead.
    model = torch.nn.DataParallel(self.dpcl)
    for mix_wave, target_waves, non_slient in self.train_dataloader:
        mix_wave = mix_wave.to(self.device)
        target_waves = target_waves.to(self.device)
        non_slient = non_slient.to(self.device)
        mix_embs = model(mix_wave)
        l = Loss(mix_embs, target_waves, non_slient, self.num_spks)
        epoch_loss = l.loss()
        total_loss += epoch_loss.item()
        self.optimizer.zero_grad()
        epoch_loss.backward()
        if self.clip_norm:
            # Clip the underlying module's parameters (shared with the wrapper).
            torch.nn.utils.clip_grad_norm_(self.dpcl.parameters(), self.clip_norm)
        self.optimizer.step()
        if num_index % self.print_freq == 0:
            message = '<epoch:{:3d}, iter:{:8,d}, lr:{:.3e}>, loss:{:.3f}'.format(
                epoch, num_index, self.optimizer.param_groups[0]['lr'],
                total_loss / num_index)
            self.logger.info(message)
        # BUG FIX: the counter was never advanced, so the periodic log fired
        # either every batch (print_freq == 1) or never, and the running
        # average `total_loss / num_index` was wrong.
        num_index += 1
    end_time = time.time()
    total_loss = total_loss / num_batchs
    message = '<epoch:{:3d}, iter:{:8,d}, lr:{:.3e}, loss:{:.3f}, Total time:{:.3f} min> '.format(
        epoch, num_batchs, self.optimizer.param_groups[0]['lr'], total_loss,
        (end_time - start_time) / 60)
    self.logger.info(message)
    return total_loss
def validation(self, epoch):
    """Evaluate the DPCL model on the validation set.

    Args:
        epoch (int): current epoch number (used only for log messages).

    Returns:
        float: validation loss averaged over all batches.
    """
    self.logger.info(
        'Start Validation from epoch: {:d}, iter: {:d}'.format(epoch, 1))
    self.dpcl.eval()
    batch_count = len(self.val_dataloader)
    step = 1
    running_loss = 0.0
    t0 = time.time()
    # No gradients are needed during evaluation.
    with torch.no_grad():
        for mix_wave, target_waves, non_slient in self.val_dataloader:
            mix_wave = mix_wave.to(self.device)
            target_waves = target_waves.to(self.device)
            non_slient = non_slient.to(self.device)
            embeddings = self.dpcl(mix_wave)
            batch_loss = Loss(embeddings, target_waves, non_slient,
                              self.num_spks).loss()
            running_loss += batch_loss.item()
            if step % self.print_freq == 0:
                self.logger.info(
                    '<epoch:{:d}, iter:{:d}, lr:{:.3e}, loss:{:.3f}>'.format(
                        epoch, step, self.optimizer.param_groups[0]['lr'],
                        running_loss / step))
            step += 1
    elapsed = time.time() - t0
    avg_loss = running_loss / batch_count
    self.logger.info(
        '<epoch:{:d}, iter:{:d}, lr:{:.3e}, loss:{:.3f}, Total time:{:.3f} min> '.format(
            epoch, batch_count, self.optimizer.param_groups[0]['lr'],
            avg_loss, (elapsed) / 60))
    return avg_loss
def train(self, epoch):
    """Run one DANet training epoch.

    Args:
        epoch (int): current epoch number (used only for log messages).

    Returns:
        float: training loss averaged over all batches of the epoch.
    """
    self.logger.info('Start training from epoch: {:d}, iter: {:d}'.format(
        epoch, 1))
    self.danet.train()
    num_batchs = len(self.train_dataloader)
    total_loss = 0.0
    num_index = 1
    start_time = time.time()
    for mix_samp, wf, ibm, non_silent in self.train_dataloader:
        # torch.autograd.Variable has been a no-op since PyTorch 0.4;
        # plain tensors behave identically here.
        mix_samp = mix_samp.contiguous().to(self.device)
        wf = wf.contiguous().to(self.device)
        ibm = ibm.contiguous().to(self.device)
        non_silent = non_silent.contiguous().to(self.device)
        hidden = self.danet.init_hidden(mix_samp.size(0))
        input_list = [mix_samp, ibm, non_silent, hidden]
        self.optimizer.zero_grad()
        if self.gpuid:
            # GPU path packs inputs (incl. hidden state) into a single list.
            mask, hidden = self.danet(input_list)
        else:
            mask, hidden = self.danet(mix_samp, ibm, non_silent)
        l = Loss(mix_samp, wf, mask)
        epoch_loss = l.loss()
        total_loss += epoch_loss.item()
        epoch_loss.backward()
        self.optimizer.step()
        if num_index % self.print_freq == 0:
            message = '<epoch:{:d}, iter:{:d}, lr:{:.3e}, loss:{:.3f}>'.format(
                epoch, num_index, self.optimizer.param_groups[0]['lr'],
                total_loss / num_index)
            self.logger.info(message)
        num_index += 1
    end_time = time.time()
    # BUG FIX: the final average divided by num_index, which ends the loop at
    # num_batchs + 1, understating the epoch loss. Divide by the true batch
    # count, matching the sibling DPCL train/validation methods.
    total_loss = total_loss / num_batchs
    message = '<epoch:{:d}, iter:{:d}, lr:{:.3e}, loss:{:.3f}, Total time:{:.3f} min> '.format(
        epoch, num_batchs, self.optimizer.param_groups[0]['lr'], total_loss,
        (end_time - start_time) / 60)
    self.logger.info(message)
    return total_loss
def validation(self, epoch):
    """Evaluate the Dual-Path RNN on the validation set.

    Args:
        epoch (int): current epoch number (used only for log messages).

    Returns:
        float: validation loss averaged over all batches.
    """
    self.logger.info(
        'Start Validation from epoch: {:d}, iter: {:d}'.format(epoch, 0))
    self.dualrnn.eval()
    num_batchs = len(self.val_dataloader)
    num_index = 1
    total_loss = 0.0
    start_time = time.time()
    with torch.no_grad():
        for mix, ref in self.val_dataloader:
            mix = mix.to(self.device)
            ref = [ref[i].to(self.device) for i in range(self.num_spks)]
            out = self.dualrnn(mix)
            # Loss returns a scalar tensor directly (no .loss() call here).
            epoch_loss = Loss(out, ref)
            total_loss += epoch_loss.item()
            if num_index % self.print_freq == 0:
                message = '<epoch:{:d}, iter:{:d}, lr:{:.3e}, loss:{:.3f}>'.format(
                    epoch, num_index, self.optimizer.param_groups[0]['lr'],
                    total_loss / num_index)
                self.logger.info(message)
            num_index += 1
    end_time = time.time()
    # BUG FIX: averaged over num_index, which ends the loop at num_batchs + 1
    # and so understated the mean loss; use the true batch count.
    total_loss = total_loss / num_batchs
    message = 'Finished *** <epoch:{:d}, iter:{:d}, lr:{:.3e}, loss:{:.3f}, Total time:{:.3f} min> '.format(
        epoch, num_batchs, self.optimizer.param_groups[0]['lr'], total_loss,
        (end_time - start_time) / 60)
    self.logger.info(message)
    return total_loss
def validation(self, epoch):
    """Evaluate the audio-visual Dual-Path RNN on the validation set.

    Expects each batch to be a dict with (at least) the keys used below:
    ``first_videos_features``, ``second_videos_features``, ``first_audios``,
    ``second_audios``, ``mix_noised_audios``, ``audios_lens``,
    ``noise_audios`` -- TODO confirm full schema against the dataloader.

    Returns:
        float: running validation loss divided by the final counter value.
    """
    self.logger.info(
        'Start Validation from epoch: {:d}, iter: {:d}'.format(epoch, 0))
    self.dualrnn.eval()
    num_batchs = len(self.val_dataloader)
    num_index = 1
    total_loss = 0.0
    start_time = time.time()
    with torch.no_grad():
        pbar = tqdm(self.val_dataloader, desc='Val loop', leave=False)
        for batch in pbar:
            # Truncate video features to the first 50 frames.
            # NOTE(review): 50 looks like a hard-coded sequence cap — confirm.
            batch["first_videos_features"] = batch[
                "first_videos_features"][:, :50, :].detach()
            batch["second_videos_features"] = batch[
                "second_videos_features"][:, :50, :].detach()
            # Move every tensor in the batch dict to the target device.
            batch = {k: v.to(self.device) for k, v in batch.items()}
            # Reference signals for the two speakers.
            ref = [batch[f"{i}_audios"] for i in ["first", "second"]]
            out = self.dualrnn(batch)
            l = Loss(out, ref)
            pbar.set_description(f"Loss: {round(l.item(), 4)}")
            epoch_loss = l
            total_loss += epoch_loss.item()
            # NOTE(review): `if num_index:` is always true (counter starts at
            # 1), so this logs every batch — possibly meant `% print_freq`.
            if num_index:
                message = 'Val: <epoch:{:d}, iter:{:d}, lr:{:.3e}, loss:{:.3f}>'.format(
                    epoch, num_index,
                    self.optimizer.param_groups[0]['lr'],
                    total_loss / num_index)
                self.logger.info(message)
            num_index += 1
    end_time = time.time()
    # NOTE(review): divides by num_index (= num_batchs + 1), not num_batchs.
    total_loss = total_loss / num_index
    message = 'Finished *** <epoch:{:d}, iter:{:d}, lr:{:.3e}, loss:{:.3f}, Total time:{:.3f} min> '.format(
        epoch, num_index, self.optimizer.param_groups[0]['lr'], total_loss,
        (end_time - start_time) / 60)
    self.logger.info(message)
    # Find the (assumed unique) log handler that can persist audio samples.
    audio_writer = [
        handler for handler in self.logger.handlers
        if hasattr(handler, 'write_audio')
    ][0]
    # Log one randomly chosen example from the LAST batch of the epoch.
    audio_idx = random.randrange(0, out[0].size(0))
    audio_writer.write_audio((ref[0][audio_idx], ref[1][audio_idx]),
                             (out[0][audio_idx], out[1][audio_idx]),
                             batch['mix_noised_audios'][audio_idx],
                             batch["audios_lens"][audio_idx],
                             batch["noise_audios"][audio_idx], epoch)
    return total_loss
def train(self, epoch):
    """Train the audio-visual Dual-Path RNN for one epoch with apex AMP.

    Uses ``amp.scale_loss`` for mixed-precision backward; gradients are
    clipped when ``self.clip_norm`` is set.

    Returns:
        float: running training loss divided by the final counter value.
    """
    self.logger.info('Start training from epoch: {:d}, iter: {:d}'.format(
        epoch, 0))
    self.dualrnn.train()
    num_batchs = len(self.train_dataloader)
    total_loss = 0.0
    num_index = 1
    start_time = time.time()
    pbar = tqdm(self.train_dataloader, leave=False)
    for batch in pbar:
        # Truncate video features to the first 50 frames before device move.
        # NOTE(review): 50 looks like a hard-coded sequence cap — confirm.
        batch["first_videos_features"] = batch[
            "first_videos_features"][:, :50, :].detach()
        batch["second_videos_features"] = batch[
            "second_videos_features"][:, :50, :].detach()
        batch = {k: v.to(self.device) for k, v in batch.items()}
        # Reference signals for the two speakers.
        ref = [batch[f"{i}_audios"] for i in ["first", "second"]]
        self.optimizer.zero_grad()
        # NOTE(review): both st_time assignments are dead code (value never
        # read) — leftovers of a removed timing print.
        st_time = perf_counter()
        out = self.dualrnn(batch)
        st_time = perf_counter()
        l = Loss(out, ref)
        epoch_loss = l
        pbar.set_description(f"Loss: {round(l.item(), 4)}")
        total_loss += epoch_loss.item()
        # Mixed-precision backward: scale the loss to avoid fp16 underflow.
        with amp.scale_loss(epoch_loss, self.optimizer) as scaled_loss:
            scaled_loss.backward()
        if self.clip_norm:
            torch.nn.utils.clip_grad_norm_(self.dualrnn.parameters(),
                                           self.clip_norm)
        self.optimizer.step()
        if num_index % self.print_freq == 0:
            message = 'Train: <epoch:{:d}, iter:{:d}, lr:{:.3e}, loss:{:.3f}>'.format(
                epoch, num_index, self.optimizer.param_groups[0]['lr'],
                total_loss / num_index)
            self.logger.info(message)
        num_index += 1
    end_time = time.time()
    # NOTE(review): divides by num_index (= num_batchs + 1), not num_batchs.
    total_loss = total_loss / num_index
    message = 'Finished *** <epoch:{:d}, iter:{:d}, lr:{:.3e}, loss:{:.3f}, Total time:{:.3f} min> '.format(
        epoch, num_index, self.optimizer.param_groups[0]['lr'], total_loss,
        (end_time - start_time) / 60)
    self.logger.info(message)
    return total_loss
def validation(self, epoch):
    """Evaluate the DANet model on the validation set.

    Args:
        epoch (int): current epoch number (used only for log messages).

    Returns:
        float: validation loss averaged over all batches.
    """
    self.logger.info(
        'Start Validation from epoch: {:d}, iter: {:d}'.format(epoch, 1))
    self.danet.eval()
    num_batchs = len(self.val_dataloader)
    num_index = 1
    total_loss = 0.0
    start_time = time.time()
    with torch.no_grad():
        for mix_samp, wf, ibm, non_silent in self.val_dataloader:
            # torch.autograd.Variable has been a no-op since PyTorch 0.4;
            # plain tensors behave identically here.
            mix_samp = mix_samp.contiguous().to(self.device)
            wf = wf.contiguous().to(self.device)
            ibm = ibm.contiguous().to(self.device)
            non_silent = non_silent.contiguous().to(self.device)
            hidden = self.danet.init_hidden(mix_samp.size(0))
            input_list = [mix_samp, ibm, non_silent, hidden]
            if self.gpuid:
                # GPU path packs inputs (incl. hidden state) into one list.
                mask, hidden = self.danet(input_list)
            else:
                mask, hidden = self.danet(mix_samp, ibm, non_silent)
            l = Loss(mix_samp, wf, mask)
            epoch_loss = l.loss()
            total_loss += epoch_loss.item()
            if num_index % self.print_freq == 0:
                message = '<epoch:{:d}, iter:{:d}, lr:{:.3e}, loss:{:.3f}>'.format(
                    epoch, num_index, self.optimizer.param_groups[0]['lr'],
                    total_loss / num_index)
                self.logger.info(message)
            num_index += 1
    end_time = time.time()
    # BUG FIX: averaged over num_index, which ends the loop at num_batchs + 1
    # and so understated the mean loss; use the true batch count.
    total_loss = total_loss / num_batchs
    message = '<epoch:{:d}, iter:{:d}, lr:{:.3e}, loss:{:.3f}, Total time:{:.3f} min> '.format(
        epoch, num_batchs, self.optimizer.param_groups[0]['lr'], total_loss,
        (end_time - start_time) / 60)
    self.logger.info(message)
    return total_loss
def run(load_last_checkpoint=False):
    """Train and validate on the Luna dataset with a series-level split.

    Optionally resumes model/optimizer state from the most recent ``*.ckpt``
    file in the save directory (checkpoints are named ``<epoch>.ckpt``).

    Args:
        load_last_checkpoint (bool): if True, resume from the latest checkpoint.
    """
    save_dir = f'{OUTPUT_PATH}/models/'
    os.makedirs(save_dir, exist_ok=True)
    neural_net = Net()
    loss_fn = Loss()
    optim = torch.optim.SGD(neural_net.parameters(), DEFAULT_LR,
                            momentum=0.9, weight_decay=1e-4)
    starting_epoch = 0
    initial_loss = None
    if load_last_checkpoint:
        model_paths = glob(f'''{save_dir}*.ckpt''')
        # Checkpoint stems are integers (epoch numbers); pick the largest.
        # NOTE(review): splitting on '/' is POSIX-only — os.path.basename
        # would be portable.
        model_names = [int(i.split('/')[-1][:-5]) for i in model_paths]
        latest_model_path = f'''{save_dir}{max(model_names)}.ckpt'''
        print('loading latest model from:', latest_model_path)
        checkpoint = torch.load(latest_model_path)
        neural_net.load_state_dict(checkpoint['model_state_dict'])
        optim.load_state_dict(checkpoint['optimizer_state_dict'])
        starting_epoch = checkpoint['epoch']
        initial_loss = checkpoint['loss']
    # Move to GPU after loading; Module.cuda() moves parameters in place, so
    # the optimizer keeps valid references.
    if torch.cuda.is_available():
        neural_net = neural_net.cuda()
        loss_fn = loss_fn.cuda()
    print(f'''Training from epoch: {starting_epoch} towards: {TOTAL_EPOCHS}, with learning rate starting from: {get_lr(starting_epoch)}, and loss: {initial_loss}''')
    # Shuffle rows once, then group by series so a series never straddles
    # the train/val boundary (prevents patient-level leakage).
    meta = pd.read_csv(f'{OUTPUT_PATH}/augmented_meta.csv',
                       index_col=0).sample(frac=1).reset_index(drop=True)
    meta_group_by_series = meta.groupby(['seriesuid']).indices
    list_of_groups = [{i: list(meta_group_by_series[i])}
                      for i in meta_group_by_series.keys()]
    # Fixed seed so the split is reproducible across runs/resumes.
    random.Random(0).shuffle(list_of_groups)
    val_split = int(VAL_PCT * len(list_of_groups))
    val_indices = list(itertools.chain(*[list(i.values())[0]
                                         for i in list_of_groups[:val_split]]))
    train_indices = list(itertools.chain(*[list(i.values())[0]
                                           for i in list_of_groups[val_split:]]))
    ltd = LunaDataSet(train_indices, meta)
    lvd = LunaDataSet(val_indices, meta)
    train_loader = DataLoader(ltd, batch_size=1, shuffle=False)
    val_loader = DataLoader(lvd, batch_size=1, shuffle=False)
    for ep in range(starting_epoch, TOTAL_EPOCHS):
        # NOTE(review): resuming re-runs the saved epoch number — confirm
        # whether checkpoint['epoch'] stores the last finished or next epoch.
        train(train_loader, neural_net, loss_fn, ep, optim, get_lr,
              save_dir=save_dir)
        validate(val_loader, neural_net, loss_fn)
def main():
    """Train YOLOv2 on the VOC trainval split with step LR decay.

    Saves the full model every 5 epochs under ``weights/``.
    """
    par = ArgumentParser()
    par.add_argument('--batch_size', type=int, default=16)
    arg = par.parse_args()
    if torch.cuda.is_available():
        device = torch.device('cuda')
        # cuDNN autotuner: beneficial for fixed-size inputs.
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True
    else:
        device = torch.device('cpu')
    dataset = VOCDataset('VOCdevkit', split='trainval')
    dataloader = DataLoader(
        dataset,
        batch_size=arg.batch_size,
        shuffle=True,
        num_workers=8,
        collate_fn=detection_collate
    )
    # BUG FIX: removed the stray `dataset[0]` (dead sample fetch) and the
    # bogus `torch.backends.cuda.enabled/benchmark = True` assignments --
    # those attributes are not real switches; the cuDNN flags above are.
    model = YOLOv2().to(device)
    # BUG FIX: criterion used `.cuda()` unconditionally and failed on
    # CPU-only hosts; move it to the selected device instead.
    criterion = Loss().to(device)
    lr = 1e-4
    opt = SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
    for epoch in range(160):
        i = 0
        # Step schedule: divide the LR by 10 at epoch 60 and by 100 at 90.
        if epoch == 60:
            adjust_learning_rate(opt, lr / 10)
        if epoch == 90:
            adjust_learning_rate(opt, lr / 100)
        for batch in dataloader:
            i += 1
            img, boxes, label, num_obj = batch
            img = Variable(img).to(device)
            boxes = Variable(boxes).to(device)
            output = model(img)
            target = boxes, label, num_obj
            box_loss, iou_loss, class_loss = criterion(output, target)
            opt.zero_grad()
            loss = box_loss.mean() + iou_loss.mean() \
                + class_loss.mean()
            if i % 10 == 0:
                print(
                    f"batch {epoch} {i}/{len(dataloader)} loss:{round(loss.item(),3)} box: {round(box_loss.mean().item(),3)} iou: {round(iou_loss.mean().item(),3)} class: {round(class_loss.mean().item(),3)}")
            loss.backward()
            opt.step()
        if epoch % 5 == 0:
            print(f"epoch {epoch} save model")
            torch.save(model, f'weights/yolov2_{epoch}.pth')
def __init__(self, dataset, batch_size, device=torch.device('cpu')):
    """Set up the YOLOv3 training harness.

    Args:
        dataset: dataset exposing ``s_anchors``/``m_anchors``/``l_anchors``
            and ``input_size``; also iterated by the DataLoader.
        batch_size (int): mini-batch size for the loader.
        device: target device; a DataParallel wrapper is moved there when
            CUDA is available.
    """
    self.device = device
    # Anchor boxes for the small/medium/large detection heads.
    self.anchors = (dataset.s_anchors, dataset.m_anchors, dataset.l_anchors)
    self.data_loader = DataLoader(dataset, batch_size, shuffle=True)
    net = Yolov3Net(self.anchors)
    net.train()
    if torch.cuda.is_available():
        net = torch.nn.DataParallel(net).to(device=self.device)
    self.model = net
    self.optimizer = Adam(self.model.parameters(), weight_decay=0.0005)
    self.criterion = Loss(self.anchors, input_size=dataset.input_size)
def train(self, epoch):
    """Train the Dual-Path RNN for one epoch.

    Args:
        epoch (int): current epoch number (used only for log messages).

    Returns:
        float: training loss averaged over all batches of the epoch.
    """
    self.logger.info('Start training from epoch: {:d}, iter: {:d}'.format(
        epoch, 0))
    self.dualrnn.train()
    num_batchs = len(self.train_dataloader)
    total_loss = 0.0
    num_index = 1
    start_time = time.time()
    for mix, ref in self.train_dataloader:
        mix = mix.to(self.device)
        ref = [ref[i].to(self.device) for i in range(self.num_spks)]
        self.optimizer.zero_grad()
        if self.gpuid:
            # Scatter the batch across the configured GPUs.
            out = torch.nn.parallel.data_parallel(self.dualrnn, mix,
                                                  device_ids=self.gpuid)
        else:
            out = self.dualrnn(mix)
        # Loss returns a scalar tensor directly (no .loss() call here).
        epoch_loss = Loss(out, ref)
        total_loss += epoch_loss.item()
        epoch_loss.backward()
        if self.clip_norm:
            torch.nn.utils.clip_grad_norm_(self.dualrnn.parameters(),
                                           self.clip_norm)
        self.optimizer.step()
        if num_index % self.print_freq == 0:
            message = '<epoch:{:d}, iter:{:d}, lr:{:.3e}, loss:{:.3f}>'.format(
                epoch, num_index, self.optimizer.param_groups[0]['lr'],
                total_loss / num_index)
            self.logger.info(message)
        num_index += 1
    end_time = time.time()
    # BUG FIX: averaged over num_index, which ends the loop at num_batchs + 1
    # and so understated the mean loss; use the true batch count.
    total_loss = total_loss / num_batchs
    message = 'Finished *** <epoch:{:d}, iter:{:d}, lr:{:.3e}, loss:{:.3f}, Total time:{:.3f} min> '.format(
        epoch, num_batchs, self.optimizer.param_groups[0]['lr'], total_loss,
        (end_time - start_time) / 60)
    self.logger.info(message)
    return total_loss
def __init__(self, args):
    """Build the segmentation model wrapper.

    Selects ``UNetLoc`` when ``args.type == "pre"`` (2 output classes),
    otherwise the network returned by ``get_dmg_unet`` (5 output classes).

    Args:
        args: parsed CLI namespace; fields read here: ``type``, ``lr``,
            ``results``, ``logname``.
    """
    super(Model, self).__init__()
    self.save_hyperparameters()
    self.args = args
    self.f1_score = F1(args)
    # "pre" task -> localization net; anything else -> dmg net (presumably
    # damage classification — confirm against get_dmg_unet).
    self.model = UNetLoc(args) if args.type == "pre" else get_dmg_unet(
        args)
    self.loss = Loss(args)
    # Best-score bookkeeping (updated elsewhere in this class).
    self.best_f1 = torch.tensor(0)
    self.best_epoch = 0
    # Axis lists for test-time-augmentation flips: dim 2, dim 3, and both
    # (assumes NCHW inputs — TODO confirm).
    self.tta_flips = [[2], [3], [2, 3]]
    self.lr = args.lr
    # Number of output classes mirrors the model choice above.
    self.n_class = 2 if self.args.type == "pre" else 5
    self.softmax = nn.Softmax(dim=1)
    self.test_idx = 0
    # DLLogger with two sinks: a JSON stream file plus stdout, rendering
    # the step as an epoch counter.
    self.dllogger = Logger(backends=[
        JSONStreamBackend(
            Verbosity.VERBOSE,
            os.path.join(args.results, f"{args.logname}.json")),
        StdOutBackend(Verbosity.VERBOSE,
                      step_format=lambda step: f"Epoch: {step} "),
    ])
def train(args):
    """Iteration-based vocoder training loop.

    Builds the model/optimizer/criterion, optionally resumes from a
    checkpoint, then trains until ``hps.max_iter`` iterations, periodically
    logging, checkpointing and sampling audio.

    Args:
        args: namespace with ``ckpt_pth`` (resume checkpoint, '' = none),
            ``data_dir``, ``log_dir`` ('' disables logging) and ``ckpt_dir``
            ('' disables checkpointing).
    """
    # build model
    model = Model()
    mode(model, True)
    optimizer = torch.optim.AdamW(model.parameters(), lr=hps.lr)
    criterion = Loss()
    # load checkpoint
    iteration = 1
    if args.ckpt_pth != '':
        model, optimizer, iteration = load_checkpoint(args.ckpt_pth, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration+1
    # get scheduler
    if hps.sch:
        if args.ckpt_pth != '':
            # NOTE(review): StepLR's last_epoch is fed the *iteration*
            # counter here — confirm the scheduler is intended to step per
            # iteration rather than per epoch.
            scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                        hps.sch_step,
                                                        hps.sch_g,
                                                        last_epoch=iteration)
        else:
            scheduler = torch.optim.lr_scheduler.StepLR(
                optimizer, hps.sch_step, hps.sch_g)
    # make dataset
    train_loader = prepare_dataloaders(args.data_dir)
    # get logger ready (only when a log dir was requested)
    if args.log_dir != '':
        if not os.path.isdir(args.log_dir):
            os.makedirs(args.log_dir)
            os.chmod(args.log_dir, 0o775)
        logger = Logger(args.log_dir)
    # get ckpt_dir ready
    if args.ckpt_dir != '' and not os.path.isdir(args.ckpt_dir):
        os.makedirs(args.ckpt_dir)
        os.chmod(args.ckpt_dir, 0o775)
    model.train()
    # ================ MAIN TRAINING LOOP ===================
    while iteration <= hps.max_iter:
        for batch in train_loader:
            if iteration > hps.max_iter:
                break
            start = time.perf_counter()
            wavs, mels = batch
            wavs = mode(wavs)
            mels = mode(mels)
            # forward
            outputs = model(wavs, mels)
            # Run inference only every hps.n iterations (it is expensive);
            # the criterion accepts None otherwise.
            p_wavs = model.infer(mels) if iteration % hps.n == 0 else None
            # loss: index 0 is the total to backprop; 1 and 2 are components
            # used for reporting below.
            loss = criterion(outputs, p_wavs, wavs)
            # zero grad and backward
            model.zero_grad()
            loss[0].backward()
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       hps.gn)
            # update
            optimizer.step()
            if hps.sch:
                # Freeze the LR schedule once sch_stop is reached.
                scheduler.step(min(iteration, hps.sch_stop))
            # info
            dur = time.perf_counter() - start
            print('Iter: {} Loss(z/s): {:.2e}/{:.2e} GN: {:.2e} {:.1f}s/it'.
                  format(iteration, loss[1].item(), loss[2].item(),
                         grad_norm, dur))
            # log
            if args.log_dir != '' and (iteration % hps.iters_per_log == 0):
                learning_rate = optimizer.param_groups[0]['lr']
                logger.log_training(loss[1].item(), loss[2].item(),
                                    learning_rate, iteration)
            # save ckpt
            if args.ckpt_dir != '' and (iteration % hps.iters_per_ckpt == 0):
                ckpt_pth = os.path.join(args.ckpt_dir,
                                        'ckpt_{}'.format(iteration))
                save_checkpoint(model, optimizer, iteration, ckpt_pth)
            # sample: render one example and switch back to train mode
            if args.log_dir != '' and (iteration % hps.iters_per_sample == 0):
                model.eval()
                with torch.no_grad():
                    pred = model.infer(mels[:1])
                    logger.sample_training(wavs[0], pred[0], iteration)
                model.train()
            iteration += 1
    if args.log_dir != '':
        logger.close()
test_set = VOCLoader(root='./datasets_raid1/voc/VOC2007', image_set='test', transform=transform) #import pdb; pdb.set_trace() test_loader = DataLoader(test_set, batch_size=32, shuffle=False, collate_fn=train_set.collate_fn) model = SSD(21).to(device) optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9, weight_decay=0.0005) criterion = Loss().to(device) #import pdb; pdb.set_trace() epochs = 1 for epoch in range(epochs): print("%d/%d" % (epoch, epochs)) model.train() for i, (images, categories, boxes) in enumerate(test_loader): #import pdb; pdb.set_trace() images = images.to(device) boxes = [box.to(device) for box in boxes] categories = [category.to(device) for category in categories] predicted_loc, predicted_cls = model(images)
def main(opt):
    """Train SSD on the OpenImages-style dataset with resume support.

    Builds train/test loaders, an SGD optimizer with MultiStepLR decay,
    resumes from ``<save_folder>/SSD.pth`` if present, then alternates
    train/evaluate per epoch, checkpointing after each.

    Args:
        opt: parsed options; fields read here include ``batch_size``,
            ``num_workers``, ``lr``, ``momentum``, ``weight_decay``,
            ``multistep``, ``log_path``, ``save_folder``, ``nms_threshold``,
            ``epochs``.
    """
    if torch.cuda.is_available():
        print('Will compute using CUDA')
        # Distributed init left disabled; single-process, num_gpus fixed to 1.
        # torch.distributed.init_process_group(backend='nccl', init_method='env://')
        # num_gpus = torch.distributed.get_world_size()
        num_gpus = 1
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
        num_gpus = 1
    train_params = {
        "batch_size": opt.batch_size * num_gpus,
        "shuffle": True,
        "drop_last": False,
        "num_workers": opt.num_workers,
        "collate_fn": collate_fn
    }
    # NOTE(review): shuffle=True on the *test* loader is unusual — confirm
    # whether evaluation depends on order.
    test_params = {
        "batch_size": opt.batch_size * num_gpus,
        "shuffle": True,
        "drop_last": False,
        "num_workers": opt.num_workers,
        "collate_fn": collate_fn
    }
    dboxes = generate_dboxes()
    model = SSD()
    train_set = OIDataset(SimpleTransformer(dboxes))
    train_loader = DataLoader(train_set, **train_params)
    test_set = OIDataset(SimpleTransformer(dboxes, eval=True), train=False)
    test_loader = DataLoader(test_set, **test_params)
    encoder = Encoder(dboxes)
    # Linear LR scaling by world size and batch size (reference batch = 32).
    opt.lr = opt.lr * num_gpus * (opt.batch_size / 32)
    criterion = Loss(dboxes)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=opt.lr,
                                momentum=opt.momentum,
                                weight_decay=opt.weight_decay,
                                nesterov=True)
    scheduler = MultiStepLR(optimizer=optimizer,
                            milestones=opt.multistep,
                            gamma=0.1)
    if torch.cuda.is_available():
        model.cuda()
        criterion.cuda()
        # NOTE(review): model is wrapped only on CUDA, yet `.module` is
        # accessed unconditionally below — the CPU path would fail there.
        model = torch.nn.DataParallel(model)
    # Logs are wiped at every launch; checkpoints are kept.
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    if not os.path.isdir(opt.save_folder):
        os.makedirs(opt.save_folder)
    checkpoint_path = os.path.join(opt.save_folder, "SSD.pth")
    writer = SummaryWriter(opt.log_path)
    if os.path.isfile(checkpoint_path):
        checkpoint = torch.load(checkpoint_path)
        # Resume from the epoch after the one that was saved.
        first_epoch = checkpoint["epoch"] + 1
        model.module.load_state_dict(checkpoint["model_state_dict"])
        scheduler.load_state_dict(checkpoint["scheduler"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        # evaluate(model, test_loader, encoder, opt.nms_threshold)
    else:
        first_epoch = 0
    for epoch in range(first_epoch, opt.epochs):
        train(model, train_loader, epoch, writer, criterion, optimizer,
              scheduler)
        evaluate(model, test_loader, encoder, opt.nms_threshold)
        # Overwrite the single rolling checkpoint after every epoch.
        checkpoint = {
            "epoch": epoch,
            "model_state_dict": model.module.state_dict(),
            "optimizer": optimizer.state_dict(),
            "scheduler": scheduler.state_dict()
        }
        torch.save(checkpoint, checkpoint_path)
topology_vis[-1], loss_obj.x_grids, loss_obj.y_grids, loss_obj.z_grids, itest, args, 'val') print('') return loss_eval if __name__ == '__main__': # parse args args = parse_args() # load data args, data_val = load_data(args, dtype, 'val') # setup loss object loss_obj = Loss(args) # initialize the model assert (os.path.isfile(args.model)) print "Validating with snapshotted model %s ..." % args.model deep_marching_cubes = torch.load(args.model) if torch.cuda.is_available(): deep_marching_cubes.cuda() # validation loss = run_val(deep_marching_cubes, loss_obj, data_val, args, 'val') print('============== average loss:%f' % (loss / args.num_val)) print 'Done!'
def train():
    """End-to-end training entry point for the DPRNN family of models.

    Reads a YAML option file (``--opt``), builds one of three models
    (speech separation, speaker extraction, speaker suppression), then runs
    an epoch loop of training + validation with ReduceLROnPlateau
    scheduling, best-loss checkpointing and early stopping.
    """
    parser = argparse.ArgumentParser(
        description='Parameters for training Model')
    # configuration file
    parser.add_argument('--opt', type=str, help='Path to option YAML file.')
    args = parser.parse_args()
    opt = option.parse(args.opt)
    set_logger.setup_logger(opt['logger']['name'],
                            opt['logger']['path'],
                            screen=opt['logger']['screen'],
                            tofile=opt['logger']['tofile'])
    logger = logging.getLogger(opt['logger']['name'])
    # Date stamp used in checkpoint file names.
    day_time = datetime.date.today().strftime('%y%m%d')
    # build model
    model = opt['model']['MODEL']
    logger.info("Building the model of {}".format(model))
    # Extraction and Suppression model (shared constructor)
    if opt['model']['MODEL'] == 'DPRNN_Speaker_Extraction' or opt['model'][
            'MODEL'] == 'DPRNN_Speaker_Suppression':
        net = model_function.Extractin_Suppression_Model(
            **opt['Dual_Path_Aux_Speaker'])
    # Separation model
    if opt['model']['MODEL'] == 'DPRNN_Speech_Separation':
        net = model_function.Speech_Serapation_Model(
            **opt['Dual_Path_Aux_Speaker'])
    # NOTE(review): `device` and `gpuids` are only bound inside this branch;
    # a config without gpuid would hit NameError further down — confirm
    # CPU-only runs are unsupported by design.
    if opt['train']['gpuid']:
        if len(opt['train']['gpuid']) > 1:
            logger.info('We use GPUs : {}'.format(opt['train']['gpuid']))
        else:
            logger.info('We use GPUs : {}'.format(opt['train']['gpuid']))
        device = torch.device('cuda:{}'.format(opt['train']['gpuid'][0]))
        gpuids = opt['train']['gpuid']
        if len(gpuids) > 1:
            net = torch.nn.DataParallel(net, device_ids=gpuids)
        net = net.to(device)
    logger.info('Loading {} parameters: {:.3f} Mb'.format(
        model, check_parameters(net)))
    # build optimizer
    logger.info("Building the optimizer of {}".format(model))
    Optimizer = make_optimizer(net.parameters(), opt)
    Scheduler = ReduceLROnPlateau(Optimizer,
                                  mode='min',
                                  factor=opt['scheduler']['factor'],
                                  patience=opt['scheduler']['patience'],
                                  verbose=True,
                                  min_lr=opt['scheduler']['min_lr'])
    # build dataloader
    logger.info('Building the dataloader of {}'.format(model))
    train_dataloader, val_dataloader = make_dataloader(opt)
    logger.info('Train Datasets Length: {}, Val Datasets Length: {}'.format(
        len(train_dataloader), len(val_dataloader)))
    # build trainer
    logger.info('............. Training ................')
    total_epoch = opt['train']['epoch']
    num_spks = opt['num_spks']
    print_freq = opt['logger']['print_freq']
    checkpoint_path = opt['train']['path']
    early_stop = opt['train']['early_stop']
    max_norm = opt['optim']['clip_norm']
    best_loss = np.inf
    no_improve = 0
    ce_loss = torch.nn.CrossEntropyLoss()
    # Weight of the speaker CE term relative to the SI-SNR term.
    weight = 0.1
    epoch = 0
    # Resume training settings
    if opt['resume']['state']:
        # NOTE(review): checkpoint filename hard-codes the date '200722';
        # resuming a run saved on another day needs a config change.
        opt['resume']['path'] = opt['resume'][
            'path'] + '/' + '200722_epoch{}.pth.tar'.format(
                opt['resume']['epoch'])
        ckp = torch.load(opt['resume']['path'], map_location='cpu')
        epoch = ckp['epoch']
        logger.info("Resume from checkpoint {}: epoch {:.3f}".format(
            opt['resume']['path'], epoch))
        net.load_state_dict(ckp['model_state_dict'])
        net.to(device)
        Optimizer.load_state_dict(ckp['optim_state_dict'])
    while epoch < total_epoch:
        epoch += 1
        logger.info('Start training from epoch: {:d}, iter: {:d}'.format(
            epoch, 0))
        num_steps = len(train_dataloader)
        # training process
        total_SNRloss = 0.0
        total_CEloss = 0.0
        num_index = 1
        start_time = time.time()
        for inputs, targets in train_dataloader:
            # Separation train: plain mixture in, per-speaker references out.
            if opt['model']['MODEL'] == 'DPRNN_Speech_Separation':
                mix = inputs
                ref = targets
                net.train()
                mix = mix.to(device)
                ref = [ref[i].to(device) for i in range(num_spks)]
                net.zero_grad()
                train_out = net(mix)
                SNR_loss = Loss(train_out, ref)
                loss = SNR_loss
            # Extraction train: aux speaker clip + speaker-ID CE auxiliary loss.
            if opt['model']['MODEL'] == 'DPRNN_Speaker_Extraction':
                mix, aux = inputs
                ref, aux_len, sp_label = targets
                net.train()
                mix = mix.to(device)
                aux = aux.to(device)
                ref = ref.to(device)
                aux_len = aux_len.to(device)
                sp_label = sp_label.to(device)
                net.zero_grad()
                train_out = net([mix, aux, aux_len])
                SNR_loss = Loss_SI_SDR(train_out[0], ref)
                CE_loss = torch.mean(ce_loss(train_out[1], sp_label))
                loss = SNR_loss + weight * CE_loss
                total_CEloss += CE_loss.item()
            # Suppression train: same inputs as extraction, no speaker label.
            if opt['model']['MODEL'] == 'DPRNN_Speaker_Suppression':
                mix, aux = inputs
                ref, aux_len = targets
                net.train()
                mix = mix.to(device)
                aux = aux.to(device)
                ref = ref.to(device)
                aux_len = aux_len.to(device)
                net.zero_grad()
                train_out = net([mix, aux, aux_len])
                SNR_loss = Loss_SI_SDR(train_out[0], ref)
                loss = SNR_loss
            # Backprop with gradient clipping.
            loss.backward()
            torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm)
            Optimizer.step()
            total_SNRloss += SNR_loss.item()
            if num_index % print_freq == 0:
                message = '<Training epoch:{:d} / {:d} , iter:{:d} / {:d}, lr:{:.3e}, SI-SNR_loss:{:.3f}, CE loss:{:.3f}>'.format(
                    epoch, total_epoch, num_index, num_steps,
                    Optimizer.param_groups[0]['lr'],
                    total_SNRloss / num_index, total_CEloss / num_index)
                logger.info(message)
            num_index += 1
        end_time = time.time()
        # NOTE(review): divides by num_index (= batch count + 1), slightly
        # understating the epoch means; same pattern in validation below.
        mean_SNRLoss = total_SNRloss / num_index
        mean_CELoss = total_CEloss / num_index
        message = 'Finished Training *** <epoch:{:d} / {:d}, iter:{:d}, lr:{:.3e}, ' \
            'SNR loss:{:.3f}, CE loss:{:.3f}, Total time:{:.3f} min> '.format(
                epoch, total_epoch, num_index,
                Optimizer.param_groups[0]['lr'], mean_SNRLoss, mean_CELoss,
                (end_time - start_time) / 60)
        logger.info(message)
        # validation process
        val_num_index = 1
        val_total_loss = 0.0
        val_CE_loss = 0.0
        val_acc_total = 0.0
        val_acc = 0.0
        val_start_time = time.time()
        val_num_steps = len(val_dataloader)
        for inputs, targets in val_dataloader:
            net.eval()
            with torch.no_grad():
                # Separation development
                if opt['model']['MODEL'] == 'DPRNN_Speech_Separation':
                    mix = inputs
                    ref = targets
                    mix = mix.to(device)
                    ref = [ref[i].to(device) for i in range(num_spks)]
                    Optimizer.zero_grad()
                    val_out = net(mix)
                    val_loss = Loss(val_out, ref)
                    val_total_loss += val_loss.item()
                # Extraction development (also tracks speaker accuracy)
                if opt['model']['MODEL'] == 'DPRNN_Speaker_Extraction':
                    mix, aux = inputs
                    ref, aux_len, label = targets
                    mix = mix.to(device)
                    aux = aux.to(device)
                    ref = ref.to(device)
                    aux_len = aux_len.to(device)
                    label = label.to(device)
                    Optimizer.zero_grad()
                    val_out = net([mix, aux, aux_len])
                    val_loss = Loss_SI_SDR(val_out[0], ref)
                    val_ce = torch.mean(ce_loss(val_out[1], label))
                    val_acc = accuracy_speaker(val_out[1], label)
                    val_acc_total += val_acc
                    val_total_loss += val_loss.item()
                    val_CE_loss += val_ce.item()
                # Suppression development
                if opt['model']['MODEL'] == 'DPRNN_Speaker_Suppression':
                    mix, aux = inputs
                    ref, aux_len = targets
                    mix = mix.to(device)
                    aux = aux.to(device)
                    ref = ref.to(device)
                    aux_len = aux_len.to(device)
                    Optimizer.zero_grad()
                    val_out = net([mix, aux, aux_len])
                    val_loss = Loss_SI_SDR(val_out[0], ref)
                    val_total_loss += val_loss.item()
                if val_num_index % print_freq == 0:
                    message = '<Valid-Epoch:{:d} / {:d}, iter:{:d} / {:d}, lr:{:.3e}, ' \
                        'val_SISNR_loss:{:.3f}, val_CE_loss:{:.3f}, val_acc :{:.3f}>' .format(
                            epoch, total_epoch, val_num_index, val_num_steps,
                            Optimizer.param_groups[0]['lr'],
                            val_total_loss / val_num_index,
                            val_CE_loss / val_num_index,
                            val_acc_total / val_num_index)
                    logger.info(message)
                val_num_index += 1
        val_end_time = time.time()
        mean_val_total_loss = val_total_loss / val_num_index
        mean_val_CE_loss = val_CE_loss / val_num_index
        mean_acc = val_acc_total / val_num_index
        message = 'Finished *** <epoch:{:d}, iter:{:d}, lr:{:.3e}, val SI-SNR loss:{:.3f}, val_CE_loss:{:.3f}, val_acc:{:.3f}' \
            ' Total time:{:.3f} min> '.format(
                epoch, val_num_index, Optimizer.param_groups[0]['lr'],
                mean_val_total_loss, mean_val_CE_loss, mean_acc,
                (val_end_time - val_start_time) / 60)
        logger.info(message)
        # LR decays when the validation loss plateaus.
        Scheduler.step(mean_val_total_loss)
        if mean_val_total_loss >= best_loss:
            no_improve += 1
            logger.info(
                'No improvement, Best SI-SNR Loss: {:.4f}'.format(best_loss))
        if mean_val_total_loss < best_loss:
            best_loss = mean_val_total_loss
            no_improve = 0
            save_checkpoint(epoch, checkpoint_path, net, Optimizer, day_time)
            logger.info(
                'Epoch: {:d}, Now Best SI-SNR Loss Change: {:.4f}'.format(
                    epoch, best_loss))
        # Early stopping: save a final checkpoint and bail out.
        if no_improve == early_stop:
            save_checkpoint(epoch, checkpoint_path, net, Optimizer, day_time)
            logger.info("Stop training cause no impr for {:d} epochs".format(
                no_improve))
            break