import json
import random
import tempfile

import numpy as np
import torch
from torch.utils.data import DataLoader

import neptune

# Project-local helpers (worker_parser, build_dataset_providers, DictCollater,
# str2bool, trainer) are assumed to be imported from the PASE codebase.


def train(opts):
    CUDA = torch.cuda.is_available() and not opts.no_cuda
    device = 'cuda' if CUDA else 'cpu'
    num_devices = 1
    # Seed every RNG for reproducibility
    np.random.seed(opts.seed)
    random.seed(opts.seed)
    torch.manual_seed(opts.seed)
    if CUDA:
        torch.cuda.manual_seed_all(opts.seed)
        num_devices = torch.cuda.device_count()
        print('[*] Using CUDA {} devices'.format(num_devices))
    else:
        print('[!] Using CPU')
    print('Seeds initialized to {}'.format(opts.seed))
    #torch.autograd.set_detect_anomaly(True)

    # ---------------------
    # Build Model
    minions_cfg = worker_parser(opts.net_cfg)
    #make_transforms(opts, minions_cfg)
    opts.random_scale = str2bool(opts.random_scale)
    dsets, collater_keys = build_dataset_providers(opts, minions_cfg)
    dset, va_dset = dsets
    # Build collater, appending the keys from the loaded transforms to the
    # existing default ones
    collater = DictCollater()
    collater.batching_keys.extend(collater_keys)
    dloader = DataLoader(dset, batch_size=opts.batch_size,
                         shuffle=True, collate_fn=collater,
                         num_workers=opts.num_workers,
                         drop_last=True, pin_memory=CUDA)
    # Estimate bpe (batches per epoch). Since chunks are sampled randomly,
    # an epoch is defined as having seen at least
    # total_train_wav_dur // chunk_size chunks.
    bpe = (dset.total_wav_dur // opts.chunk_size) // opts.batch_size
    # Assumes 16 kHz audio
    print('Dataset has a total {} hours of training data'.format(
        dset.total_wav_dur / 16000 / 3600.0))
    opts.bpe = bpe
    if opts.do_eval:
        assert va_dset is not None, (
            'Asked to do validation, but failed to build validation set')
        va_dloader = DataLoader(va_dset, batch_size=opts.batch_size,
                                shuffle=True, collate_fn=DictCollater(),
                                num_workers=opts.num_workers,
                                drop_last=True, pin_memory=CUDA)
        va_bpe = (va_dset.total_wav_dur // opts.chunk_size) // opts.batch_size
        opts.va_bpe = va_bpe
    else:
        va_dloader = None

    # Faster minimum LR for the MI worker (disabled)
    #opts.min_lrs = {'mi': 0.001}

    # Load the frontend (encoder) config, if given
    if opts.fe_cfg is not None:
        with open(opts.fe_cfg, 'r') as fe_cfg_f:
            fe_cfg = json.load(fe_cfg_f)
            print(fe_cfg)
    else:
        fe_cfg = None

    # Load the config file for attention blocks, if given
    if opts.att_cfg:
        with open(opts.att_cfg) as f:
            att_cfg = json.load(f)
            print(att_cfg)
    else:
        att_cfg = None

    Trainer = trainer(frontend_cfg=fe_cfg,
                      att_cfg=att_cfg,
                      minions_cfg=minions_cfg,
                      cfg=vars(opts),
                      backprop_mode=opts.backprop_mode,
                      lr_mode=opts.lr_mode,
                      tensorboard=str2bool(opts.tensorboard),
                      device=device)

    # Dump the model description to a temp file so it can be logged as an
    # artifact below
    model_description = str(Trainer.model)
    tfh = tempfile.NamedTemporaryFile(mode='w')
    tfh.write(model_description)
    tfh.flush()
    print(model_description)
    num_params = Trainer.model.frontend.describe_params()
    print(f'Frontend params: {num_params}')

    # Prepare logging
    neptune_settings = None
    npt_exp = None
    if opts.neptune is not None:
        with open(opts.neptune, 'r') as fh:
            neptune_settings = json.load(fh)
        neptune.init(neptune_settings['project_name'],
                     api_token=neptune_settings['api_key'])
        npt_exp = neptune.create_experiment(params=vars(opts),
                                            name=opts.experimentname,
                                            tags=opts.tags)
    else:
        # Running offline
        neptune.init(backend=neptune.OfflineBackend(),
                     project_qualified_name='offline/PASE+')
    neptune.log_artifact(tfh.name, 'model_description.txt')
    tfh.close()
    neptune.set_property('frontend_params', num_params)

    Trainer.model.to(device)
    Trainer.train_(dloader, device=device, valid_dataloader=va_dloader)
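# A minimal invocation sketch (hypothetical: the real train.py defines many
# more flags, e.g. dataset paths consumed by build_dataset_providers(), and
# the defaults below are illustrative assumptions, not the repo's values).
# Every flag shown mirrors an attribute that train() above actually reads.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='PASE+ self-supervised training')
    parser.add_argument('--net_cfg', type=str, required=True,
                        help='JSON config describing the worker (minion) heads')
    parser.add_argument('--fe_cfg', type=str, default=None,
                        help='JSON config for the frontend encoder')
    parser.add_argument('--att_cfg', type=str, default=None,
                        help='JSON config for attention blocks')
    parser.add_argument('--no_cuda', action='store_true')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--chunk_size', type=int, default=32000)
    parser.add_argument('--random_scale', type=str, default='False')
    parser.add_argument('--tensorboard', type=str, default='False')
    parser.add_argument('--do_eval', action='store_true')
    parser.add_argument('--backprop_mode', type=str, default='base')
    parser.add_argument('--lr_mode', type=str, default='step')
    parser.add_argument('--neptune', type=str, default=None,
                        help='JSON file with Neptune project_name and api_key')
    parser.add_argument('--experimentname', type=str, default='PASE+')
    parser.add_argument('--tags', type=str, nargs='+', default=None)
    opts = parser.parse_args()
    train(opts)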