def data_loaders(self, batch_size):
    train_dataset = SpectrogramDataset(audio_conf=self.audio_conf,
                                       manifest_filepath=self.train_manifest,
                                       labels=self.labels, normalize=True,
                                       speed_volume_perturb=self.speed_volume_perturb,
                                       spec_augment=self.spec_augment)
    test_dataset = SpectrogramDataset(audio_conf=self.audio_conf,
                                      manifest_filepath=self.val_manifest,
                                      labels=self.labels, normalize=True,
                                      speed_volume_perturb=False,
                                      spec_augment=False)
    if not self.distributed:
        print('BucketingSampler')
        self.train_sampler = BucketingSampler(train_dataset, batch_size=batch_size)
    else:
        print('DistributedBucketingSampler')
        self.train_sampler = DistributedBucketingSampler(train_dataset,
                                                         batch_size=batch_size,
                                                         num_replicas=self.args.world_size,
                                                         rank=self.rank)
    self.train_loader = AudioDataLoader(train_dataset,
                                        num_workers=self.args.num_workers,
                                        batch_sampler=self.train_sampler)
    self.test_loader = AudioDataLoader(test_dataset,
                                       batch_size=self.args.batch_size,
                                       num_workers=self.args.num_workers)
    if (not self.no_shuffle and self.start_epoch != 0) or self.no_sorta_grad:
        print("Shuffling batches for the following epochs")
        self.train_sampler.shuffle(self.start_epoch)
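# BucketingSampler groups the duration-sorted dataset into contiguous
# fixed-size bins so similarly-long utterances are padded together, and
# shuffle(epoch) permutes whole bins between epochs. A minimal sketch of the
# idea (illustrative, not the library's exact class):
import numpy as np
from torch.utils.data import Sampler

class SimpleBucketingSampler(Sampler):
    def __init__(self, data_source, batch_size=1):
        ids = list(range(len(data_source)))
        self.bins = [ids[i:i + batch_size] for i in range(0, len(ids), batch_size)]

    def __iter__(self):
        return iter(self.bins)  # each yielded item is one whole batch of indices

    def __len__(self):
        return len(self.bins)

    def shuffle(self, epoch):
        np.random.shuffle(self.bins)  # reorder bins, keep each batch intact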
def main(args):
    # Construct Solver
    # data
    tr_dataset = AudioDataset(args.train_json, sample_rate=args.sample_rate,
                              segment_length=args.segment_length)
    cv_dataset = AudioDataset(args.valid_json, sample_rate=args.sample_rate,
                              segment_length=args.segment_length)
    tr_loader = AudioDataLoader(tr_dataset, batch_size=args.batch_size,
                                shuffle=args.shuffle,
                                num_workers=args.num_workers)
    cv_loader = AudioDataLoader(cv_dataset, batch_size=args.batch_size,
                                num_workers=0)
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}
    # model
    model = ConvTasNet(args.N, args.L, args.B, args.H, args.P, args.X, args.R,
                       args.C, norm_type=args.norm_type, causal=args.causal,
                       mask_nonlinear=args.mask_nonlinear)
    print(model)
    if args.use_cuda:
        model = torch.nn.DataParallel(model)
        model.cuda()
    # optimizer: scale the learning rate down by the number of gradient
    # accumulation steps per optimizer update
    lr = args.lr / args.batch_per_step
    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=lr,
                                    momentum=args.momentum, weight_decay=args.l2)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=lr,
                                     weight_decay=args.l2)
    else:
        print("Unsupported optimizer:", args.optimizer)
        return
    # solver
    solver = Solver(data, model, optimizer, args)
    solver.train()
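# The lr = args.lr / args.batch_per_step scaling above suggests the Solver
# sums gradients over batch_per_step mini-batches before each optimizer
# update. A minimal sketch of that accumulation loop, assuming a generic
# criterion and loader (names here are illustrative, not the Solver's API):
def train_one_epoch(model, loader, optimizer, criterion, batch_per_step=4):
    model.train()
    optimizer.zero_grad()
    for step, (mixture, source) in enumerate(loader):
        loss = criterion(model(mixture), source)
        loss.backward()  # gradients accumulate across iterations
        if (step + 1) % batch_per_step == 0:
            optimizer.step()
            optimizer.zero_grad()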
def main(args):
    # data
    tr_dataset = AudioDataset('tr', batch_size=args.batch_size,
                              sample_rate=args.sample_rate, nmic=args.mic)
    cv_dataset = AudioDataset('val', batch_size=args.batch_size,
                              sample_rate=args.sample_rate, nmic=args.mic)
    tr_loader = AudioDataLoader(tr_dataset, batch_size=1,
                                shuffle=args.shuffle,
                                num_workers=0)  # num_workers=0 for PC
    cv_loader = AudioDataLoader(cv_dataset, batch_size=1,
                                num_workers=0)  # num_workers=0 for PC
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}
    # model
    model = FaSNet_TAC(enc_dim=args.enc_dim, feature_dim=args.feature_dim,
                       hidden_dim=args.hidden_dim, layer=args.layer,
                       segment_size=args.segment_size, nspk=args.nspk,
                       win_len=args.win_len, context_len=args.context_len,
                       sr=args.sample_rate)
    k = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('# of parameters:', k)
    # print(model)
    if args.use_cuda:
        model = torch.nn.DataParallel(model)
        model.cuda()
    # optimizer
    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                    momentum=args.momentum, weight_decay=args.l2)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                     weight_decay=args.l2)
    else:
        print("Unsupported optimizer:", args.optimizer)
        return
    # solver
    solver = Solver(data, model, optimizer, args)
    solver.train()
def main(args):
    # Construct Solver
    # data
    tr_dataset = AudioDataset(args.train_dir, args.batch_size,
                              sample_rate=args.sample_rate, segment=args.segment)
    cv_dataset = AudioDataset(args.valid_dir,
                              batch_size=1,  # 1 -> use less GPU memory to do cv
                              sample_rate=args.sample_rate,
                              segment=-1,  # -1 -> use full audio
                              cv_maxlen=args.cv_maxlen)
    tr_loader = AudioDataLoader(tr_dataset, batch_size=1,
                                shuffle=args.shuffle, num_workers=4)
    cv_loader = AudioDataLoader(cv_dataset, batch_size=1, num_workers=4,
                                pin_memory=True)
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}
    # model
    model = ConvTasNet(args.N, args.L, args.B, args.H, args.P, args.X, args.R,
                       args.C, norm_type=args.norm_type, causal=args.causal,
                       mask_nonlinear=args.mask_nonlinear)
    if args.use_cuda:
        model = torch.nn.DataParallel(model)
        model.cuda()
    # optimizer
    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                    momentum=args.momentum, weight_decay=args.l2)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                     weight_decay=args.l2)
    else:
        print("Unsupported optimizer:", args.optimizer)
        return
    # solver
    solver = Solver(data, model, optimizer, args)
    solver.train()
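# A minimal sketch of the pattern used throughout these scripts: AudioDataset
# pre-builds minibatches (it takes batch_size itself), so AudioDataLoader
# runs with batch_size=1 and its collate function just unwraps and pads the
# single pre-built batch. Names below are illustrative, not the repo's exact
# collate function:
import torch

def unwrap_collate(batch):
    utterances = batch[0]  # batch is a one-element list holding a minibatch
    mixtures = [torch.as_tensor(m, dtype=torch.float32) for m, _ in utterances]
    lengths = torch.tensor([m.numel() for m in mixtures])
    padded = torch.nn.utils.rnn.pad_sequence(mixtures, batch_first=True)
    return padded, lengths

# usage: DataLoader(tr_dataset, batch_size=1, collate_fn=unwrap_collate)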
def main(args):
    # Construct Solver
    # data
    tr_dataset = AudioDataset(args.train_json, args.batch_size,
                              args.maxlen_in, args.maxlen_out)
    cv_dataset = AudioDataset(args.valid_json, args.batch_size,
                              args.maxlen_in, args.maxlen_out)
    tr_loader = AudioDataLoader(tr_dataset, batch_size=1,
                                num_workers=args.num_workers)
    cv_loader = AudioDataLoader(cv_dataset, batch_size=1,
                                num_workers=args.num_workers)
    # load dictionary and generate char_list, sos_id, eos_id
    char_list, sos_id, eos_id = process_dict(args.dict)
    vocab_size = len(char_list)
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}
    # model
    encoder = Encoder(args.einput, args.ehidden, args.elayer,
                      dropout=args.edropout,
                      bidirectional=args.ebidirectional,
                      rnn_type=args.etype)
    decoder = Decoder(vocab_size, args.dembed, sos_id, eos_id,
                      args.dhidden, args.dlayer,
                      bidirectional_encoder=args.ebidirectional)
    model = Seq2Seq(encoder, decoder)
    print(model)
    model.cuda()
    # optimizer
    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                    momentum=args.momentum, weight_decay=args.l2)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                     weight_decay=args.l2)
    else:
        print("Unsupported optimizer:", args.optimizer)
        return
    # solver
    solver = Solver(data, model, optimizer, args)
    solver.train()
def separate(model, dataset, output_dir, sr=8000):
    model.to(device)
    model.eval()
    # Load data
    data_loader = AudioDataLoader(dataset, batch_size=1)
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    with torch.no_grad():
        for i, (mixture, name) in enumerate(data_loader):
            # Get batch data
            mixture = mixture.to(device)
            # Forward
            estimate_source = model(mixture).squeeze(0)  # [C, T]
            # Write result. os.path.splitext drops the extension; the original
            # name.strip('.wav') would also strip leading/trailing 'w', 'a',
            # 'v' characters from the name itself.
            filename = os.path.join(output_dir, os.path.splitext(name)[0])
            librosa.output.write_wav(f'{filename}.wav',
                                     mixture.squeeze(0).cpu().numpy(), sr)
            C = estimate_source.size(0)
            for c in range(C):
                librosa.output.write_wav(f'{filename}_s{c + 1}.wav',
                                         estimate_source[c].cpu().numpy(), sr)
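# librosa.output.write_wav was removed in librosa 0.8, so on current librosa
# the writes above fail. The same output can be produced with the soundfile
# package; a minimal sketch under that assumption:
import soundfile as sf

def write_wav(path, audio, sr=8000):
    # audio: 1-D float numpy array, sr: sample rate in Hz
    sf.write(path, audio, sr)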
def main(args):
    # Construct Solver
    # data
    tr_dataset = AudioDataset(args.tr_json, sample_rate=args.sample_rate,
                              segment=args.segment, drop=args.drop)
    cv_dataset = AudioDataset(args.cv_json, sample_rate=args.sample_rate,
                              drop=0, segment=-1)  # -1 -> use full audio
    tr_loader = AudioDataLoader(tr_dataset, batch_size=args.batch_size,
                                shuffle=args.shuffle,
                                num_workers=args.num_workers)
    cv_loader = AudioDataLoader(cv_dataset, batch_size=1, num_workers=0)
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}
    # model
    # N=512, L=32, B=128, Sc=128, H=512, X=8, R=3, P=3, C=2
    model = ConvTasNet(args.N, args.L, args.B, args.Sc, args.H,
                       args.X, args.R, args.P, args.C)
    print(model)
    if args.use_cuda:
        # NOTE: CUDA_VISIBLE_DEVICES only takes effect if set before CUDA is
        # initialized; prefer setting it in the environment before launch
        os.environ["CUDA_VISIBLE_DEVICES"] = '5,6,7'
        model = torch.nn.DataParallel(model)
        model.cuda()
    # optimizer
    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                    momentum=args.momentum, weight_decay=args.l2)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                     weight_decay=args.l2)
    else:
        print("Unsupported optimizer:", args.optimizer)
        return
    # solver
    solver = Solver(data, model, optimizer, args)
    solver.train()
def main(args):
    # Construct Solver
    # data
    tr_dataset = AudioDataset(args.train_dir, args.batch_size,
                              sample_rate=args.sample_rate, segment=args.segment)
    cv_dataset = AudioDataset(args.valid_dir,
                              batch_size=1,  # 1 -> use less GPU memory to do cv
                              sample_rate=args.sample_rate,
                              segment=-1,  # -1 -> use full audio
                              cv_maxlen=args.cv_maxlen)
    tr_loader = AudioDataLoader(tr_dataset, batch_size=1,
                                shuffle=args.shuffle,
                                num_workers=args.num_workers)
    cv_loader = AudioDataLoader(cv_dataset, batch_size=1, num_workers=0)
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}
    # model
    model = DPTNet(args.N, args.C, args.L, args.H, args.K, args.B)
    # print(model)
    k = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('# of parameters:', k)
    if args.use_cuda:
        # NOTE: CUDA_VISIBLE_DEVICES only takes effect if set before CUDA is
        # initialized; prefer setting it in the environment before launch
        os.environ["CUDA_VISIBLE_DEVICES"] = '5,6,7'
        model = torch.nn.DataParallel(model)
        model.cuda()
    # optimizer
    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                    momentum=args.momentum, weight_decay=args.l2)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                     weight_decay=args.l2)
    else:
        print("Unsupported optimizer:", args.optimizer)
        return
    # solver
    solver = Solver(data, model, optimizer, args)
    solver.train()
def main(args):
    # Construct Solver
    # data
    tr_dataset = AudioDataset(args.train_dir, args.batch_size,
                              sample_rate=args.sample_rate, L=args.L)
    cv_dataset = AudioDataset(args.valid_dir, args.batch_size,
                              sample_rate=args.sample_rate, L=args.L)
    tr_loader = AudioDataLoader(tr_dataset, batch_size=1,
                                shuffle=args.shuffle,
                                num_workers=args.num_workers)
    cv_loader = AudioDataLoader(cv_dataset, batch_size=1,
                                shuffle=args.shuffle,
                                num_workers=args.num_workers)
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}
    # model
    model = TasNet(args.L, args.N, args.hidden_size, args.num_layers,
                   bidirectional=args.bidirectional, nspk=args.nspk)
    print(model)
    model.cuda()
    # optimizer
    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                    momentum=args.momentum, weight_decay=args.l2)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                     weight_decay=args.l2)
    else:
        print("Unsupported optimizer:", args.optimizer)
        return
    # solver
    solver = Solver(data, model, optimizer, args)
    solver.train()
def evaluate(args):
    total_SISNRi = 0
    total_SDRi = 0
    total_cnt = 0
    # Load model
    model = ConvTasNet.load_model(args.model_path)
    print(model)
    model.eval()
    if args.use_cuda:
        model.cuda()
    # Load data
    dataset = AudioDataset(args.data_dir, args.batch_size,
                           sample_rate=args.sample_rate, segment=-1)
    data_loader = AudioDataLoader(dataset, batch_size=1, num_workers=2)
    with torch.no_grad():
        for i, data in enumerate(data_loader):
            # Get batch data
            padded_mixture, mixture_lengths, padded_source = data
            if args.use_cuda:
                padded_mixture = padded_mixture.cuda()
                mixture_lengths = mixture_lengths.cuda()
                padded_source = padded_source.cuda()
            # Forward
            estimate_source = model(padded_mixture)  # [B, C, T]
            loss, max_snr, estimate_source, reorder_estimate_source = \
                cal_loss(padded_source, estimate_source, mixture_lengths)
            # Remove padding and flatten
            mixture = remove_pad(padded_mixture, mixture_lengths)
            source = remove_pad(padded_source, mixture_lengths)
            # NOTE: use the reordered (permutation-resolved) estimates
            estimate_source = remove_pad(reorder_estimate_source, mixture_lengths)
            # for each utterance
            for mix, src_ref, src_est in zip(mixture, source, estimate_source):
                print("Utt", total_cnt + 1)
                # Compute SDRi
                if args.cal_sdr:
                    avg_SDRi = cal_SDRi(src_ref, src_est, mix)
                    total_SDRi += avg_SDRi
                    print("\tSDRi={0:.2f}".format(avg_SDRi))
                # Compute SI-SNRi
                avg_SISNRi = cal_SISNRi(src_ref, src_est, mix)
                print("\tSI-SNRi={0:.2f}".format(avg_SISNRi))
                total_SISNRi += avg_SISNRi
                total_cnt += 1
    if args.cal_sdr:
        print("Average SDR improvement: {0:.2f}".format(total_SDRi / total_cnt))
    print("Average SISNR improvement: {0:.2f}".format(total_SISNRi / total_cnt))
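# cal_SISNRi above reports the scale-invariant SNR improvement of the
# estimate over the raw mixture. A minimal numpy sketch of the standard
# definition (zero-mean signals), assuming 1-D arrays; the repo's own
# implementation may differ in detail:
import numpy as np

def si_snr(est, ref, eps=1e-8):
    est, ref = est - est.mean(), ref - ref.mean()
    s_target = np.dot(est, ref) * ref / (np.dot(ref, ref) + eps)
    e_noise = est - s_target
    return 10 * np.log10(np.dot(s_target, s_target) /
                         (np.dot(e_noise, e_noise) + eps))

def si_snri(est, ref, mix):
    # improvement over using the mixture itself as the estimate
    return si_snr(est, ref) - si_snr(mix, ref)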
def main(args):
    # Construct Solver
    # data
    tr_dataset = AudioDataset(args.train_dir, args.batch_size,
                              sample_rate=args.sample_rate, segment=args.segment)
    cv_dataset = AudioDataset(args.valid_dir,
                              batch_size=1,  # 1 -> use less GPU memory to do cv
                              sample_rate=args.sample_rate,
                              segment=-1,  # -1 -> use full audio
                              cv_maxlen=args.cv_maxlen)
    tr_loader = AudioDataLoader(tr_dataset, batch_size=1, shuffle=args.shuffle)
    cv_loader = AudioDataLoader(cv_dataset, batch_size=1)
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}
    # model (FURCA kept below as a commented-out alternative)
    # model = FURCA(args.W, args.N, args.K, args.C, args.D, args.H, args.E,
    #               norm_type=args.norm_type, causal=args.causal,
    #               mask_nonlinear=args.mask_nonlinear)
    model = FaSNet_base(enc_dim=256, feature_dim=64, hidden_dim=128,
                        layer=6, segment_size=250, nspk=2, win_len=2)
    print(model)
    if args.use_cuda:
        # model = torch.nn.DataParallel(model)
        model.cuda()
        # model.to(device)
    # optimizer
    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                    momentum=args.momentum, weight_decay=args.l2)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                     weight_decay=args.l2)
    else:
        print("Unsupported optimizer:", args.optimizer)
        return
    # solver
    solver = Solver(data, model, optimizer, args)
    solver.train()
def main(train_dir, batch_size, sample_rate, segment, valid_dir, cv_maxlen,
         shuffle, num_workers, N, L, B, H, P, X, R, C, norm_type, causal,
         mask_nonlinear, use_cuda, optimizer, lr, momentum, l2,
         epochs, half_lr, early_stop, max_norm, save_folder, checkpoint,
         continue_from, model_path, print_freq, visdom, visdom_epoch,
         visdom_id):
    # Construct Solver
    # data
    tr_dataset = AudioDataset(train_dir, batch_size,
                              sample_rate=sample_rate, segment=segment)
    cv_dataset = AudioDataset(valid_dir,
                              batch_size=1,  # 1 -> use less GPU memory to do cv
                              sample_rate=sample_rate,
                              segment=-1,  # -1 -> use full audio
                              cv_maxlen=cv_maxlen)
    tr_loader = AudioDataLoader(tr_dataset, batch_size=1, shuffle=shuffle,
                                num_workers=num_workers)
    cv_loader = AudioDataLoader(cv_dataset, batch_size=1, num_workers=0)
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}
    # model
    model = ConvTasNet(N, L, B, H, P, X, R, C, norm_type=norm_type,
                       causal=causal, mask_nonlinear=mask_nonlinear)
    print(model)
    if use_cuda:
        model = torch.nn.DataParallel(model)
        model.cuda()
    # optimizer
    if optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=lr,
                                    momentum=momentum, weight_decay=l2)
    elif optimizer == 'adam':  # fatemeh: change optimizier to optimizer
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=l2)
    else:
        print("Unsupported optimizer")
        return
    # solver
    solver = Solver(data, model, optimizer, use_cuda, epochs, half_lr,
                    early_stop, max_norm, save_folder, checkpoint,
                    continue_from, model_path, print_freq, visdom,
                    visdom_epoch, visdom_id)
    solver.train()
def evaluate(model, dataset, batch_size=2, verbose=1, cal_sdr=False):
    total_SISNRi = 0
    total_SDRi = 0
    total_cnt = 0
    model.eval()
    model.to(device)
    data_loader = AudioDataLoader(dataset, batch_size=batch_size, shuffle=False)
    with torch.no_grad():
        for i, (audio, mixture_lengths) in enumerate(data_loader):
            # Get batch data: channel 0 is the mixture, the rest are sources
            padded_mixture = audio[:, 0]
            padded_source = audio[:, 1:]
            padded_mixture = padded_mixture.to(device)
            mixture_lengths = mixture_lengths.to(device)
            padded_source = padded_source.to(device)
            # Forward
            estimate_source = model(padded_mixture)  # [B, C, T]
            loss, max_snr, estimate_source, reorder_estimate_source = \
                cal_loss(padded_source, estimate_source, mixture_lengths)
            # Remove padding and flatten
            mixture = remove_pad(padded_mixture, mixture_lengths)
            source = remove_pad(padded_source, mixture_lengths)
            # NOTE: use the reordered (permutation-resolved) estimates
            estimate_source = remove_pad(reorder_estimate_source, mixture_lengths)
            # for each utterance
            for mix, src_ref, src_est in zip(mixture, source, estimate_source):
                if verbose == 1:
                    print("Utt", total_cnt + 1)
                # Compute SDRi
                if cal_sdr:
                    avg_SDRi = cal_SDRi(src_ref, src_est, mix)
                    total_SDRi += avg_SDRi
                    if verbose == 1:
                        print(f"\tSDRi={avg_SDRi:.2f}")
                # Compute SI-SNRi
                avg_SISNRi = cal_SISNRi(src_ref, src_est, mix)
                if verbose == 1:
                    print(f"\tSI-SNRi={avg_SISNRi:.2f}")
                total_SISNRi += avg_SISNRi
                total_cnt += 1
    if cal_sdr:
        print(f"Average SDR improvement: {total_SDRi / total_cnt:.2f}")
    print(f"Average SISNR improvement: {total_SISNRi / total_cnt:.2f}")
num_batches = 10
num_workers = 2
batch_frames = 2000

# test batch_frames
train_dataset = AudioDataset(train_json, batch_size, max_length_in,
                             max_length_out, num_batches,
                             batch_frames=batch_frames)
for i, minibatch in enumerate(train_dataset):
    print(i)
    print(minibatch)
exit(0)

# test
train_dataset = AudioDataset(train_json, batch_size, max_length_in,
                             max_length_out, num_batches)
# NOTE: must set batch_size=1 here, since the dataset yields whole minibatches.
train_loader = AudioDataLoader(train_dataset, batch_size=1,
                               num_workers=num_workers, LFR_m=4, LFR_n=3)
import torch
# torch.set_printoptions(threshold=10000000)
for i, data in enumerate(train_loader):
    inputs, inputs_lens, targets = data
    print(i)
    # print(inputs)
    print(inputs.size())
    print(inputs_lens)
    # print(targets)
    print("*" * 20)
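# LFR_m / LFR_n configure low-frame-rate features: stack LFR_m consecutive
# frames and advance LFR_n frames per step, trading time resolution for a
# shorter, wider input sequence. A minimal numpy sketch of the usual
# construction (illustrative; the repo's own LFR builder may differ):
import numpy as np

def build_lfr(feats, m=4, n=3):
    # feats: [T, D] -> [ceil(T / n), D * m]
    T, D = feats.shape
    out = []
    for i in range(0, T, n):
        chunk = feats[i:i + m]
        if len(chunk) < m:  # pad the tail by repeating the last frame
            chunk = np.vstack([chunk, np.tile(feats[-1], (m - len(chunk), 1))])
        out.append(chunk.reshape(-1))
    return np.stack(out)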
def evaluate(args):
    total_SISNRi = 0
    total_SDRi = 0
    total_cnt = 0
    # Load model
    model = DPTNet(args.N, args.C, args.L, args.H, args.K, args.B)
    # Strip the 'module.' prefix that DataParallel adds to checkpoint keys,
    # and load the weights *before* wrapping the model in DataParallel;
    # loading stripped keys into an already-wrapped model fails.
    from collections import OrderedDict
    model_info = torch.load(args.model_path, map_location='cpu')
    state_dict = OrderedDict()
    for k, v in model_info['model_state_dict'].items():
        state_dict[k.replace("module.", "")] = v
    model.load_state_dict(state_dict)
    if args.use_cuda:
        model = torch.nn.DataParallel(model)
        model.cuda()
    model.eval()
    print(model)
    # Load data
    dataset = AudioDataset(args.data_dir, args.batch_size,
                           sample_rate=args.sample_rate, segment=-1)
    data_loader = AudioDataLoader(dataset, batch_size=1, num_workers=2)
    with torch.no_grad():
        for i, data in enumerate(data_loader):
            # Get batch data
            padded_mixture, mixture_lengths, padded_source = data
            if args.use_cuda:
                padded_mixture = padded_mixture.cuda()
                mixture_lengths = mixture_lengths.cuda()
                padded_source = padded_source.cuda()
            # Forward
            estimate_source = model(padded_mixture)  # [B, C, T]
            loss, max_snr, estimate_source, reorder_estimate_source = \
                cal_loss(padded_source, estimate_source, mixture_lengths)
            # Remove padding and flatten
            mixture = remove_pad(padded_mixture, mixture_lengths)
            source = remove_pad(padded_source, mixture_lengths)
            # NOTE: use the reordered (permutation-resolved) estimates
            estimate_source = remove_pad(reorder_estimate_source, mixture_lengths)
            # for each utterance
            for mix, src_ref, src_est in zip(mixture, source, estimate_source):
                print("Utt", total_cnt + 1)
                # Compute SDRi
                if args.cal_sdr:
                    avg_SDRi = cal_SDRi(src_ref, src_est, mix)
                    total_SDRi += avg_SDRi
                    print("\tSDRi={0:.2f}".format(avg_SDRi))
                # Compute SI-SNRi
                avg_SISNRi = cal_SISNRi(src_ref, src_est, mix)
                print("\tSI-SNRi={0:.2f}".format(avg_SISNRi))
                total_SISNRi += avg_SISNRi
                total_cnt += 1
    if args.cal_sdr:
        print("Average SDR improvement: {0:.2f}".format(total_SDRi / total_cnt))
    print("Average SISNR improvement: {0:.2f}".format(total_SISNRi / total_cnt))
def main(args):
    # Construct Solver
    # data
    tr_dataset = AudioDataset(args.train_json, args.batch_size,
                              args.maxlen_in, args.maxlen_out,
                              batch_frames=args.batch_frames)
    cv_dataset = AudioDataset(args.valid_json, args.batch_size,
                              args.maxlen_in, args.maxlen_out,
                              batch_frames=args.batch_frames)
    tr_loader = AudioDataLoader(tr_dataset, batch_size=1,
                                num_workers=args.num_workers,
                                shuffle=args.shuffle,
                                LFR_m=args.LFR_m, LFR_n=args.LFR_n)
    cv_loader = AudioDataLoader(cv_dataset, batch_size=1,
                                num_workers=args.num_workers,
                                LFR_m=args.LFR_m, LFR_n=args.LFR_n)
    # load dictionary and generate char_list, sos_id, eos_id
    char_list, sos_id, eos_id = process_dict(args.dict)
    vocab_size = len(char_list)
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}
    # model
    encoder = Encoder(args.d_input * args.LFR_m, args.n_layers_enc, args.n_head,
                      args.d_k, args.d_v, args.d_model, args.d_inner,
                      dropout=args.dropout, pe_maxlen=args.pe_maxlen)
    decoder = Decoder(sos_id, eos_id, vocab_size,
                      args.d_word_vec, args.n_layers_dec, args.n_head,
                      args.d_k, args.d_v, args.d_model, args.d_inner,
                      dropout=args.dropout,
                      tgt_emb_prj_weight_sharing=args.tgt_emb_prj_weight_sharing,
                      pe_maxlen=args.pe_maxlen)
    model = Transformer(encoder, decoder)
    print(model)
    model.cuda()
    model = torch.nn.DataParallel(model, device_ids=[0, 1, 2, 3])
    # optimizer
    optimizer = TransformerOptimizer(
        torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
        args.k, args.d_model, args.warmup_steps)
    # solver
    solver = Solver(data, model, optimizer, args)
    solver.train()
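# TransformerOptimizer wraps Adam with the warmup schedule from "Attention Is
# All You Need": lr = k * d_model**-0.5 * min(step**-0.5,
# step * warmup_steps**-1.5). A minimal sketch under that assumption (the
# repo's class may differ in detail):
class NoamOpt:
    def __init__(self, optimizer, k, d_model, warmup_steps=4000):
        self.optimizer, self.k = optimizer, k
        self.d_model, self.warmup, self.step_num = d_model, warmup_steps, 0

    def step(self):
        self.step_num += 1
        lr = self.k * self.d_model ** -0.5 * min(
            self.step_num ** -0.5, self.step_num * self.warmup ** -1.5)
        for g in self.optimizer.param_groups:
            g['lr'] = lr  # linear warmup, then inverse-sqrt decay
        self.optimizer.step()

    def zero_grad(self):
        self.optimizer.zero_grad()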
def main(args):
    # Construct Solver
    # data
    tr_dataset = AudioDataset(args.train_json, args.batch_size,
                              args.maxlen_in, args.maxlen_out)
    cv_dataset = AudioDataset(args.valid_json, args.batch_size,
                              args.maxlen_in, args.maxlen_out)
    tr_loader = AudioDataLoader(tr_dataset, batch_size=1,
                                num_workers=args.num_workers,
                                LFR_m=args.LFR_m, LFR_n=args.LFR_n,
                                align_trun=args.align_trun)
    cv_loader = AudioDataLoader(cv_dataset, batch_size=1,
                                num_workers=args.num_workers,
                                LFR_m=args.LFR_m, LFR_n=args.LFR_n,
                                align_trun=args.align_trun)
    # load dictionary and generate char_list, sos_id, eos_id
    char_list, sos_id, eos_id = process_dict(args.dict)
    args.char_list = char_list
    vocab_size = len(char_list)
    data = {'tr_loader': tr_loader, 'cv_loader': cv_loader}
    # model
    encoder = Encoder(args.einput * args.LFR_m, args.ehidden, args.elayer,
                      vocab_size, dropout=args.edropout,
                      bidirectional=args.ebidirectional, rnn_type=args.etype)
    decoder = Decoder(vocab_size, args.dembed, sos_id, eos_id,
                      args.dhidden, args.dlayer, args.offset, args.atype,
                      dropout=args.edropout,
                      bidirectional_encoder=args.ebidirectional)
    # CTC head on the encoder projections
    eprojs = args.ehidden * 2 if args.ebidirectional else args.ehidden
    ctc = CTC(odim=vocab_size, eprojs=eprojs, dropout_rate=args.edropout)
    model = Seq2Seq(encoder, decoder, ctc, args)
    print(model)
    model.cuda()
    # optimizer
    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                    momentum=args.momentum, weight_decay=args.l2)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                     weight_decay=args.l2)
    else:
        print("Unsupported optimizer:", args.optimizer)
        return
    # solver
    solver = Solver(data, model, optimizer, args)
    solver.train()
def __init__(self):
    # train_json, valid_json, dict_txt, batch sizes, etc. are expected to be
    # class-level defaults, since they are read before being reassigned here
    dir_path = os.path.dirname(os.path.realpath(__file__))
    self.train_json = os.path.join(dir_path, self.train_json)
    self.valid_json = os.path.join(dir_path, self.valid_json)
    self.dict_txt = os.path.join(dir_path, self.dict_txt)
    self.char_list, self.sos_id, self.eos_id = process_dict(self.dict_txt)
    self.vocab_size = len(self.char_list)
    self.tr_dataset = AudioDataset(self.train_json, self.batch_size,
                                   self.maxlen_in, self.maxlen_out,
                                   batch_frames=self.batch_frames)
    self.cv_dataset = AudioDataset(self.valid_json, self.batch_size,
                                   self.maxlen_in, self.maxlen_out,
                                   batch_frames=self.batch_frames)
    self.tr_loader = AudioDataLoader(self.tr_dataset, batch_size=1,
                                     num_workers=self.num_workers,
                                     shuffle=self.shuffle,
                                     LFR_m=self.LFR_m, LFR_n=self.LFR_n)
    self.cv_loader = AudioDataLoader(self.cv_dataset, batch_size=1,
                                     num_workers=self.num_workers,
                                     LFR_m=self.LFR_m, LFR_n=self.LFR_n)
    self.data = {'tr_loader': self.tr_loader, 'cv_loader': self.cv_loader}
    self.encoder = Encoder(self.d_input * self.LFR_m, self.n_layers_enc,
                           self.n_head, self.d_k, self.d_v,
                           self.d_model, self.d_inner,
                           dropout=self.dropout, pe_maxlen=self.pe_maxlen)
    self.decoder = Decoder(self.sos_id, self.eos_id, self.vocab_size,
                           self.d_word_vec, self.n_layers_dec, self.n_head,
                           self.d_k, self.d_v, self.d_model, self.d_inner,
                           dropout=self.dropout,
                           tgt_emb_prj_weight_sharing=self.tgt_emb_prj_weight_sharing,
                           pe_maxlen=self.pe_maxlen)
    # per-epoch loss history
    self.tr_loss = torch.Tensor(self.epochs)
    self.cv_loss = torch.Tensor(self.epochs)
    self.model = Transformer(self.encoder, self.decoder)
    self.optimizer = TransformerOptimizer(
        torch.optim.Adam(self.model.parameters(), betas=(0.9, 0.98), eps=1e-09),
        self.k, self.d_model, self.warmup_steps)
    self._reset()
def main(args):
    # Construct Solver
    # data
    tr_dataset = AudioDataset(args.train_json, args.batch_size,
                              args.maxlen_in, args.maxlen_out,
                              batch_frames=args.batch_frames)
    cv_dataset = AudioDataset(args.valid_json, args.batch_size,
                              args.maxlen_in, args.maxlen_out,
                              batch_frames=args.batch_frames)
    tr_loader = AudioDataLoader(tr_dataset, batch_size=1,
                                num_workers=args.num_workers,
                                shuffle=args.shuffle,
                                LFR_m=args.LFR_m, LFR_n=args.LFR_n)
    cv_loader = AudioDataLoader(cv_dataset, batch_size=1,
                                num_workers=args.num_workers,
                                LFR_m=args.LFR_m, LFR_n=args.LFR_n)
    # load dictionary and generate char_list, sos_id, eos_id
    char_list, sos_id, eos_id = process_dict(args.dict)
    vocab_size = len(char_list)
    data = {"tr_loader": tr_loader, "cv_loader": cv_loader}
    # model
    encoder = Encoder(args.d_input * args.LFR_m, args.n_layers_enc, args.n_head,
                      args.d_k, args.d_v, args.d_model, args.d_inner,
                      dropout=args.dropout, pe_maxlen=args.pe_maxlen)
    decoder = Decoder(sos_id, eos_id, vocab_size,
                      args.d_word_vec, args.n_layers_dec, args.n_head,
                      args.d_k, args.d_v, args.d_model, args.d_inner,
                      dropout=args.dropout,
                      tgt_emb_prj_weight_sharing=args.tgt_emb_prj_weight_sharing,
                      pe_maxlen=args.pe_maxlen)
    model = Transformer(encoder, decoder)
    device = flow.device("cuda")
    model.to(device)
    # optimizer
    optimizer = TransformerOptimizer(
        flow.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
        args.k, args.d_model, args.warmup_steps, args.step_num)
    # solver
    solver = Solver(data, model, optimizer, device, args)
    solver.train()
train_df = train_df.dropna(how='any')
print(train_df.head())
# test_df = pd.read_csv('test_df.csv', names=['id', 'sent'])

save_file = os.path.join('save', 'chars')
chars = get_chars('chinese', save_file, train_df)
char_to_token = {c: i for i, c in enumerate(chars)}
token_to_char = {i: c for c, i in char_to_token.items()}
sos_token = char_to_token['<sos>']
eos_token = char_to_token['<eos>']
pad_token = char_to_token['<pad>']

train_dataset = SpeechDataset(train_df, dataset_dir, char_to_token)
train_loader = AudioDataLoader(pad_token, train_dataset, batch_size=32,
                               shuffle=True, drop_last=True)
# test_dataset = SpeechDataset(test_df, dataset_dir)
# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True,
#                          collate_fn=collate_fn)

input_size = 128   # num rows in spectrogram
hidden_dim = 64    # 256*2 nodes in each LSTM
num_layers = 3
dropout = 0.1
layer_norm = False
encoder = Listener(input_size, hidden_dim, num_layers, dropout=dropout,
                   layer_norm=layer_norm)  # trailing kwarg assumed; the call
                                           # was truncated in the source
if args.decoder == "beam":
    # beam-search branch; the opening of this call was cut off in the source
    # and is reconstructed here on the assumption it builds a BeamCTCDecoder
    decoder = BeamCTCDecoder(model.labels,
                             cutoff_prob=args.cutoff_prob,
                             beam_width=args.beam_width,
                             num_processes=args.lm_workers)
elif args.decoder == "greedy":
    decoder = GreedyDecoder(model.labels,
                            blank_index=model.labels.index('_'))
else:
    decoder = None
target_decoder = GreedyDecoder(model.labels,
                               blank_index=model.labels.index('_'))
test_dataset = SpectrogramDataset(audio_conf=model.audio_conf,
                                  manifest_filepath=args.test_manifest,
                                  labels=model.labels, normalize=True)
test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size,
                              num_workers=args.num_workers)
wer, cer, output_data = evaluate(test_loader=test_loader,
                                 device=device,
                                 model=model,
                                 decoder=decoder,
                                 target_decoder=target_decoder,
                                 save_output=args.save_output,
                                 verbose=args.verbose,
                                 half=args.half)
# print('Test Summary \t'
#       'Average WER {wer:.3f}\t'
#       'Average CER {cer:.3f}\t'.format(wer=wer, cer=cer))
if args.save_output is not None:
    np.save(args.save_output, output_data)
def evaluate(args):
    total_SISNRi = 0
    total_SDRi = 0
    total_cnt = 0
    numberEsti = []  # estimated source counts (estimation code is disabled)
    # Load model
    model = ConvTasNet.load_model(args.model_path)
    # print(model)
    model.eval()
    if args.use_cuda:
        model.cuda(0)
    # Load data
    dataset = AudioDataset(args.data_dir, args.batch_size,
                           sample_rate=args.sample_rate, segment=2)
    data_loader = AudioDataLoader(dataset, batch_size=1, num_workers=2)
    with torch.no_grad():
        for i, data in enumerate(data_loader):
            print(i)
            # Get batch data
            padded_mixture, mixture_lengths, padded_source = data
            if args.use_cuda:
                padded_mixture = padded_mixture.cuda(0)
                mixture_lengths = mixture_lengths.cuda(0)
                padded_source = padded_source.cuda(0)  # needed by cal_loss below
            # Forward: this model variant also returns speaker embeddings
            estimate_source, s_embed = model(padded_mixture)  # [B, C, T], [B, N, K, E]
            loss, max_snr, estimate_source, reorder_estimate_source = \
                cal_loss(padded_source, estimate_source, mixture_lengths)
            # Remove padding and flatten
            mixture = remove_pad(padded_mixture, mixture_lengths)
            source = remove_pad(padded_source, mixture_lengths)
            # NOTE: use the reordered (permutation-resolved) estimates
            estimate_source = remove_pad(reorder_estimate_source, mixture_lengths)
            # for each utterance
            for mix, src_ref, src_est in zip(mixture, source, estimate_source):
                print("Utt", total_cnt + 1)
                # Compute SDRi
                if args.cal_sdr:
                    avg_SDRi = cal_SDRi(src_ref, src_est, mix)
                    total_SDRi += avg_SDRi
                    print("\tSDRi={0:.2f}".format(avg_SDRi))
                # Compute SI-SNRi
                avg_SISNRi = cal_SISNRi(src_ref, src_est, mix)
                print("\tSI-SNRi={0:.2f}".format(avg_SISNRi))
                total_SISNRi += avg_SISNRi
                total_cnt += 1
    if args.cal_sdr:
        print("Average SDR improvement: {0:.2f}".format(total_SDRi / total_cnt))
    print("Average SISNR improvement: {0:.2f}".format(total_SISNRi / total_cnt))
    print("speaker:2 ./ClustertrainTFSE1New/final_paper_2_3_2chobatch6.pth.tar")
    return numberEsti