def test_pad_trim(self):
    """PadTrim pads to a longer target and trims to a shorter one, in both channel layouts."""
    waveform = self.sig.clone()
    orig_len = waveform.size(0)

    # Pad: target length 20% longer than the input, length on dim 0.
    padded_len = int(orig_len * 1.2)
    out = transforms.PadTrim(max_len=padded_len, channels_first=False)(waveform)
    self.assertEqual(out.size(0), padded_len)

    # Same pad with channels first: length moves to dim 1.
    out = transforms.PadTrim(max_len=padded_len, channels_first=True)(waveform.transpose(0, 1))
    self.assertEqual(out.size(1), padded_len)

    # Trim: target length 20% shorter than the input.
    waveform = self.sig.clone()
    trimmed_len = int(waveform.size(0) * 0.8)
    out = transforms.PadTrim(max_len=trimmed_len, channels_first=False)(waveform)
    self.assertEqual(out.size(0), trimmed_len)

    # repr() should produce a truthy (non-empty) description.
    repr_test = transforms.PadTrim(max_len=trimmed_len, channels_first=False)
    self.assertTrue(repr_test.__repr__())
def test_pad_trim(self):
    """PadTrim pads to a longer target length and trims to a shorter one.

    Fix: the second argument to ``assertTrue`` used to be ``print(...)``,
    which prints unconditionally and evaluates to ``None`` — so on failure
    the report carried no message. Pass the formatted string itself so it
    is shown only when the assertion fails.
    """
    audio_orig = self.sig.clone()
    length_orig = audio_orig.size(0)

    # Pad up to 120% of the original length.
    length_new = int(length_orig * 1.2)
    result = transforms.PadTrim(max_len=length_new)(audio_orig)
    self.assertTrue(
        result.size(0) == length_new,
        "old size: {}, new size: {}".format(audio_orig.size(0), result.size(0)))

    # Trim down to 80% of the original length.
    audio_orig = self.sig.clone()
    length_orig = audio_orig.size(0)
    length_new = int(length_orig * 0.8)
    result = transforms.PadTrim(max_len=length_new)(audio_orig)
    self.assertTrue(
        result.size(0) == length_new,
        "old size: {}, new size: {}".format(audio_orig.size(0), result.size(0)))

    # Smoke-test the transform's repr.
    repr_test = transforms.PadTrim(max_len=length_new)
    repr_test.__repr__()
def test4(self):
    """Iterate a VOXFORGE loader, counting examples seen on the train and valid splits."""
    dataset = VOXFORGE(self.bdir, download=False, label_type="lang",
                       num_zips=10, randomize=False, dev_mode=False)
    dataset.find_max_len()
    # Only padding/trimming to the corpus-wide max length; labels get one-hot-style encoding.
    dataset.transform = transforms.Compose([transforms.PadTrim(dataset.maxlen)])
    dataset.target_transform = spl_transforms.LENC(dataset.LABELS)
    print(dataset.splits)
    loader = data.DataLoader(dataset, batch_size=5)
    total_train = 0
    for batch_idx, (inputs, labels) in enumerate(loader):
        dataset.set_split("train")
        total_train += labels.size(0)
        if batch_idx == 2:
            # Switch split mid-iteration and count the validation examples.
            dataset.set_split("valid")
            total_valid = 0
            for inputs_valid, labels_valid in loader:
                total_valid += labels_valid.size(0)
            print(total_valid)
    print(total_train)
def test1(self):
    """Push one VOXFORGE minibatch through a pretrained squeezenet and print the output."""
    # Data pipeline: pad/trim -> mel spectrogram -> CBL layout -> PIL -> 224x224 -> tensor.
    dataset = VOXFORGE(self.bdir, label_type="lang")
    dataset.find_max_len()
    print(dataset.maxlen)
    pipeline = tat.Compose([
        tat.PadTrim(dataset.maxlen),
        spl_transforms.MEL(n_mels=224),
        spl_transforms.BLC2CBL(),
        tvt.ToPILImage(),
        tvt.Scale((224, 224)),
        tvt.ToTensor(),
    ])
    dataset.transform = pipeline
    dataset.target_transform = spl_transforms.LENC(dataset.LABELS)
    loader = data.DataLoader(dataset, batch_size=25, shuffle=True)

    # Model and Loss: pretrained squeezenet, train mode; single batch only.
    model = models.squeezenet.squeezenet(True)
    model.train()
    for batch_idx, (minibatch, targets) in enumerate(loader):
        dataset.set_split("train")
        out = model(Variable(minibatch))
        print(minibatch.size(), minibatch.min(), minibatch.max())
        print(out.data.size())
        print(out.data)
        break
def test1(self):
    """Fine-tune the final FC layer of a pretrained resnet34 on VOXFORGE spectrograms.

    Runs 12 training steps (i == 11 breaks), validating over the whole
    loader every 5th step. Statement order matters here: the split is
    flipped between "train" and "valid" on the shared dataset object
    mid-iteration, so the training and validation loops read different data
    from the same DataLoader.
    """
    # Data
    vx = VOXFORGE(self.bdir, label_type="lang", use_cache=True)
    #vx.find_max_len()
    vx.maxlen = 150000  # fixed cap instead of scanning the corpus
    T = tat.Compose([
        tat.PadTrim(vx.maxlen),
        tat.MEL(n_mels=224),
        tat.BLC2CBL(),
        tvt.ToPILImage(),
        tvt.Scale((224, 224)),
        tvt.ToTensor(),
    ])
    TT = spl_transforms.LENC(vx.LABELS)
    vx.transform = T
    vx.target_transform = TT
    dl = data.DataLoader(vx, batch_size = 25, shuffle=True)

    # Model and Loss
    model = models.resnet.resnet34(True)
    print(model)
    criterion = nn.CrossEntropyLoss()
    # Only the final fc layer's parameters are optimized (transfer learning);
    # the commented alternatives trained other subsets.
    plist = nn.ParameterList()
    #plist.extend(list(model[0].parameters()))
    plist.extend(list(model[1].fc.parameters()))
    #plist.extend(list(model.parameters()))
    #optimizer = torch.optim.SGD(plist, lr=0.0001, momentum=0.9)
    optimizer = torch.optim.Adam(plist, lr=0.0001)
    train_losses = []
    valid_losses = []
    for i, (mb, tgts) in enumerate(dl):
        model.train()
        vx.set_split("train")
        mb, tgts = Variable(mb), Variable(tgts)
        model.zero_grad()
        out = model(mb)
        loss = criterion(out, tgts)
        loss.backward()
        optimizer.step()
        # loss.data[0] is the pre-0.4 PyTorch scalar access idiom.
        train_losses.append(loss.data[0])
        print(loss.data[0])
        if i % 5 == 0:
            # Periodic full pass over the validation split.
            start = time.time()
            model.eval()
            vx.set_split("valid")
            running_validation_loss = 0
            correct = 0
            for mb_valid, tgts_valid in dl:
                mb_valid, tgts_valid = Variable(mb_valid), Variable(tgts_valid)
                out_valid = model(mb_valid)
                loss_valid = criterion(out_valid, tgts_valid)
                running_validation_loss += loss_valid.data[0]
                correct += (out_valid.data.max(1)[1] == tgts_valid.data).sum()
            print_running_time(start)
            # NOTE(review): accuracy divides by len(vx) — presumably the size of
            # the currently-selected (valid) split; confirm against VOXFORGE.__len__.
            valid_losses.append((running_validation_loss, correct / len(vx)))
            print("loss: {}, acc: {}".format(running_validation_loss, correct / len(vx)))
        if i == 11:
            break
    vx.set_split("train")
def get_loader(config, data_dir):
    """Return a shuffled DataLoader over an AudioFolder rooted at `data_dir`.

    Every clip is padded/trimmed to a fixed 133623 samples (zero fill) and
    converted from length-channel to channel-length layout.
    """
    root = os.path.join(os.path.abspath(os.curdir), data_dir)
    print('-- Loading audios')
    pipeline = transforms.Compose([
        transforms.PadTrim(133623, 0),
        transforms.LC2CL(),
    ])
    dataset = AudioFolder(root=root, transform=pipeline)
    return DataLoader(dataset=dataset,
                      batch_size=config.batch_size,
                      shuffle=True,
                      num_workers=config.num_workers)
def test_compose(self):
    """Compose Scale and PadTrim; the result is normalized to [-1, 1] and resized.

    Fix: ``np.float`` was deprecated in NumPy 1.20 and removed in 1.24;
    ``.astype(np.float)`` raises AttributeError on modern NumPy. Use the
    builtin ``float`` (the same dtype NumPy aliased it to).
    """
    audio_orig = self.sig.clone()
    length_orig = audio_orig.size(0)
    length_new = int(length_orig * 1.2)
    # Peak absolute sample value, used as the Scale factor.
    maxminmax = np.abs([audio_orig.min(), audio_orig.max()]).max().astype(float)
    tset = (transforms.Scale(factor=maxminmax), transforms.PadTrim(max_len=length_new))
    result = transforms.Compose(tset)(audio_orig)
    self.assertTrue(np.abs([result.min(), result.max()]).max() == 1.)
    self.assertTrue(result.size(0) == length_new)
def test_compose(self):
    """Scale then PadTrim: output is normalized to [-1, 1] and has the new length."""
    signal = self.sig.clone()
    target_len = int(signal.size(0) * 1.2)
    # Peak absolute amplitude, used as the Scale factor.
    peak = max(abs(signal.min()), abs(signal.max())).item()
    chain = (transforms.Scale(factor=peak),
             transforms.PadTrim(max_len=target_len, channels_first=False))
    result = transforms.Compose(chain)(signal)
    self.assertTrue(max(abs(result.min()), abs(result.max())) == 1.)
    self.assertTrue(result.size(0) == target_len)
    # repr() of the composed transform should be truthy.
    repr_test = transforms.Compose(chain)
    self.assertTrue(repr_test.__repr__())
def test4(self):
    """Count train (and nested valid) examples from an AUDIOSET loader.

    Fix: the body referenced an undefined name ``vx`` (guaranteed NameError
    at runtime) — the dataset bound here is ``ds``; all uses now refer to it.
    """
    ds = AUDIOSET(self.bdir)
    T = transforms.Compose([
        transforms.PadTrim(ds.maxlen),
    ])
    # NOTE(review): TT binds the *module*, not a transform instance, and is
    # never applied (ds.target_transform is never set). Presumably a leftover —
    # confirm intended target transform (cf. mgc_transforms.BinENC in test1/test3).
    TT = mgc_transforms
    ds.transform = T
    dl = data.DataLoader(ds, collate_fn=bce_collate, batch_size=5)
    total_train = 0
    for i, (mb, l) in enumerate(dl):
        total_train += l.size(0)
        if i == 2:
            #ds.set_split("valid")
            total_valid = 0
            for mb_valid, l_valid in dl:
                total_valid += l_valid.size(0)
            print(total_valid)
    print(total_train)
def test3(self):
    """Test that the data loader does transforms"""
    dataset = AUDIOSET(self.bdir, randomize=True)
    # Pad/trim -> mel spectrogram -> CBL layout; targets get binary encoding.
    dataset.transform = transforms.Compose([
        transforms.PadTrim(dataset.maxlen),
        mgc_transforms.MEL(),
        mgc_transforms.BLC2CBL(),
    ])
    dataset.target_transform = mgc_transforms.BinENC(dataset.labels_dict)
    loader = data.DataLoader(dataset, collate_fn=bce_collate, batch_size=5)
    labels_total = 0
    print(dataset.labels_dict)
    for batch_idx, (features, labels) in enumerate(loader):
        print(features.size(), labels.size())
        if batch_idx > 10:
            break
def test5(self):
    """Histogram word counts over VOXFORGE prompt targets.

    Fix: ``sorted(tgts)`` built a sorted copy that was immediately discarded,
    so the following ``tgts[-1]`` read the last element of the *unsorted*
    batch, not its maximum. Use ``max(tgts)`` directly.
    """
    import numpy as np
    vx = VOXFORGE(self.bdir, download=False, label_type="prompts", dev_mode=False)
    vx.find_max_len()
    T = transforms.Compose([
        transforms.PadTrim(vx.maxlen),
    ])
    TT = spl_transforms.WC()
    vx.transform = T
    vx.target_transform = TT
    print(vx.splits)
    dl = data.DataLoader(vx, batch_size=5, collate_fn=basic_collate)
    max_wc = 0
    wc_all = []
    for i, (mb, tgts) in enumerate(dl):
        # Track the running maximum word count across all batches.
        batch_max = max(tgts)
        max_wc = batch_max if batch_max > max_wc else max_wc
        wc_all.extend(tgts)
    print(np.histogram(wc_all, bins=20), len(wc_all))
def test3(self):
    """Report the fraction of zero labels on the VOXFORGE validation split."""
    dataset = VOXFORGE(self.bdir, download=False, label_type="lang", num_zips=10,
                       randomize=False, split="valid", dev_mode=False)
    dataset.find_max_len()
    # Pad/trim -> mel spectrogram -> CBL layout; labels get LENC encoding.
    dataset.transform = transforms.Compose([
        transforms.PadTrim(dataset.maxlen),
        spl_transforms.MEL(),
        spl_transforms.BLC2CBL(),
    ])
    dataset.target_transform = spl_transforms.LENC(dataset.LABELS)
    loader = data.DataLoader(dataset, batch_size=5)
    labels_total = 0
    for batch_idx, (features, labels) in enumerate(loader):
        labels_total += labels.sum()
    print((len(dataset) - labels_total) / len(dataset))
def test1(self):
    """Test that the data loader does transforms"""
    NMELS = 224
    dataset = AUDIOSET(self.bdir, randomize=True)
    # Pad/trim -> mel spectrogram (NMELS bands) -> CBL layout; binary-encoded targets.
    dataset.transform = transforms.Compose([
        transforms.PadTrim(dataset.maxlen),
        mgc_transforms.MEL(n_mels=NMELS),
        mgc_transforms.BLC2CBL(),
    ])
    dataset.target_transform = mgc_transforms.BinENC(dataset.labels_dict)
    loader = data.DataLoader(dataset, collate_fn=bce_collate, batch_size=5)
    labels_total = 0
    for batch_idx, (features, labels) in enumerate(loader):
        print(features.size(), labels.size())
        break
    # Spectrogram trailing dims should be (mel bands, time frames).
    self.assertTrue(features.size()[-2:] == (NMELS, 313))
## Setting seed import random param.seed = param.seed or random.randint(1, 10000) print("Random Seed: " + str(param.seed)) print("Random Seed: " + str(param.seed), file=log_output) random.seed(param.seed) torch.manual_seed(param.seed) if param.cuda: torch.cuda.manual_seed_all(param.seed) ## Transforming audio files trans = transf.Compose([ transf.Scale(), # This makes it into [-1,1] # transf.ToTensor(), transf.PadTrim(max_len=param.audio_size), # I don't know if this is needed # This makes it into [-1,1] so tanh will work properly # transf.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) ]) def load_sound(path): tensor_to_load_into = None import torchaudio out, sample_rate = torchaudio.load(path, tensor_to_load_into) return out ## Importing dataset data = dset.DatasetFolder(root=param.input_folder, transform=trans,
# Hyperparameters and run configuration pulled from the CLI args.
batch_size = args.batch_size
depth = args.depth
radixs = [2] * depth          # one radix-2 stage per layer of depth
N = np.prod(radixs)           # total receptive-field factor of the FFTNet stack
channels = args.channels
lr = args.lr
steps = args.steps
c = args.c
generation_time = args.file_size
filename = args.outfile
maxlen = 50000                # every clip padded/trimmed to this many samples

print('==> Downloading YesNo Dataset..')
# Pipeline: scale to [-1, 1], fix the length, then mu-law quantize.
transform = transforms.Compose(
    [transforms.Scale(),
     transforms.PadTrim(maxlen),
     transforms.MuLawEncoding(quantization_channels=channels)])
data = torchaudio.datasets.YESNO('./data', download=True, transform=transform)
data_loader = DataLoader(data, batch_size=batch_size, num_workers=4, shuffle=True)

print('==> Building model..')
net = general_FFTNet(radixs, 128, channels).cuda()
print(sum(p.numel() for p in net.parameters() if p.requires_grad), "of parameters.")

optimizer = optim.Adam(net.parameters(), lr=lr)
criterion = torch.nn.CrossEntropyLoss()

print("Start Training.")
# Wall-clock start time, truncated to whole seconds.
a = datetime.now().replace(microsecond=0)
def main():
    """Train an AlexNet-style audio classifier on the configured dataset.

    Sets up logging, builds the audio preprocessing pipeline and data
    loaders, constructs model/criterion/optimizer, optionally resumes from a
    checkpoint, then runs the epoch loop (train + validate + checkpoint).
    """
    # Init logger
    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)
    log = open(os.path.join(args.save_path, 'log_seed_{}.txt'.format(args.manualSeed)), 'w')
    print_log('save path : {}'.format(args.save_path), log)
    state = {k: v for k, v in args._get_kwargs()}
    print_log(state, log)
    print_log("Random Seed: {}".format(args.manualSeed), log)
    print_log("python version : {}".format(sys.version.replace('\n', ' ')), log)
    print_log("torch version : {}".format(torch.__version__), log)
    print_log("cudnn version : {}".format(torch.backends.cudnn.version()), log)

    # Data loading code
    # Any other preprocessings? http://pytorch.org/audio/transforms.html
    sample_length = 10000
    scale = transforms.Scale()
    padtrim = transforms.PadTrim(sample_length)
    downmix = transforms.DownmixMono()
    transforms_audio = transforms.Compose([
        scale, padtrim, downmix
    ])

    if not os.path.isdir(args.data_path):
        os.makedirs(args.data_path)
    train_dir = os.path.join(args.data_path, 'train')
    val_dir = os.path.join(args.data_path, 'val')

    # Choose dataset to use; each branch also fixes the class count.
    if args.dataset == 'arctic':
        # TODO No ImageFolder equivalent for audio. Need to create a Dataset manually
        train_dataset = Arctic(train_dir, transform=transforms_audio, download=True)
        val_dataset = Arctic(val_dir, transform=transforms_audio, download=True)
        num_classes = 4
    elif args.dataset == 'vctk':
        train_dataset = dset.VCTK(train_dir, transform=transforms_audio, download=True)
        val_dataset = dset.VCTK(val_dir, transform=transforms_audio, download=True)
        num_classes = 10
    elif args.dataset == 'yesno':
        train_dataset = dset.YESNO(train_dir, transform=transforms_audio, download=True)
        val_dataset = dset.YESNO(val_dir, transform=transforms_audio, download=True)
        num_classes = 2
    else:
        assert False, 'Dataset is incorrect'

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.workers,
        # pin_memory=True, # What is this?
        # sampler=None # What is this?
    )
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)

    # Feed in respective model file to pass into model (alexnet.py)
    print_log("=> creating model '{}'".format(args.arch), log)
    # Init model, criterion, and optimizer
    # net = models.__dict__[args.arch](num_classes)
    net = AlexNet(num_classes)
    # print_log("=> network :\n {}".format(net), log)
    # net = torch.nn.DataParallel(net, device_ids=list(range(args.ngpu)))

    # define loss function (criterion) and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    # Define stochastic gradient descent as optimizer (run backprop on random small batch)
    optimizer = torch.optim.SGD(net.parameters(), state['learning_rate'],
                                momentum=state['momentum'],
                                weight_decay=state['decay'], nesterov=True)

    # Sets use for GPU if available
    if args.use_cuda:
        net.cuda()
        criterion.cuda()

    recorder = RecorderMeter(args.epochs)
    # optionally resume from a checkpoint
    # Need same python vresion that the resume was in
    if args.resume:
        if os.path.isfile(args.resume):
            print_log("=> loading checkpoint '{}'".format(args.resume), log)
            if args.ngpu == 0:
                # Remap CUDA tensors onto the CPU when no GPUs are configured.
                checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage)
            else:
                checkpoint = torch.load(args.resume)
            recorder = checkpoint['recorder']
            args.start_epoch = checkpoint['epoch']
            net.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print_log("=> loaded checkpoint '{}' (epoch {})" .format(args.resume, checkpoint['epoch']), log)
        else:
            print_log("=> no checkpoint found at '{}'".format(args.resume), log)
    else:
        print_log("=> do not use any checkpoint for {} model".format(args.arch), log)

    if args.evaluate:
        validate(val_loader, net, criterion, 0, log, val_dataset)
        return

    # Main loop
    start_time = time.time()
    epoch_time = AverageMeter()
    # Training occurs here
    for epoch in range(args.start_epoch, args.epochs):
        current_learning_rate = adjust_learning_rate(optimizer, epoch, args.gammas,
                                                     args.schedule)
        # Estimate remaining wall-clock time from the running epoch average.
        need_hour, need_mins, need_secs = convert_secs2time(epoch_time.avg * (args.epochs-epoch))
        need_time = '[Need: {:02d}:{:02d}:{:02d}]'.format(need_hour, need_mins, need_secs)
        print_log('\n==>>{:s} [Epoch={:03d}/{:03d}] {:s} [learning_rate={:6.4f}]'.format(time_string(), epoch, args.epochs, need_time, current_learning_rate) \
                  + ' [Best : Accuracy={:.2f}, Error={:.2f}]'.format(recorder.max_accuracy(False), 100-recorder.max_accuracy(False)), log)
        print("One epoch")

        # train for one epoch
        # Call to train (note that our previous net is passed into the model argument)
        train_acc, train_los = train(train_loader, net, criterion, optimizer, epoch, log, train_dataset)

        # evaluate on validation set
        #val_acc, val_los = extract_features(test_loader, net, criterion, log)
        val_acc, val_los = validate(val_loader, net, criterion, epoch, log, val_dataset)
        is_best = recorder.update(epoch, train_los, train_acc, val_los, val_acc)

        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': net.state_dict(),
            'recorder': recorder,
            'optimizer' : optimizer.state_dict(),
        }, is_best, args.save_path, 'checkpoint.pth.tar')

        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()
        recorder.plot_curve( os.path.join(args.save_path, 'curve.png') )

    log.close()
import torch
import torchaudio.datasets as dset
from torchaudio import transforms

# Pipeline: scale samples to [-1, 1] and pad/trim every clip to 100000 samples.
transform = transforms.Compose(
    [transforms.Scale(), transforms.PadTrim(100000)])

train_dataset = dset.YESNO("data", transform=transform, download=True)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=10,
)

# Drop into the debugger on the first batch for interactive inspection.
for i, (input, target) in enumerate(train_loader):
    import ipdb
    ipdb.set_trace(context=21)

print("HI")

""" Vision MNIST test"""
"""
import torchvision.datasets as vdset
from torchvision import transforms as vtransforms

transform = vtransforms.Compose([
    vtransforms.ToTensor()
])

mnist = vdset.MNIST("data", transform=transform, download=True)
mnist_loader = torch.utils.data.DataLoader(
    mnist,
    batch_size=10,
# Mel filterbank boundaries using the HTK formula mel = 2595*log10(1 + f/700):
# n_filterbanks + 2 points evenly spaced on the mel scale from 0 to Nyquist,
# converted back to Hz and then to FFT bin indices.
low_mel_freq = 0
high_freq_mel = (2595 * np.log10(1 + (sr / 2) / 700))
mel_pts = np.linspace(low_mel_freq, high_freq_mel, n_filterbanks + 2)
hz_pts = np.floor(700 * (10**(mel_pts / 2595) - 1))
bins = np.floor((n_fft + 1) * hz_pts / sr)

# data transformations
td = {
    "RfftPow": RfftPow(n_fft),
    "FilterBanks": FilterBanks(n_filterbanks, bins),
    "MFCC": MFCC(n_filterbanks, n_coefficients),
}

# Full pipeline: normalize, fix length (tiny fill avoids log(0) later),
# pre-emphasis filter, then windowed feature extraction.
transforms = tat.Compose([
    tat.Scale(),
    tat.PadTrim(58000, fill_value=1e-8),
    Preemphasis(),
    Sig2Features(ws, hs, td),
])

# set network parameters
use_cuda = torch.cuda.is_available()
batch_size = args.batch_size
input_features = 26
hidden_size = 100
output_size = 3
#output_length = (8 + 7 + 2)  # with "blanks"
output_length = 8  # without blanks
n_layers = 1
attn_modus = "dot"
def evaluate():
    """Classify a single WAV file with a (optionally checkpoint-resumed) AlexNet.

    Reads ``args.file_name``, preprocesses it with Scale + PadTrim, loads the
    network (and checkpoint if ``args.resume`` is set), and prints the raw
    model output.
    """
    num_classes = 4

    # Init logger
    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)
    log = open(os.path.join(args.save_path, 'log_seed_{}.txt'.format(args.manualSeed)), 'w')
    print_log('save path : {}'.format(args.save_path), log)
    state = {k: v for k, v in args._get_kwargs()}
    print_log(state, log)
    print_log("Random Seed: {}".format(args.manualSeed), log)
    print_log("python version : {}".format(sys.version.replace('\n', ' ')), log)
    print_log("torch version : {}".format(torch.__version__), log)
    print_log("cudnn version : {}".format(torch.backends.cudnn.version()), log)

    # Any other preprocessings? http://pytorch.org/audio/transforms.html
    sample_length = 10000
    scale = transforms.Scale()
    padtrim = transforms.PadTrim(sample_length)
    transforms_audio = transforms.Compose([
        scale, padtrim
    ])

    # Data loading: read the wav, reshape to (samples, 1), preprocess, then
    # add batch/channel dims so the tensor matches the network's input layout.
    fs, data = wavfile.read(args.file_name)
    data = torch.from_numpy(data).float()
    data = data.unsqueeze(1)
    audio = transforms_audio(data)
    audio = Variable(audio)
    audio = audio.view(1, -1)
    audio = audio.unsqueeze(0)

    # Feed in respective model file to pass into model (alexnet.py)
    print_log("=> creating model '{}'".format(args.arch), log)
    # Init model, criterion, and optimizer
    # net = models.__dict__[args.arch](num_classes)
    net = AlexNet(num_classes)
    print_log("=> network :\n {}".format(net), log)

    # Sets use for GPU if available
    if args.use_cuda:
        net.cuda()

    # optionally resume from a checkpoint
    # Need same python version that the resume was in
    if args.resume:
        if os.path.isfile(args.resume):
            print_log("=> loading checkpoint '{}'".format(args.resume), log)
            if args.ngpu == 0:
                # Remap CUDA tensors onto the CPU when no GPUs are configured.
                checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage)
            else:
                checkpoint = torch.load(args.resume)
            recorder = checkpoint['recorder']
            args.start_epoch = checkpoint['epoch']
            net.load_state_dict(checkpoint['state_dict'])
            print_log("=> loaded checkpoint '{}' (epoch {})" .format(args.resume, checkpoint['epoch']), log)
        else:
            print_log("=> no checkpoint found at '{}'".format(args.resume), log)
    else:
        print_log("=> do not use any checkpoint for {} model".format(args.arch), log)

    net.eval()
    if args.use_cuda:
        audio = audio.cuda()
    output = net(audio)
    print(output)
    # TODO postprocess output to a string representing the person speaking
    # ouptut = val_dataset.postprocess_target(output)
    return