def get_dataloader(self):
    vx = VOXFORGE(args.data_path, langs=args.languages,
                  label_type="lang", use_cache=args.use_cache,
                  use_precompute=args.use_precompute)
    if self.model_name in ("resnet34_conv", "resnet101_conv"):
        T = tat.Compose([
            #tat.PadTrim(self.max_len),
            tat.MEL(n_mels=224),
            tat.BLC2CBL(),
            tvt.ToPILImage(),
            tvt.Resize((224, 224)),
            tvt.ToTensor(),
        ])
        TT = spl_transforms.LENC(vx.LABELS)
    elif self.model_name == "resnet34_mfcc":
        sr = 16000
        ws = 800
        hs = ws // 2
        n_fft = 512  # 256
        n_filterbanks = 26
        n_coefficients = 12
        low_mel_freq = 0
        high_freq_mel = 2595 * math.log10(1 + (sr / 2) / 700)
        mel_pts = torch.linspace(low_mel_freq, high_freq_mel, n_filterbanks + 2)
        hz_pts = torch.floor(700 * (torch.pow(10, mel_pts / 2595) - 1))
        bins = torch.floor((n_fft + 1) * hz_pts / sr)
        td = {
            "RfftPow": spl_transforms.RfftPow(n_fft),
            "FilterBanks": spl_transforms.FilterBanks(n_filterbanks, bins),
            "MFCC": spl_transforms.MFCC(n_filterbanks, n_coefficients),
        }
        T = tat.Compose([
            tat.Scale(),
            #tat.PadTrim(self.max_len, fill_value=1e-8),
            spl_transforms.Preemphasis(),
            spl_transforms.Sig2Features(ws, hs, td),
            spl_transforms.DummyDim(),
            tat.BLC2CBL(),
            tvt.ToPILImage(),
            tvt.Resize((224, 224)),
            tvt.ToTensor(),
        ])
        TT = spl_transforms.LENC(vx.LABELS)
    vx.transform = T
    vx.target_transform = TT
    if args.use_precompute:
        vx.load_precompute(args.model_name)
    dl = data.DataLoader(vx, batch_size=args.batch_size,
                         num_workers=args.num_workers, shuffle=True)
    return vx, dl
def test4(self):
    vx = VOXFORGE(self.bdir, download=False, label_type="lang",
                  num_zips=10, randomize=False, dev_mode=False)
    vx.find_max_len()
    T = transforms.Compose([
        transforms.PadTrim(vx.maxlen),
    ])
    TT = spl_transforms.LENC(vx.LABELS)
    vx.transform = T
    vx.target_transform = TT
    print(vx.splits)
    dl = data.DataLoader(vx, batch_size=5)
    total_train = 0
    for i, (mb, l) in enumerate(dl):
        vx.set_split("train")
        total_train += l.size(0)
        if i == 2:
            vx.set_split("valid")
            total_valid = 0
            for mb_valid, l_valid in dl:
                total_valid += l_valid.size(0)
            print(total_valid)
    print(total_train)
def test1(self): # Data vx = VOXFORGE(self.bdir, label_type="lang") vx.find_max_len() print(vx.maxlen) T = tat.Compose([ tat.PadTrim(vx.maxlen), spl_transforms.MEL(n_mels=224), spl_transforms.BLC2CBL(), tvt.ToPILImage(), tvt.Scale((224, 224)), tvt.ToTensor(), ]) TT = spl_transforms.LENC(vx.LABELS) vx.transform = T vx.target_transform = TT dl = data.DataLoader(vx, batch_size = 25, shuffle=True) # Model and Loss model = models.squeezenet.squeezenet(True) model.train() for i, (mb, tgts) in enumerate(dl): vx.set_split("train") out = model(Variable(mb)) print(mb.size(), mb.min(), mb.max()) print(out.data.size()) print(out.data) break
def test1(self): # Data vx = VOXFORGE(self.bdir, label_type="lang", use_cache=True) #vx.find_max_len() vx.maxlen = 150000 T = tat.Compose([ tat.PadTrim(vx.maxlen), tat.MEL(n_mels=224), tat.BLC2CBL(), tvt.ToPILImage(), tvt.Scale((224, 224)), tvt.ToTensor(), ]) TT = spl_transforms.LENC(vx.LABELS) vx.transform = T vx.target_transform = TT dl = data.DataLoader(vx, batch_size = 25, shuffle=True) # Model and Loss model = models.resnet.resnet34(True) print(model) criterion = nn.CrossEntropyLoss() plist = nn.ParameterList() #plist.extend(list(model[0].parameters())) plist.extend(list(model[1].fc.parameters())) #plist.extend(list(model.parameters())) #optimizer = torch.optim.SGD(plist, lr=0.0001, momentum=0.9) optimizer = torch.optim.Adam(plist, lr=0.0001) train_losses = [] valid_losses = [] for i, (mb, tgts) in enumerate(dl): model.train() vx.set_split("train") mb, tgts = Variable(mb), Variable(tgts) model.zero_grad() out = model(mb) loss = criterion(out, tgts) loss.backward() optimizer.step() train_losses.append(loss.data[0]) print(loss.data[0]) if i % 5 == 0: start = time.time() model.eval() vx.set_split("valid") running_validation_loss = 0 correct = 0 for mb_valid, tgts_valid in dl: mb_valid, tgts_valid = Variable(mb_valid), Variable(tgts_valid) out_valid = model(mb_valid) loss_valid = criterion(out_valid, tgts_valid) running_validation_loss += loss_valid.data[0] correct += (out_valid.data.max(1)[1] == tgts_valid.data).sum() print_running_time(start) valid_losses.append((running_validation_loss, correct / len(vx))) print("loss: {}, acc: {}".format(running_validation_loss, correct / len(vx))) if i == 11: break vx.set_split("train")
def test_compose(self):
    audio_orig = self.sig.clone()
    length_orig = audio_orig.size(0)
    length_new = int(length_orig * 1.2)
    maxminmax = max(abs(audio_orig.min()), abs(audio_orig.max())).item()
    tset = (transforms.Scale(factor=maxminmax),
            transforms.PadTrim(max_len=length_new, channels_first=False))
    result = transforms.Compose(tset)(audio_orig)
    self.assertTrue(max(abs(result.min()), abs(result.max())) == 1.)
    self.assertTrue(result.size(0) == length_new)
    repr_test = transforms.Compose(tset)
    self.assertTrue(repr_test.__repr__())
def test_compose(self):
    audio_orig = self.sig.clone()
    length_orig = audio_orig.size(0)
    length_new = int(length_orig * 1.2)
    maxminmax = np.abs([audio_orig.min(), audio_orig.max()]).max().astype(np.float)
    tset = (transforms.Scale(factor=maxminmax),
            transforms.PadTrim(max_len=length_new))
    result = transforms.Compose(tset)(audio_orig)
    self.assertTrue(np.abs([result.min(), result.max()]).max() == 1.)
    self.assertTrue(result.size(0) == length_new)
    repr_test = transforms.Compose(tset)
    repr_test.__repr__()
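# Both variants above exercise transforms.Compose, which simply applies each
# transform to the output of the previous one. A minimal self-contained sketch
# of that idea (the real torchaudio class also implements the __repr__ the
# tests check; ComposeSketch is a name of our own, not part of the library):
class ComposeSketch(object):
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, audio):
        for t in self.transforms:
            audio = t(audio)  # each transform feeds the next
        return audio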
def load_dataset(dataset='VCTK', train_subset=1.0, person_filter=None):
    transfs = transforms.Compose([
        transforms.Scale(),
        prepro.DB_Spec(n_fft=400, hop_t=0.010, win_t=0.025)
    ])
    if dataset == 'VCTK':
        person_filter = ['p249', 'p239', 'p276', 'p283',
                         'p243', 'p254', 'p258', 'p271']
        train_dataset = vctk_custom_dataset.VCTK('../datasets/VCTK-Corpus/',
                                                 preprocessed=True,
                                                 person_filter=person_filter,
                                                 filter_mode='exclude')
        test_dataset = vctk_custom_dataset.VCTK('../datasets/VCTK-Corpus/',
                                                preprocessed=True,
                                                person_filter=person_filter,
                                                filter_mode='include')
    elif dataset == 'LibriSpeech':
        train_dataset = librispeech_custom_dataset.LibriSpeech(
            '../datasets/LibriSpeech/', preprocessed=True, split='train',
            person_filter=person_filter, filter_mode='include')
        test_dataset = librispeech_custom_dataset.LibriSpeech(
            '../datasets/LibriSpeech/', preprocessed=True, split='test',
            person_filter=person_filter, filter_mode='include')
    indices = list(range(len(train_dataset)))
    split = int(np.floor(len(train_dataset) * train_subset))
    train_sampler = sampler.RandomSampler(
        sampler.SubsetRandomSampler(indices[:split]))
    test_sampler = sampler.RandomSampler(test_dataset)
    kwargs = {'num_workers': 8, 'pin_memory': True} if args.use_cuda else {}
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               sampler=train_sampler,
                                               drop_last=False, **kwargs)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.batch_size,
                                              sampler=test_sampler,
                                              drop_last=False, **kwargs)
    return train_loader, test_loader, train_dataset, test_dataset
def get_loader(config, data_dir):
    root = os.path.join(os.path.abspath(os.curdir), data_dir)
    print('-- Loading audios')
    dataset = AudioFolder(root=root,
                          transform=transforms.Compose([
                              transforms.PadTrim(133623, 0),
                              transforms.LC2CL()
                          ]))
    loader = DataLoader(dataset=dataset,
                        batch_size=config.batch_size,
                        shuffle=True,
                        num_workers=config.num_workers)
    return loader
def input_fn(request_body, content_type='application/json'):
    logger.info('Deserializing the input data.')
    if content_type == 'application/json':
        input_data = json.loads(request_body)
        url = input_data['url']
        logger.info(f'Image url: {url}')
        image_data = Image.open(requests.get(url, stream=True).raw)
        image_transform = transforms.Compose([
            transforms.Resize(size=256),
            transforms.CenterCrop(size=224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406],
                                 [0.229, 0.224, 0.225])
        ])
        return image_transform(image_data)
    raise Exception(f'Requested unsupported ContentType in content_type {content_type}')
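# input_fn above returns a single (C, H, W) tensor. A minimal sketch of a
# matching predict step, assuming the SageMaker PyTorch serving convention
# where predict_fn receives the deserialized input and the loaded model (the
# model itself is not part of the original handler): torchvision models expect
# a batch dimension, hence the unsqueeze(0).
import torch

def predict_fn(input_data, model):
    model.eval()
    with torch.no_grad():
        return model(input_data.unsqueeze(0))  # (C, H, W) -> (1, C, H, W)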
def test4(self):
    ds = AUDIOSET(self.bdir)
    T = transforms.Compose([
        transforms.PadTrim(ds.maxlen),
    ])
    # target transform completed to match the sibling tests; the dataset
    # variable was mistakenly referenced as `vx` below
    TT = mgc_transforms.BinENC(ds.labels_dict)
    ds.transform = T
    ds.target_transform = TT
    dl = data.DataLoader(ds, collate_fn=bce_collate, batch_size=5)
    total_train = 0
    for i, (mb, l) in enumerate(dl):
        total_train += l.size(0)
        if i == 2:
            #ds.set_split("valid")
            total_valid = 0
            for mb_valid, l_valid in dl:
                total_valid += l_valid.size(0)
            print(total_valid)
    print(total_train)
def test3(self): """ Test that the data loader does transforms """ ds = AUDIOSET(self.bdir, randomize=True) T = transforms.Compose([ transforms.PadTrim(ds.maxlen), mgc_transforms.MEL(), mgc_transforms.BLC2CBL() ]) TT = mgc_transforms.BinENC(ds.labels_dict) ds.transform = T ds.target_transform = TT dl = data.DataLoader(ds, collate_fn=bce_collate, batch_size=5) labels_total = 0 print(ds.labels_dict) for i, (a, b) in enumerate(dl): print(a.size(), b.size()) if i > 10: break
def test3(self):
    vx = VOXFORGE(self.bdir, download=False, label_type="lang",
                  num_zips=10, randomize=False, split="valid", dev_mode=False)
    vx.find_max_len()
    T = transforms.Compose([
        transforms.PadTrim(vx.maxlen),
        spl_transforms.MEL(),
        spl_transforms.BLC2CBL()
    ])
    TT = spl_transforms.LENC(vx.LABELS)
    vx.transform = T
    vx.target_transform = TT
    dl = data.DataLoader(vx, batch_size=5)
    labels_total = 0
    for i, (a, b) in enumerate(dl):
        labels_total += b.sum()
    print((len(vx) - labels_total) / len(vx))
def test5(self):
    import numpy as np
    vx = VOXFORGE(self.bdir, download=False, label_type="prompts", dev_mode=False)
    vx.find_max_len()
    T = transforms.Compose([
        transforms.PadTrim(vx.maxlen),
    ])
    TT = spl_transforms.WC()
    vx.transform = T
    vx.target_transform = TT
    print(vx.splits)
    dl = data.DataLoader(vx, batch_size=5, collate_fn=basic_collate)
    max_wc = 0
    wc_all = []
    for i, (mb, tgts) in enumerate(dl):
        tgts = sorted(tgts)  # sorted() returns a new list; the bare call had no effect
        max_wc = tgts[-1] if tgts[-1] > max_wc else max_wc
        wc_all.extend(tgts)
    print(np.histogram(wc_all, bins=20), len(wc_all))
def test1(self): """ Test that the data loader does transforms """ NMELS = 224 ds = AUDIOSET(self.bdir, randomize=True) T = transforms.Compose([ transforms.PadTrim(ds.maxlen), mgc_transforms.MEL(n_mels=NMELS), mgc_transforms.BLC2CBL() ]) TT = mgc_transforms.BinENC(ds.labels_dict) ds.transform = T ds.target_transform = TT dl = data.DataLoader(ds, collate_fn=bce_collate, batch_size=5) labels_total = 0 for i, (a, b) in enumerate(dl): print(a.size(), b.size()) break self.assertTrue(a.size()[-2:] == (NMELS, 313))
import os

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np

from loader_audioset import AUDIOSET
import mgc_transforms
import torchaudio.transforms as tat

AUDIOSET_PATH = os.path.join(
    os.path.dirname(os.path.dirname(os.path.realpath(__file__))),
    "data", "audioset")
DATASET = "balanced"
IMG_SIZE = (10, 5)
CMAP_COLOR = "jet"

T = tat.Compose([
    #tat.PadTrim(self.max_len),
    mgc_transforms.MEL(sr=16000, n_fft=800, hop_length=320, n_mels=224),
    mgc_transforms.BLC2CBL(),
    #mgc_transforms.Scale(),
])
ds = AUDIOSET(AUDIOSET_PATH, transform=T, dataset=DATASET, num_samples=1)
rev_labeler = {x["label_id"]: x["name"] for _, x in ds.labels_dict.items()}
for sample, label in ds:
    sample.squeeze_()
    sample = sample.numpy()
    sample = np.log(sample)
    sample -= sample.min()
    plt.figure(figsize=IMG_SIZE)
    plt.title("MEL Spectrogram of {} Audio".format(rev_labeler[label[0]].capitalize()))
    plt.imshow(sample, interpolation='nearest', cmap=CMAP_COLOR)
import torch
import torchaudio.datasets as dset
from torchaudio import transforms

transform = transforms.Compose([transforms.Scale(),
                                transforms.PadTrim(100000)])
train_dataset = dset.YESNO("data", transform=transform, download=True)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=10,
)

for i, (input, target) in enumerate(train_loader):
    import ipdb
    ipdb.set_trace(context=21)
    print("HI")

""" Vision MNIST test
import torchvision.datasets as vdset
from torchvision import transforms as vtransforms

transform = vtransforms.Compose([
    vtransforms.ToTensor()
])
mnist = vdset.MNIST("data", transform=transform, download=True)
mnist_loader = torch.utils.data.DataLoader(
    mnist,
    batch_size=10,
"""
def evaluate():
    num_classes = 4

    # Init logger
    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)
    log = open(os.path.join(args.save_path,
                            'log_seed_{}.txt'.format(args.manualSeed)), 'w')
    print_log('save path : {}'.format(args.save_path), log)
    state = {k: v for k, v in args._get_kwargs()}
    print_log(state, log)
    print_log("Random Seed: {}".format(args.manualSeed), log)
    print_log("python version : {}".format(sys.version.replace('\n', ' ')), log)
    print_log("torch version : {}".format(torch.__version__), log)
    print_log("cudnn version : {}".format(torch.backends.cudnn.version()), log)

    # Any other preprocessings? http://pytorch.org/audio/transforms.html
    sample_length = 10000
    scale = transforms.Scale()
    padtrim = transforms.PadTrim(sample_length)
    transforms_audio = transforms.Compose([scale, padtrim])

    # Data loading
    fs, data = wavfile.read(args.file_name)
    data = torch.from_numpy(data).float()
    data = data.unsqueeze(1)
    audio = transforms_audio(data)
    audio = Variable(audio)
    audio = audio.view(1, -1)
    audio = audio.unsqueeze(0)

    # Feed in respective model file to pass into model (alexnet.py)
    print_log("=> creating model '{}'".format(args.arch), log)
    # Init model, criterion, and optimizer
    # net = models.__dict__[args.arch](num_classes)
    net = AlexNet(num_classes)
    print_log("=> network :\n {}".format(net), log)

    # Use the GPU if available
    if args.use_cuda:
        net.cuda()

    # Optionally resume from a checkpoint.
    # Needs the same Python version that the checkpoint was saved with.
    if args.resume:
        if os.path.isfile(args.resume):
            print_log("=> loading checkpoint '{}'".format(args.resume), log)
            if args.ngpu == 0:
                checkpoint = torch.load(args.resume,
                                        map_location=lambda storage, loc: storage)
            else:
                checkpoint = torch.load(args.resume)
            recorder = checkpoint['recorder']
            args.start_epoch = checkpoint['epoch']
            net.load_state_dict(checkpoint['state_dict'])
            print_log("=> loaded checkpoint '{}' (epoch {})"
                      .format(args.resume, checkpoint['epoch']), log)
        else:
            print_log("=> no checkpoint found at '{}'".format(args.resume), log)
    else:
        print_log("=> do not use any checkpoint for {} model".format(args.arch), log)

    net.eval()
    if args.use_cuda:
        audio = audio.cuda()
    output = net(audio)
    print(output)
    # TODO: postprocess output to a string representing the person speaking
    # output = val_dataset.postprocess_target(output)
    return
low_mel_freq = 0
high_freq_mel = 2595 * np.log10(1 + (sr / 2) / 700)
mel_pts = np.linspace(low_mel_freq, high_freq_mel, n_filterbanks + 2)
hz_pts = np.floor(700 * (10**(mel_pts / 2595) - 1))
bins = np.floor((n_fft + 1) * hz_pts / sr)

# data transformations
td = {
    "RfftPow": RfftPow(n_fft),
    "FilterBanks": FilterBanks(n_filterbanks, bins),
    "MFCC": MFCC(n_filterbanks, n_coefficients),
}
transforms = tat.Compose([
    tat.Scale(),
    tat.PadTrim(58000, fill_value=1e-8),
    Preemphasis(),
    Sig2Features(ws, hs, td),
])

# set network parameters
use_cuda = torch.cuda.is_available()
batch_size = args.batch_size
input_features = 26
hidden_size = 100
output_size = 3
#output_length = (8 + 7 + 2)  # with "blanks"
output_length = 8  # without blanks
n_layers = 1
attn_modus = "dot"

# build networks, criterion, optimizers, dataset and dataloader
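# The filterbank code above uses the mel-scale conversion pair
# mel = 2595 * log10(1 + f / 700) and f = 700 * (10**(mel / 2595) - 1).
# A quick self-contained round-trip check of those formulas (a sketch,
# not part of the original training script; the test frequencies are arbitrary):
import numpy as np

f = np.array([0.0, 300.0, 1000.0, 8000.0])  # Hz
mel = 2595 * np.log10(1 + f / 700)           # Hz -> mel
f_back = 700 * (10 ** (mel / 2595) - 1)      # mel -> Hz
assert np.allclose(f, f_back)                # the inverse recovers the input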
seq_M = args.seq_M
batch_size = args.batch_size
depth = args.depth
radixs = [2] * depth
N = np.prod(radixs)
channels = args.channels
lr = args.lr
steps = args.steps
c = args.c
generation_time = args.file_size
filename = args.outfile
maxlen = 50000

print('==> Downloading YesNo Dataset..')
transform = transforms.Compose([
    transforms.Scale(),
    transforms.PadTrim(maxlen),
    transforms.MuLawEncoding(quantization_channels=channels)
])
data = torchaudio.datasets.YESNO('./data', download=True, transform=transform)
data_loader = DataLoader(data, batch_size=batch_size, num_workers=4, shuffle=True)

print('==> Building model..')
net = general_FFTNet(radixs, 128, channels).cuda()
print(sum(p.numel() for p in net.parameters() if p.requires_grad), "of parameters.")

optimizer = optim.Adam(net.parameters(), lr=lr)
criterion = torch.nn.CrossEntropyLoss()

print("Start Training.")
a = datetime.now().replace(microsecond=0)
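# MuLawEncoding above quantizes the scaled waveform into `channels` integer
# classes, which is why the script can train on raw audio with
# CrossEntropyLoss. A small round-trip sketch, assuming the era-appropriate
# torchaudio API where MuLawExpanding is the matching inverse transform:
import torch
from torchaudio import transforms

enc = transforms.MuLawEncoding(quantization_channels=256)
dec = transforms.MuLawExpanding(quantization_channels=256)
x = torch.linspace(-1, 1, 5)  # waveform samples in [-1, 1]
q = enc(x)                    # integer class ids in [0, 255]
x_hat = dec(q)                # approximate reconstruction in [-1, 1]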
import argparse

import torch
import torchaudio
from torchaudio import transforms, save
import numpy as np
import ujson
import librosa

from vctk_custom_dataset import VCTK
from audio_utils import griffinlim
import audio_utils as prepro

parser = argparse.ArgumentParser(description='Prepare VCTK Dataset')
parser.add_argument('--vctk-path', type=str, metavar='S', required=True,
                    help='(path to VCTK-Corpus)')
args = parser.parse_args()

transfs = transforms.Compose(
    [prepro.DB_Spec(sr=11025, n_fft=400, hop_t=0.010, win_t=0.025)])
dataset = VCTK(root=args.vctk_path, transform=transfs)
def create_data_pipelines(H):
    vocab = Vocabulary(os.path.join(H.ROOT_DIR, H.EXPERIMENT),
                       encoding=H.TARGET_ENCODING)

    augmentation_transform = transforms.Compose([
        AudioNoiseInjection(probability=H.NOISE_BG_PROBABILITY,
                            noise_levels=H.NOISE_BG_LEVELS,
                            noise_dir=H.NOISE_BG_DIR),
        AudioNoiseGeneration(probability=H.AUDIO_NOISE_PROBABILITY,
                             noise_levels=H.AUDIO_NOISE_LEVELS,
                             noise_colors=H.AUDIO_NOISE_COLORS),
        AudioPitchShift(probability=H.AUDIO_PITCH_PROBABILITY,
                        sample_rate=H.AUDIO_SAMPLE_RATE,
                        pitch_pm=H.AUDIO_PITCH_PM),
        AudioTimeStrech(probability=H.AUDIO_SPEED_PROBABILITY,
                        low_high=H.AUDIO_SPEED_LOW_HIGH),
        AudioDynamicRange(probability=H.AUDIO_DYNAMIC_PROBABILITY,
                          low_high=H.AUDIO_DYNAMIC_LOW_HIGH),
        AudioTimeShift(probability=H.AUDIO_SHIFT_PROBABILITY,
                       sample_rate=H.AUDIO_SAMPLE_RATE,
                       min_max=H.AUDIO_SHIFT_MIN_MAX),
    ])
    audio_transform_train = transforms.Compose([
        AudioAugmentation(augmentation_transform,
                          probability=H.AUGMENTATION_PROBABILITY),
        AudioNormalizeDB(db=H.NORMALIZE_DB, max_gain_db=H.NORMALIZE_MAX_GAIN),
        AudioSpectrogram(sample_rate=H.AUDIO_SAMPLE_RATE,
                         window_size=H.SPECT_WINDOW_SIZE,
                         window_stride=H.SPECT_WINDOW_STRIDE,
                         window=H.SPECT_WINDOW),
        AudioNormalize(),
        FromNumpyToTensor(tensor_type=torch.FloatTensor)
    ])
    audio_transform = transforms.Compose([
        AudioNormalizeDB(db=H.NORMALIZE_DB, max_gain_db=H.NORMALIZE_MAX_GAIN),
        AudioSpectrogram(sample_rate=H.AUDIO_SAMPLE_RATE,
                         window_size=H.SPECT_WINDOW_SIZE,
                         window_stride=H.SPECT_WINDOW_STRIDE,
                         window=H.SPECT_WINDOW),
        AudioNormalize(),
        FromNumpyToTensor(tensor_type=torch.FloatTensor)
    ])

    if 'ctc' in H.TARGET_ENCODING:
        label_transform = transforms.Compose([
            TranscriptEncodeCTC(vocab),
            FromNumpyToTensor(tensor_type=torch.LongTensor)
        ])
    elif 'sts' in H.TARGET_ENCODING:
        label_transform = transforms.Compose([
            TranscriptEncodeSTS(vocab),
            FromNumpyToTensor(tensor_type=torch.LongTensor)
        ])
    else:
        raise ValueError('TARGET_ENCODING value not valid.')

    train_dataset = AudioDataset(os.path.join(H.ROOT_DIR, H.EXPERIMENT),
                                 manifests_files=H.MANIFESTS,
                                 datasets=["train", "pseudo"],
                                 transform=audio_transform_train,
                                 label_transform=label_transform,
                                 max_data_size=None,
                                 sorted_by='recording_duration',
                                 min_max_duration=H.MIN_MAX_AUDIO_DURATION,
                                 min_max_length=H.MIN_MAX_TRANSCRIPT_LEN,
                                 min_confidence=H.MIN_TRANSCRIPT_CONFIDENCE)
    train_sampler = BucketingSampler(train_dataset, batch_size=H.BATCH_SIZE)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               num_workers=H.NUM_WORKERS,
                                               batch_sampler=train_sampler,
                                               collate_fn=collate_fn,
                                               pin_memory=True)
    logger.info(train_dataset)

    valid_dataset = AudioDataset(os.path.join(H.ROOT_DIR, H.EXPERIMENT),
                                 manifests_files=H.MANIFESTS,
                                 datasets="test",
                                 transform=audio_transform,
                                 label_transform=label_transform,
                                 max_data_size=None,
                                 sorted_by='recording_duration')
    valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                               batch_size=H.BATCH_SIZE,
                                               num_workers=H.NUM_WORKERS,
                                               shuffle=False,
                                               collate_fn=collate_fn,
                                               pin_memory=True)
    logger.info(valid_dataset)

    return train_loader, valid_loader, vocab
def test(args):
    if not os.path.exists('experiments'):
        os.makedirs('experiments')

    transfs = transforms.Compose([
        # transforms.Scale(),
        prepro.DB_Spec(sr=11025, n_fft=400, hop_t=0.010, win_t=0.025)
    ])
    # mel_basis = librosa.filters.mel(16000, 256, n_mels=80, norm=1)
    # sr = 16000

    if args.model_type == 'vae_g_l':
        model = vae_g_l.VAE(args)
        model.load_state_dict(
            torch.load('experiments/' + args.model_name,
                       map_location=lambda storage, loc: storage))
    elif args.model_type == 'vae_l':
        model = vae_l.VAE(args)
        model.load_state_dict(
            torch.load('experiments/' + args.model_name,
                       map_location=lambda storage, loc: storage))
    model.eval()

    if args.dataset == "VCTK":
        # male example
        # data, sr = prepro.read_audio('/work/invx030/datasets/VCTK-Corpus/wav48/p245/p245_002.wav')
        # female example
        data, sr = prepro.read_audio(
            '/work/invx030/datasets/VCTK-Corpus/wav48/p233/p233_003.wav')
    elif args.dataset == "LibriSpeech":
        # male example
        # data, sr = prepro.read_audio('/work/invx030/datasets/LibriSpeech/test-clean/1089/134686/1089-134686-0001.flac')
        # female example
        data, sr = prepro.read_audio(
            '/work/invx030/datasets/LibriSpeech/test-clean/4507/16021/4507-16021-0001.flac')
    else:
        raise Exception('No valid dataset provided (use --dataset)')

    hop_length = int(sr * 0.010)
    n_fft = 400
    win_length = int(sr * 0.025)

    data = transfs(data)
    data = data / torch.min(data)
    data = Variable(data)
    data = data.unsqueeze(0)
    data = data.transpose(1, 2)
    original = data
    if args.predictive:
        data = F.pad(data, (0, 0, 1, 0), "constant", 1.)
        original = F.pad(original, (0, 0, 0, 1), "constant", 1.)

    outs = model(data)
    reconstruction = outs.decoder_out
    reconstruction = reconstruction.transpose(1, 2)
    reconstruction = reconstruction.squeeze(0)
    reconstruction = reconstruction.data.cpu().numpy()
    reconstruction = reconstruction * -80.

    original = original.transpose(1, 2)
    original = original.squeeze(0).squeeze(0)
    original = original.data.cpu().numpy()
    original = original * -80.

    librosa.display.specshow(original, sr=sr, hop_length=hop_length,
                             x_axis='time', y_axis='linear', cmap='viridis')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Original DB spectrogram')
    pylab.savefig('experiments/original_spec.png')
    plt.clf()

    librosa.display.specshow(reconstruction, sr=sr, hop_length=hop_length,
                             x_axis='time', y_axis='linear', cmap='viridis')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Reconstruction DB spectrogram')
    pylab.savefig('experiments/reconstruction_spec.png')

    inverse = to_audio(original, sr=sr, n_fft=n_fft, hop_t=0.010, win_t=0.025)
    librosa.output.write_wav('experiments/original.wav', inverse, sr, norm=True)
    inverse = to_audio(reconstruction, sr, n_fft=n_fft, hop_t=0.010, win_t=0.025)
    librosa.output.write_wav('experiments/reconstruction.wav', inverse, sr, norm=True)
seq_M = args.seq_M
batch_size = args.batch_size
depth = args.depth
radixs = [2] * depth
N = np.prod(radixs)
channels = args.channels
lr = args.lr
steps = args.steps
c = args.c
generation_time = args.file_size
filename = args.outfile
features_size = args.feature_size

print('==> Downloading YesNo Dataset..')
transform = transforms.Compose([transforms.Scale()])
data = torchaudio.datasets.YESNO('./data', download=True, transform=transform)
data_loader = DataLoader(data, batch_size=1, num_workers=2)

print('==> Extracting features..')
train_wav = []
train_features = []
train_targets = []
for batch_idx, (inputs, _) in enumerate(data_loader):
    inputs = inputs.view(-1).numpy()
    targets = np.roll(inputs, shift=-1)
    #h = mfcc(inputs, sr, winlen=winlen, winstep=winstep, numcep=features_size - 1, winfunc=np.hamming)
    x = inputs.astype(float)
def main():
    # Init logger
    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)
    log = open(os.path.join(args.save_path,
                            'log_seed_{}.txt'.format(args.manualSeed)), 'w')
    print_log('save path : {}'.format(args.save_path), log)
    state = {k: v for k, v in args._get_kwargs()}
    print_log(state, log)
    print_log("Random Seed: {}".format(args.manualSeed), log)
    print_log("python version : {}".format(sys.version.replace('\n', ' ')), log)
    print_log("torch version : {}".format(torch.__version__), log)
    print_log("cudnn version : {}".format(torch.backends.cudnn.version()), log)

    # Data loading code
    # Any other preprocessings? http://pytorch.org/audio/transforms.html
    sample_length = 10000
    scale = transforms.Scale()
    padtrim = transforms.PadTrim(sample_length)
    downmix = transforms.DownmixMono()
    transforms_audio = transforms.Compose([scale, padtrim, downmix])

    if not os.path.isdir(args.data_path):
        os.makedirs(args.data_path)
    train_dir = os.path.join(args.data_path, 'train')
    val_dir = os.path.join(args.data_path, 'val')

    # Choose dataset to use
    if args.dataset == 'arctic':
        # TODO: No ImageFolder equivalent for audio. Need to create a Dataset manually.
        train_dataset = Arctic(train_dir, transform=transforms_audio, download=True)
        val_dataset = Arctic(val_dir, transform=transforms_audio, download=True)
        num_classes = 4
    elif args.dataset == 'vctk':
        train_dataset = dset.VCTK(train_dir, transform=transforms_audio, download=True)
        val_dataset = dset.VCTK(val_dir, transform=transforms_audio, download=True)
        num_classes = 10
    elif args.dataset == 'yesno':
        train_dataset = dset.YESNO(train_dir, transform=transforms_audio, download=True)
        val_dataset = dset.YESNO(val_dir, transform=transforms_audio, download=True)
        num_classes = 2
    else:
        assert False, 'Dataset is incorrect'

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.workers,
        # pin_memory=True,  # page-locked host memory for faster CPU-to-GPU copies
        # sampler=None      # custom sampling strategy; None keeps the default
    )
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)

    # Feed in respective model file to pass into model (alexnet.py)
    print_log("=> creating model '{}'".format(args.arch), log)
    # Init model, criterion, and optimizer
    # net = models.__dict__[args.arch](num_classes)
    net = AlexNet(num_classes)
    # print_log("=> network :\n {}".format(net), log)
    # net = torch.nn.DataParallel(net, device_ids=list(range(args.ngpu)))

    # Define loss function (criterion) and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    # Stochastic gradient descent as optimizer (backprop on random mini-batches)
    optimizer = torch.optim.SGD(net.parameters(), state['learning_rate'],
                                momentum=state['momentum'],
                                weight_decay=state['decay'], nesterov=True)

    # Use the GPU if available
    if args.use_cuda:
        net.cuda()
        criterion.cuda()

    recorder = RecorderMeter(args.epochs)

    # Optionally resume from a checkpoint.
    # Needs the same Python version that the checkpoint was saved with.
    if args.resume:
        if os.path.isfile(args.resume):
            print_log("=> loading checkpoint '{}'".format(args.resume), log)
            if args.ngpu == 0:
                checkpoint = torch.load(args.resume,
                                        map_location=lambda storage, loc: storage)
            else:
                checkpoint = torch.load(args.resume)
            recorder = checkpoint['recorder']
            args.start_epoch = checkpoint['epoch']
            net.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print_log("=> loaded checkpoint '{}' (epoch {})"
                      .format(args.resume, checkpoint['epoch']), log)
        else:
            print_log("=> no checkpoint found at '{}'".format(args.resume), log)
    else:
        print_log("=> do not use any checkpoint for {} model".format(args.arch), log)

    if args.evaluate:
        validate(val_loader, net, criterion, 0, log, val_dataset)
        return

    # Main loop
    start_time = time.time()
    epoch_time = AverageMeter()

    # Training occurs here
    for epoch in range(args.start_epoch, args.epochs):
        current_learning_rate = adjust_learning_rate(optimizer, epoch,
                                                     args.gammas, args.schedule)

        need_hour, need_mins, need_secs = convert_secs2time(
            epoch_time.avg * (args.epochs - epoch))
        need_time = '[Need: {:02d}:{:02d}:{:02d}]'.format(need_hour, need_mins, need_secs)

        print_log('\n==>>{:s} [Epoch={:03d}/{:03d}] {:s} [learning_rate={:6.4f}]'.format(
                      time_string(), epoch, args.epochs, need_time, current_learning_rate)
                  + ' [Best : Accuracy={:.2f}, Error={:.2f}]'.format(
                      recorder.max_accuracy(False), 100 - recorder.max_accuracy(False)), log)
        print("One epoch")

        # Train for one epoch (the current net is passed in as the model argument)
        train_acc, train_los = train(train_loader, net, criterion, optimizer,
                                     epoch, log, train_dataset)

        # Evaluate on the validation set
        # val_acc, val_los = extract_features(test_loader, net, criterion, log)
        val_acc, val_los = validate(val_loader, net, criterion, epoch, log, val_dataset)
        is_best = recorder.update(epoch, train_los, train_acc, val_los, val_acc)

        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': net.state_dict(),
            'recorder': recorder,
            'optimizer': optimizer.state_dict(),
        }, is_best, args.save_path, 'checkpoint.pth.tar')

        # Measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()
        recorder.plot_curve(os.path.join(args.save_path, 'curve.png'))

    log.close()
## Setting seed
import random
param.seed = param.seed or random.randint(1, 10000)
print("Random Seed: " + str(param.seed))
print("Random Seed: " + str(param.seed), file=log_output)
random.seed(param.seed)
torch.manual_seed(param.seed)
if param.cuda:
    torch.cuda.manual_seed_all(param.seed)

## Transforming audio files
trans = transf.Compose([
    transf.Scale(),  # This makes it into [-1,1]
    # transf.ToTensor(),
    transf.PadTrim(max_len=param.audio_size),  # I don't know if this is needed
    # This makes it into [-1,1] so tanh will work properly
    # transf.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

def load_sound(path):
    tensor_to_load_into = None
    import torchaudio
    out, sample_rate = torchaudio.load(path, tensor_to_load_into)
    return out

## Importing dataset
data = dset.DatasetFolder(root=param.input_folder, transform=trans,
def get_dataloader(self):
    usl = self.loss_criterion == "crossentropy"
    ds = AUDIOSET(self.data_path, dataset=self.args.dataset,
                  noises_dir=self.noises_dir, use_cache=False,
                  num_samples=self.args.num_samples,
                  add_no_label=self.args.add_no_label,
                  use_single_label=usl)
    if any(x in self.model_name for x in ["resnet34_conv", "resnet101_conv", "squeezenet"]):
        T = tat.Compose([
            #tat.PadTrim(self.max_len, fill_value=1e-8),
            mgc_transforms.SimpleTrim(self.max_len),
            mgc_transforms.MEL(sr=16000, n_fft=600, hop_length=300,
                               n_mels=self.args.freq_bands // 2),
            #mgc_transforms.Scale(),
            mgc_transforms.BLC2CBL(),
            mgc_transforms.Resize((self.args.freq_bands, self.args.freq_bands)),
        ])
    elif "_mfcc_librosa" in self.model_name:
        T = tat.Compose([
            #tat.PadTrim(self.max_len, fill_value=1e-8),
            mgc_transforms.SimpleTrim(self.max_len),
            mgc_transforms.MFCC2(sr=16000, n_fft=600, hop_length=300, n_mfcc=12),
            mgc_transforms.Scale(),
            mgc_transforms.BLC2CBL(),
            mgc_transforms.Resize((self.args.freq_bands, self.args.freq_bands)),
        ])
    elif "_mfcc" in self.model_name:
        sr = 16000
        ws = 800
        hs = ws // 2
        n_fft = 512  # 256
        n_filterbanks = 26
        n_coefficients = 12
        low_mel_freq = 0
        high_freq_mel = 2595 * math.log10(1 + (sr / 2) / 700)
        mel_pts = torch.linspace(low_mel_freq, high_freq_mel, n_filterbanks + 2)
        hz_pts = torch.floor(700 * (torch.pow(10, mel_pts / 2595) - 1))
        bins = torch.floor((n_fft + 1) * hz_pts / sr)
        td = {
            "RfftPow": mgc_transforms.RfftPow(n_fft),
            "FilterBanks": mgc_transforms.FilterBanks(n_filterbanks, bins),
            "MFCC": mgc_transforms.MFCC(n_filterbanks, n_coefficients),
        }
        T = tat.Compose([
            #tat.PadTrim(self.max_len, fill_value=1e-8),
            mgc_transforms.Preemphasis(),
            mgc_transforms.SimpleTrim(self.max_len),
            mgc_transforms.Sig2Features(ws, hs, td),
            mgc_transforms.DummyDim(),
            mgc_transforms.Scale(),
            tat.BLC2CBL(),
            mgc_transforms.Resize((self.args.freq_bands, self.args.freq_bands)),
        ])
    elif "attn" in self.model_name:
        T = tat.Compose([
            mgc_transforms.SimpleTrim(self.max_len),
            mgc_transforms.MEL(sr=16000, n_fft=600, hop_length=300,
                               n_mels=self.args.freq_bands // 2),
            #mgc_transforms.Scale(),
            mgc_transforms.SqueezeDim(2),
            tat.LC2CL(),
        ])
    elif "bytenet" in self.model_name:
        #offset = 714  # make clips divisible by 224
        T = tat.Compose([
            mgc_transforms.SimpleTrim(self.max_len),
            #tat.PadTrim(self.max_len),
            mgc_transforms.Scale(),
            tat.LC2CL(),
        ])
    ds.transform = T
    if self.loss_criterion == "crossentropy":
        TT = mgc_transforms.XEntENC(ds.labels_dict)
        #TT = mgc_transforms.BinENC(ds.labels_dict, dtype=torch.int64)
    else:
        TT = mgc_transforms.BinENC(ds.labels_dict)
    ds.target_transform = TT
    ds.use_cache = self.use_cache
    if self.use_cache:
        ds.init_cache()
    if self.use_precompute:
        ds.load_precompute(self.model_name)
    dl = data.DataLoader(ds, batch_size=self.batch_size, drop_last=True,
                         num_workers=self.num_workers, collate_fn=bce_collate,
                         shuffle=True)
    if "attn" in self.model_name:
        dl.collate_fn = sort_collate
    return ds, dl