def featurize(self, audio_clip):
    """ Compute the feature representation of a single audio clip.

    When ``self.spectrogram`` is truthy the clip is handed to
    ``spectrogram_from_file``; otherwise the waveform is loaded with
    ``wav.read`` and passed to ``pncc`` with no further arguments.

    Params:
        audio_clip (str): Path to the audio clip
    """
    if not self.spectrogram:
        # Only the samples are forwarded; the sample rate returned by
        # wav.read is not used on this path.
        _, signal = wav.read(audio_clip)
        return pncc(signal)

    return spectrogram_from_file(
        audio_clip,
        step=self.step,
        window=self.window,
        max_freq=self.max_freq,
    )
def featurize(self, audio_clip):
    """ Return the spectrogram features of an audio clip.

    Delegates to ``spectrogram_from_file`` using this object's ``step``,
    ``window`` and ``max_freq`` settings (described by the original
    author as the log of the clip's Fourier Transform).

    Params:
        audio_clip (str): Path to the audio clip
    """
    spectrogram_options = {
        'step': self.step,
        'window': self.window,
        'max_freq': self.max_freq,
    }
    return spectrogram_from_file(audio_clip, **spectrogram_options)
def featurize(self, audio_clip):
    """ Compute the configured feature for a single audio clip.

    Params:
        audio_clip (str): Path to the audio clip

    Returns the result of ``spectrogram_from_file`` when
    ``self.spectrogram`` is truthy, otherwise the result of ``mfcc`` with
    ``self.mfcc_dim`` cepstral coefficients.
    """
    if self.spectrogram:
        features = spectrogram_from_file(
            audio_clip,
            step=self.step,
            window=self.window,
            max_freq=self.max_freq,
        )
    else:
        sample_rate, samples = wav.read(audio_clip)
        features = mfcc(samples, sample_rate, numcep=self.mfcc_dim)
    return features
def featurize(self, audio_clip):
    """ Turn an audio clip into the feature type selected on this object.

    Params:
        audio_clip (str): Path to the audio clip
    """
    if not self.spectrogram:
        # MFCC path: load the raw waveform first, then extract
        # self.mfcc_dim cepstral coefficients from it.
        sample_rate, samples = wav.read(audio_clip)
        return mfcc(samples, sample_rate, numcep=self.mfcc_dim)

    return spectrogram_from_file(
        audio_clip,
        step=self.step,
        window=self.window,
        max_freq=self.max_freq,
    )
def featurize(self, audio_clip):
    """ Compute either spectrogram or MFCC features for an audio clip.

    The choice is driven by ``self.spectrogram``; both paths read the
    clip directly from disk through a helper.

    Params:
        audio_clip (str): Path to the audio clip
    """
    if not self.spectrogram:
        return mfcc_from_file(filename=audio_clip, mfcc_dim=self.mfcc_dim)

    return spectrogram_from_file(
        audio_clip,
        step=self.step,
        window=self.window,
        max_freq=self.max_freq,
    )
def featurize(self, audio_clip, target=False):
    """ Return the spectrogram features of an audio clip.

    Params:
        audio_clip(str): Path to the audio clip
        target (bool): When truthy, no padding is requested (pad=0);
            otherwise ``self.pad`` is used.

    Only the first element of what ``spectrogram_from_file`` returns is
    passed back to the caller.
    """
    pad = 0 if target else self.pad
    result = spectrogram_from_file(
        audio_clip,
        step=self.step,
        window=self.window,
        max_freq=self.max_freq,
        pad=pad,
        log=self.use_log,
    )
    return result[0]
def featurize(self, audio_clip):
    """ Compute the feature representation selected for this object.

    Three outputs are possible:
      * a spectrogram, when ``self.spectrogram`` is set and ``self.raw``
        is not;
      * the raw samples reshaped to a single column, when ``self.raw``
        is set;
      * MFCC features with ``self.mfcc_dim`` coefficients otherwise.

    Params:
        audio_clip (str): Path to the audio clip
    """
    wants_spectrogram = self.spectrogram and not self.raw
    if wants_spectrogram:
        return spectrogram_from_file(
            audio_clip,
            step=self.step,
            window=self.window,
            max_freq=self.max_freq,
        )

    # Both remaining modes start from the decoded waveform.
    sample_rate, samples = wav.read(audio_clip)
    if self.raw:
        return samples.reshape(-1, 1)
    return mfcc(samples, sample_rate, numcep=self.mfcc_dim)
num_hidden = 50 num_layers = 1 batch_size = 1 initial_learning_rate = 1e-2 momentum = 0.9 num_examples = 1 num_batches_per_epoch = int(num_examples / batch_size) # Loading the data audio_filename = maybe_download('LDC93S1.wav', 93638) target_filename = maybe_download('LDC93S1.txt', 62) inputs = spectrogram_from_file(audio_filename, step=10, window=20, max_freq=8000, eps=1e-14) # Tranform in 3D array #print(len(inputs)) train_inputs = np.asarray(inputs[np.newaxis, :]) #print(len(train_inputs)) train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs) train_seq_len = [train_inputs.shape[1]] print(train_seq_len) # Readings targets with open(target_filename, 'r') as f: #Only the last line is necessary
def featurize(audio_clip, step=10, window=20, max_freq=22050, desc_file=None):
    """ Return the spectrogram of an audio clip, computed in test mode.

    Params:
        audio_clip: Clip to process, forwarded unchanged to
            ``spectrogram_from_file``.
        step: Forwarded as ``step`` (default 10).
        window: Forwarded as ``window`` (default 20).
        max_freq: Forwarded as ``max_freq`` (default 22050).
        desc_file: Accepted but not used by this function.
    """
    spectrogram_options = {
        'step': step,
        'window': window,
        'max_freq': max_freq,
    }
    return spectrogram_from_file(
        audio_clip, mode=ModelMode.TEST, **spectrogram_options)
# --- Disabled: model training -------------------------------------------
# from sample_models import own_model
# from train_utils import train_my_model
# from keras.backend.tensorflow_backend import set_session
# from utils import spectrogram_from_file, mfcc_from_file
# import tensorflow as tf
#
# allocate 50% of GPU memory (if you like, feel free to change this)
# config = tf.ConfigProto()
# config.gpu_options.per_process_gpu_memory_fraction = 0.5
# set_session(tf.Session(config=config))
#
# train
# model = own_model(input_dim=161, output_dim=29)
# train_my_model(model, pickle_path='own_model_loss.pickle',
#                save_model_path='own_model.h5')

import numpy as np
from utils import spectrogram_from_file, mfcc_from_file

# Compute and print the spectrogram of the example clip.
# NOTE(review): the positional arguments 10, 20, 8000 are presumably
# step, window and max_freq -- confirm against utils.spectrogram_from_file.
example_wav = 'audio/example.wav'
s = spectrogram_from_file(example_wav, 10, 20, 8000)
print(s)

# --- Disabled: extra inspection -----------------------------------------
# print(np.isnan(s))
# print(s.shape)
# mfcc = mfcc_from_file('audio/example.wav', 13)
# print(mfcc.shape)
# print(mfcc[0:2,:])
def load_metadata_from_desc_file(self, desc_file, partition='train',
                                 max_duration=10.0):
    """ Read metadata from the description file
        (possibly takes long, depending on the filesize)

    Each line of the file is parsed as one JSON object. An utterance is
    kept only if all of the following hold:
      * its 'duration' does not exceed ``max_duration``;
      * its encoded label is not longer than its spectrogram;
      * its encoded label has at least 2 entries.
    Lines that raise while being parsed or measured are logged and
    skipped. The surviving paths, durations, texts and arpabets are
    stored on ``self`` under the attributes of the chosen partition.

    Params:
        desc_file (str): Path to a JSON-line file that contains labels and
            paths to the audio files
        partition (str): One of 'train', 'validation' or 'test'
        max_duration (float): In seconds, the maximum duration of
            utterances to train or test on

    Raises:
        Exception: If ``partition`` is not one of the three known names
            (raised after the file has been read).
    """
    logger.info('Reading description file: {} for partition: {}'
                .format(desc_file, partition))
    audio_paths, durations, texts, arpabets = [], [], [], []
    with open(desc_file, encoding='utf-8') as json_line_file:
        for line_num, json_line in enumerate(json_line_file):
            try:
                spec = json.loads(json_line)
                duration = float(spec['duration'])
                if duration > max_duration:
                    continue
                text = spec['text']
                textlen = len(text_to_int_sequence(text_normalize(text)))
                key = spec['key']
                speclen = len(spectrogram_from_file(key))
                if textlen > speclen:
                    print('label > feats ignore setence')
                    continue
                if textlen < 2:
                    print('small label ignore setence')
                    continue
                # Read every field BEFORE appending anything: a missing
                # 'arpabet' key must reject the whole line instead of
                # leaving the parallel lists with different lengths.
                arpabet = spec['arpabet'] if self.use_arpabets else ''
            except Exception as e:
                # Change to (KeyError, ValueError) or
                # (KeyError,json.decoder.JSONDecodeError), depending on
                # json module version
                # logger.warning replaces the deprecated logger.warn alias.
                logger.warning('Error reading line #{}: {}'
                               .format(line_num, json_line))
                logger.warning(str(e))
            else:
                # Not reached when the try suite exits via `continue`.
                audio_paths.append(key)
                durations.append(duration)
                texts.append(text)
                # '' is the placeholder when arpabets are not in use.
                arpabets.append(arpabet)

    if partition == 'train':
        self.train_audio_paths = audio_paths
        self.train_durations = durations
        self.train_texts = texts
        self.train_arpabets = arpabets
    elif partition == 'validation':
        self.val_audio_paths = audio_paths
        self.val_durations = durations
        self.val_texts = texts
        self.val_arpabets = arpabets
    elif partition == 'test':
        self.test_audio_paths = audio_paths
        self.test_durations = durations
        self.test_texts = texts
        self.test_arpabets = arpabets
    else:
        raise Exception("Invalid partition to load metadata. \n"
                        "Must be train/validation/test")
savedir = os.path.join(os.path.dirname(model_path), 'samples_{}ep'.format(best_epoch)) if not os.path.isdir(savedir): os.mkdir(savedir) model = train_loop_best.model.cpu() if hasattr(model, 'blocks'): for block in model.blocks: block.rnn.flatten_parameters() print('Generating samples...') for k, batch in tqdm(enumerate(test_loader)): f = G_test.audio_paths[k] spec, phase = spectrogram_from_file(f, window=window_size, step=step_size, log=args.use_log) #ref, phase_ref = spectrogram_from_file(f, window=window, step=step) with torch.no_grad(): Y_hat, test_loss, layer_outputs = test_fn_all_layers( model, train_loop_best.criterion, batch) # Visualize layer outputs depending on model if type(model) == HighwayModel: # need to plot both output and gate hiddens, masks = layer_outputs n_layers = len(hiddens) if n_layers > 1: fig, axes = plt.subplots(4, n_layers) else: