def generate(hop_size=256):
    """Endlessly yield randomly cropped training examples.

    Iterates over the module-level ``files`` list in a freshly shuffled
    order on every pass. Each file is loaded at the module-level sample
    rate ``sr`` and converted to a mel representation. If the mel has more
    frames than a randomly drawn budget, both the mel and the waveform are
    cropped to a random window.

    Args:
        hop_size: Number of audio samples per mel frame, used to convert
            between frame offsets and sample offsets when cropping.
            NOTE(review): presumably must match the hop used by
            ``universal_mel`` -- confirm.

    Yields:
        dict with keys:
            'mel': the (possibly cropped) mel array, first axis = frames.
            'mel_length': one-element list holding ``len(mel)``.
            'audio': the (possibly cropped) waveform.
            'v': first element of ``speaker_model([audio])``.
    """
    while True:
        # Reshuffle at the start of every pass over the file list.
        for path in sklearn.utils.shuffle(files):
            audio, _ = malaya_speech.load(path, sr=sr)
            mel = malaya_speech.featurization.universal_mel(audio)

            # Random crop budget, in samples and in frames.
            max_steps = random.randint(16384, 110_250)
            max_frames = max_steps // hop_size

            n_frames = len(mel)
            if n_frames > max_frames:
                # randint is inclusive, so the last valid start is allowed.
                first_frame = random.randint(0, n_frames - max_frames)
                first_step = first_frame * hop_size
                # NOTE(review): the audio window is ``max_steps`` samples,
                # which can exceed ``max_frames * hop_size`` by up to
                # ``hop_size - 1`` samples -- confirm this is intended.
                audio = audio[first_step:first_step + max_steps]
                mel = mel[first_frame:first_frame + max_frames, :]

            # The speaker model is run on the cropped waveform.
            embedding = speaker_model([audio])

            example = {
                'mel': mel,
                'mel_length': [len(mel)],
                'audio': audio,
                'v': embedding[0],
            }
            yield example
def signal_augmentation(wav):
    """Apply a seeded ``calc`` transform and, sometimes, mix in noise.

    A random seed is drawn and passed to ``calc`` together with ``wav``.
    When a Gaussian draw (mean 0.5, std 0.14) exceeds 0.6, a randomly
    chosen file from the module-level ``noises`` list is loaded, passed
    through ``calc`` with the same seed and an extra ``True`` argument,
    and mixed into the signal with a factor drawn uniformly from
    [0.05, 0.3].

    Args:
        wav: Input waveform, forwarded to ``calc``.

    Returns:
        The augmented signal cast to ``float32``.
    """
    seed = random.randint(0, 100_000_000)
    augmented = calc(wav, seed)

    # The draw exceeds 0.6 only in a minority of calls; otherwise the
    # transformed signal is returned without noise.
    if random.gauss(0.5, 0.14) <= 0.6:
        return augmented.astype('float32')

    # NOTE(review): the noise is loaded at a fixed 16000 Hz while other
    # loaders in this file use the module-level ``sr`` -- confirm the two
    # rates agree.
    noise_path = random.choice(noises)
    noise, _ = malaya_speech.load(noise_path, sr=16000)
    noise = calc(noise, seed, True)

    mix_factor = random.uniform(0.05, 0.3)
    noisy = augmentation.add_noise(augmented, noise, factor=mix_factor)
    return noisy.astype('float32')
def read_wav(f):
    """Load the audio file ``f`` at the module-level sample rate ``sr``.

    Args:
        f: Audio file reference, forwarded to ``malaya_speech.load``.

    Returns:
        Whatever ``malaya_speech.load`` returns, unchanged.
    """
    loaded = malaya_speech.load(f, sr=sr)
    return loaded