def test_synthesize():
    """End-to-end synthesis test.

    Loads a Tacotron2 model and a WaveGlow vocoder from ``files/``,
    synthesizes a fixed sentence to ``synthesized_audio.wav`` plus an
    alignment graph, then checks that:
      * the audio transcribes back to something at least 50% similar
        to the input text, and
      * both output files were actually written.
    Output files are removed afterwards.
    """
    model_path = os.path.join("files", "tacotron2_statedict.pt")
    waveglow_path = os.path.join("files", "waveglow_256channels_universal_v5.pt")
    graph_path = "graph.png"
    audio_path = "synthesized_audio.wav"
    model = load_model(model_path)
    assert model
    waveglow = load_waveglow_model(waveglow_path)
    assert waveglow
    text = "hello everybody my name is david attenborough"
    inflect_engine = inflect.engine()
    synthesize(model, waveglow, text, inflect_engine, graph=graph_path, audio=audio_path)
    # Round-trip check: transcription of the generated audio should
    # resemble the requested text (0.5 is a loose similarity floor).
    assert text_similarity(text, transcribe(audio_path)) > 0.5
    assert os.path.isfile(graph_path)
    assert os.path.isfile(audio_path)
    # Clean up generated artifacts so repeated runs start fresh.
    os.remove(graph_path)
    os.remove(audio_path)
def process_segments(audio_path, output_path, segments, min_length, max_length, logging=logging):
    """
    Generates audio clips and reduces segments to only valid ones.
    This includes removing segments which are too long, too short
    or cannot be transcribed.

    Parameters
    ----------
    audio_path : str
        Path to audio file
    output_path : str
        Path to save clips to
    segments : list
        List of segments (label, start, end) tuples
    min_length : int
        Minimum length of a clip (in milliseconds)
    max_length : int
        Maximum length of a clip (in milliseconds)
    logging : logging (optional)
        Logging object to write progress to

    Returns
    -------
    list
        List of samples (dictionaries containing clip index, start, end,
        name & transcript)
    """
    logging.info("Generating segments...")
    samples = []
    total = len(segments)
    index = 0
    for i, segment in enumerate(segments):
        _, time_start, time_end = segment
        time_length = time_end - time_start
        if min_length <= time_length <= max_length:
            name = cut_audio(audio_path, int(time_start), int(time_end), output_path)
            clip_path = os.path.join(output_path, name)
            # A failed transcription should skip the clip, not abort the
            # whole run — keeps behavior consistent with the documented
            # variant of this function.
            try:
                transcript = transcribe(clip_path)
            except Exception:
                logging.info(f"Could not transcribe {clip_path}")
                transcript = None
            if transcript:
                samples.append({
                    "index": index,
                    "start": time_start,
                    "end": time_end,
                    "name": name,
                    "transcript": transcript.strip(),
                })
                index += 1
        logging.info(f"Progress - {i+1}/{total}")
    return samples
def transcribe_clips(folder, labels, output_path, limit=5):
    """
    Transcribes clips in a folder and scores them against known labels.

    Parameters
    ----------
    folder : str
        Directory containing audio clips
    labels : str
        Path to the labels file (read with read_labels)
    output_path : str
        Path to save the transcription results to
    limit : int (optional)
        Maximum number of clips to process (default 5, matching the
        previously hard-coded cap)
    """
    files = os.listdir(folder)[:limit]
    label_map = read_labels(labels)
    data = []
    for filename in tqdm(files):
        prediction = transcribe(os.path.join(folder, filename))
        # Label key is the filename without its extension.
        actual = label_map[os.path.splitext(filename)[0]]
        score = compare(prediction, actual)
        data.append(Transcription(filename, prediction, actual, score))
    save_results(data, output_path)
def evalulate_audio(audio, text):
    """
    Gets list of words not recognised in the audio.
    Compares the transcription and given text.

    Parameters
    ----------
    audio : str
        Path to audio file
    text : str
        Synthesised text

    Returns
    -------
    set
        Set of words not recognised in the audio
    """
    transcription = transcribe(audio)
    expected = set(text.split(" "))
    recognised = set(transcription.split(" "))
    # Words we asked for but never heard back.
    return expected - recognised
def process_segments(audio_path, output_path, segments, min_length, max_length, logging=logging):
    """
    Generates audio clips and reduces segments to only valid ones.
    This includes removing segments which are too long, too short
    or cannot be transcribed.

    Parameters
    ----------
    audio_path : str
        Path to audio file
    output_path : str
        Path to save clips to
    segments : list
        List of segments produced in get_segments
    min_length : int
        Minimum length of a clip (in milliseconds)
    max_length : int
        Maximum length of a clip (in milliseconds)
    logging : logging (optional)
        Logging object to write progress to

    Returns
    -------
    list
        List of samples (dictionaries containing clip index, start, end,
        name & transcript)
    """
    logging.info("Generating segments...")
    samples = []
    total = len(segments)
    index = 0
    for i in range(total):
        segment = segments[i]
        _, time_start, time_end = segment
        time_length = time_end - time_start
        if min_length <= time_length <= max_length:
            name = cut_audio(audio_path, int(time_start), int(time_end), output_path)
            clip_path = os.path.join(output_path, name)
            try:
                transcript = transcribe(clip_path)
            # Narrowed from a bare except: a bare clause also swallows
            # KeyboardInterrupt/SystemExit, which should propagate.
            except Exception:
                logging.info(f"Could not transcribe {clip_path}")
                transcript = None
            if transcript:
                samples.append({
                    "index": index,
                    "start": time_start,
                    "end": time_end,
                    "name": name,
                    "transcript": transcript.strip(),
                })
                index += 1
        logging.info(f"Progress - {i+1}/{total}")
    return samples
def evalulate_audio(audio, text):
    """
    Gets the set of words from ``text`` that do not appear in the
    transcription of ``audio``.

    Parameters
    ----------
    audio : str
        Path to audio file
    text : str
        Synthesised text

    Returns
    -------
    set
        Set of words not recognised in the audio
    """
    produced = transcribe(audio).split(" ")
    missing = set(text.split(" ")).difference(produced)
    return missing