Ejemplo n.º 1
0
def test_deepspeech():
    """DeepSpeech should transcribe the sample clip close enough to TEXT."""
    stt = DeepSpeech(os.path.join("test_samples", "english.pbmm"))

    result = stt.transcribe(os.path.join("test_samples", "audio.wav"))
    assert similarity(TEXT, result) > MIN_SYNTHESIS_SCORE
Ejemplo n.º 2
0
    def sim_align(self, a, start, end):
        """Align string *a* against the window self.text[start:end].

        Scores every contiguous span of whitespace-split words in the
        window against *a* and returns the character offsets of the
        best-matching span within self.text.

        Parameters
        ----------
        a : str
            Text to align (e.g. a transcription fragment).
        start, end : int
            Character bounds of the search window in self.text.

        Returns
        -------
        (int, int, float)
            Start offset, end offset, and the best similarity score.
        """
        source = self.text[start:end]
        words = source.split(" ")
        best = ""
        best_score = 0
        for i in range(len(words)):
            # Bug fix: the slice end must run to len(words) + 1 so that
            # spans including the last word are candidates; the original
            # range(i, len(words)) could never match anything ending at
            # the final word and wastefully scored the empty span j == i.
            for j in range(i + 1, len(words) + 1):
                t = " ".join(words[i:j])
                score = similarity(a, t)
                if score > best_score:
                    best = t
                    best_score = score

        # NOTE(review): .index returns the FIRST occurrence of `best` in the
        # whole text, which may lie outside [start, end] if the phrase
        # repeats earlier — confirm whether the search should begin at
        # `start` (self.text.index(best, start)).
        start = self.text.index(best)
        end = start + len(best)
        return start, end, best_score
Ejemplo n.º 3
0
def test_hifigan_synthesis():
    """End-to-end check: synthesize with HiFi-GAN, then verify the audio
    file exists and transcribes back close enough to the input text."""
    vocoder = Hifigan(
        os.path.join("test_samples", "hifigan.pt"),
        os.path.join("test_samples", "config.json"),
    )
    out_path = "synthesized_audio.wav"
    text = "the monkeys live"

    synthesize(
        model=FakeModelForSynthesis(),
        text=text,
        graph_path=None,
        audio_path=out_path,
        vocoder=vocoder,
    )

    assert os.path.isfile(out_path)
    stt = Silero()
    assert similarity(text, stt.transcribe(out_path)) > MIN_SYNTHESIS_SCORE

    os.remove(out_path)
Ejemplo n.º 4
0
def generate_clips_from_subtitles(
    audio_path,
    subs,
    transcription_model,
    output_path,
    logging=logging,
    min_length=MIN_LENGTH,
    max_length=MAX_LENGTH,
    min_confidence=MIN_CONFIDENCE,
):
    """
    Generates clips from subtitles.

    Parameters
    ----------
    audio_path : str
        Path to audio file (must have been converted using convert_audio)
    subs : list
        List of pysrt subtitle objects
    transcription_model : TranscriptionModel
        Transcription model
    output_path : str
        Path to save audio clips to
    logging : logging (optional)
        Logging object to write logs to
    min_length : float (optional)
        Minimum duration of a clip in seconds
    max_length : float (optional)
        Maximum duration of a clip in seconds
    min_confidence : float (optional)
        Minimum confidence score to generate a clip for

    Returns
    -------
    (list, list, list)
        List of matched clip dicts, list of filenames that could not be
        transcribed, and list of clip lengths in seconds
    """
    logging.info("Loading subtitles...")
    total = len(subs)
    logging.info(f"{total} subtitle lines detected...")

    result_fragments = []
    unlabelled_fragments = []
    clip_lengths = []
    for i, sub in enumerate(subs):
        duration = sub.duration.seconds + (sub.duration.milliseconds / 1000)
        if min_length <= duration <= max_length:
            start = sub.start.to_time().strftime("%H:%M:%S.%f")
            end = sub.end.to_time().strftime("%H:%M:%S.%f")
            filename = cut_audio(audio_path, start, end, output_path)
            clip_path = os.path.join(output_path, filename)

            try:
                transcript = transcription_model.transcribe(clip_path)
            except Exception as e:
                # Narrowed from a bare except: a bare clause also swallows
                # KeyboardInterrupt/SystemExit; best-effort behaviour is
                # kept, but the failure reason is now logged.
                logging.info(f"Could not transcribe {clip_path}")
                logging.info(str(e))
                transcript = None

            if transcript:
                text = sub.text.strip().replace("\n", " ")
                score = similarity(transcript, text)
                # Only keep clips whose transcription agrees with the
                # subtitle text above the confidence threshold.
                if score >= min_confidence:
                    result_fragments.append(
                        {
                            "name": filename,
                            "start": start,
                            "end": end,
                            "duration": duration,
                            "transcript": transcript,
                            "text": text,
                            "score": score,
                        }
                    )
                    clip_lengths.append(duration)
            else:
                unlabelled_fragments.append(filename)
        logging.info(f"Progress - {i+1}/{total}")

    return result_fragments, unlabelled_fragments, clip_lengths
Ejemplo n.º 5
0
def test_silero():
    """Silero should transcribe the sample clip close enough to TEXT."""
    stt = Silero()

    sample = os.path.join("test_samples", "audio.wav")
    assert similarity(TEXT, stt.transcribe(sample)) > MIN_SYNTHESIS_SCORE
Ejemplo n.º 6
0
def test_similarity():
    """Sanity bounds: disjoint strings score 0, identical strings score 1."""
    cases = [("abc", "def", 0), ("abc", "abc", 1)]
    for first, second, expected in cases:
        assert similarity(first, second) == expected