Example #1
def merge_subtitle(sub_a, sub_b, delta):
    """Merge two subtitle tracks by splitting the timeline at every cue
    boundary and joining the text of the cues that cover each slice."""
    out = WebVTTFile()
    # Collect every start and end timestamp from both tracks; once sorted,
    # each consecutive pair bounds one candidate cue of the merged track.
    intervals = [item.start.ordinal for item in sub_a]
    intervals.extend([item.end.ordinal for item in sub_a])
    intervals.extend([item.start.ordinal for item in sub_b])
    intervals.extend([item.end.ordinal for item in sub_b])
    intervals.sort()

    j = k = 0
    for i in range(1, len(intervals)):
        start = WebVTTTime.from_ordinal(intervals[i - 1])
        end = WebVTTTime.from_ordinal(intervals[i])

        # Ignore slivers shorter than delta; they would flicker on screen.
        if (end - start) > delta:
            text_a, j = find_subtitle(sub_a, start, end, j)
            text_b, k = find_subtitle(sub_b, start, end, k)

            text = join_lines(text_a, text_b)
            if len(text) > 0:
                item = WebVTTItem(0, start, end, text)
                out.append(item)

    out.clean_indexes()
    return out
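
find_subtitle and join_lines are helpers from the same project that the example does not include. Below is a minimal sketch of what would satisfy the call sites above, assuming cues are sorted by start time, that find_subtitle returns the text of the cue covering the slice plus the index where scanning stopped, and that join_lines simply stacks whatever text each track contributed; the bodies are inferred from the calls, not copied from the source:

def find_subtitle(subtitle, from_t, to_t, lo=0):
    # Resume scanning at index lo so repeated calls stay linear overall
    # (assumes cues are sorted by start time).
    i = lo
    while i < len(subtitle):
        if subtitle[i].start >= to_t:
            break
        if subtitle[i].start <= from_t and to_t <= subtitle[i].end:
            return subtitle[i].text, i
        i += 1
    return "", i

def join_lines(text_a, text_b):
    # Keep whichever track(s) had text in this slice, one per line.
    return "\n".join(t for t in (text_a, text_b) if t)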
Example #2
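The main function below relies on two helpers that are not part of the snippet, plus module-level constants (MODEL_DIR, AUDIO_SEGMENT_SAMPLES, INFERENCE_REPLACEMENTS, and friends) that are likewise not shown. Here is a minimal sketch of the two helpers, assuming run_ffmpeg returns True when the ffmpeg binary runs and exits cleanly, and that sample_index_to_time converts an offset into the 16 kHz sample array (matching the -ar 16000 flag passed to ffmpeg below) into an (hours, minutes, seconds) tuple:

from subprocess import call, DEVNULL

def run_ffmpeg(args):
    # True only if the ffmpeg binary exists and exits with status 0.
    try:
        return call(["ffmpeg"] + args, stdout=DEVNULL, stderr=DEVNULL) == 0
    except OSError:
        return False

def sample_index_to_time(sample_index, sample_rate=16000):
    # Convert a sample offset to wall-clock (hours, minutes, seconds).
    seconds = sample_index // sample_rate
    return seconds // 3600, (seconds % 3600) // 60, seconds % 60
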
def main(options):
    # Ensure ffmpeg is around
    if not run_ffmpeg(['-version']):
        log.error(
            "ffmpeg needs to be available to strip audio from the video file.")
        exit(1)

    with NamedTemporaryFile(delete=True) as vid_file:
        log.info("Downloading %s - this might take a while." % options.vid_url)
        response = get(options.vid_url, stream=True)
        total_length = response.headers.get("content-length")
        if total_length is None:  # no content-length header
            log.info("Unknown length - can't predict how long this will take.")
            vid_file.write(response.content)
        else:
            bar = ProgressBar(max_value=int(total_length))
            dl = 0
            for data in response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
                dl += len(data)
                vid_file.write(data)
                vid_file.flush()
                bar.update(dl)

        log.info("Download done. Stripping audio.")
        (wav_file, wav_file_name) = mkstemp('.wav')
        result = run_ffmpeg([
            "-y", "-i", vid_file.name, "-vn", "-acodec", "pcm_s16le", "-ar",
            "16000", "-ac", "1", wav_file_name
        ])
        if not result:
            close(wav_file)
            log.error("ffmpeg failed. Bailing.")
            exit(1)

        fs, audio = wav.read(wav_file_name)
        close(wav_file)

    log.info("Will write VTT to %s" % options.output)
    # Make sure the WAV is to code...
    log.info("Checking the WAV file...")

    if fs != 16000:
        log.error("Only 16000 Hz WAV files are usable.")
        exit(1)

    total_samples = len(audio)
    duration_hours, duration_minutes, duration_seconds = sample_index_to_time(
        len(audio))
    log.info("Approximate duration: %d:%02d:%02d" %
             (duration_hours, duration_minutes, duration_seconds))

    # Let's load up DeepSpeech and get it ready.
    log.info("Loading pre-trained DeepSpeech model...")
    root_model_dir = path.join(options.deepspeech_model_dir, MODEL_DIR)

    model = path.join(root_model_dir, MODEL_FILE)
    alphabet = path.join(root_model_dir, MODEL_ALPHABET)
    lang_model = path.join(root_model_dir, MODEL_LANG_MODEL)
    trie = path.join(root_model_dir, MODEL_TRIE)

    deepspeech = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    log.info("Done loading model.")

    log.info("Loading language model...")
    deepspeech.enableDecoderWithLM(alphabet, lang_model, trie, LM_WEIGHT,
                                   WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    log.info("Done loading language model.")

    playhead = 0

    out = WebVTTFile()

    bar = ProgressBar(max_value=total_samples)
    while playhead < (total_samples - 1):
        end_point = min(playhead + AUDIO_SEGMENT_SAMPLES, (total_samples - 1))
        segment = audio[playhead:end_point]
        inference = deepspeech.stt(segment, fs)
        log.debug("Inferred: %s" % inference)

        start_hours, start_minutes, start_seconds = sample_index_to_time(
            playhead)
        playhead = end_point
        end_hours, end_minutes, end_seconds = sample_index_to_time(playhead)

        # Nothing useful came back for this segment; skip it.
        if not inference or inference == "ah":
            continue

        for search, replace in INFERENCE_REPLACEMENTS.items():
            inference = sub(r"\b" + search + r"\b", replace, inference)

        inference = fill(inference, width=MAX_CAPTION_WIDTH)

        start = WebVTTTime(start_hours, start_minutes, start_seconds)
        end = WebVTTTime(end_hours, end_minutes, end_seconds)

        item = WebVTTItem(0, start, end, inference)
        out.append(item)
        bar.update(playhead)

        # Save after every segment so an interruption keeps partial output.
        out.save(options.output, encoding="utf-8")

    out.clean_indexes()
    out.save(options.output, encoding="utf-8")
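
main(options) reads exactly three attributes from options: vid_url, output, and deepspeech_model_dir. A hypothetical argparse entry point that would drive it, with flag names and defaults chosen for illustration only:

if __name__ == "__main__":
    from argparse import ArgumentParser

    parser = ArgumentParser(
        description="Download a video and caption it with DeepSpeech.")
    parser.add_argument("vid_url", help="URL of the video to download")
    parser.add_argument("-o", "--output", default="captions.vtt",
                        help="path of the WebVTT file to write")
    parser.add_argument("-d", "--deepspeech-model-dir", default=".",
                        help="directory that contains the DeepSpeech model")
    main(parser.parse_args())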