def merge_subtitle(sub_a, sub_b, delta):
    out = WebVTTFile()
    intervals = [item.start.ordinal for item in sub_a]
    intervals.extend([item.end.ordinal for item in sub_a])
    intervals.extend([item.start.ordinal for item in sub_b])
    intervals.extend([item.end.ordinal for item in sub_b])
    intervals.sort()

    j = k = 0
    for i in xrange(1, len(intervals)):
        start = WebVTTTime.from_ordinal(intervals[i - 1])
        end = WebVTTTime.from_ordinal(intervals[i])

        if (end - start) > delta:
            text_a, j = find_subtitle(sub_a, start, end, j)
            text_b, k = find_subtitle(sub_b, start, end, k)

            text = join_lines(text_a, text_b)
            if len(text) > 0:
                item = WebVTTItem(0, start, end, text)
                out.append(item)

    out.clean_indexes()
    return out
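# merge_subtitle leans on two helpers, find_subtitle and join_lines, that are
# not part of this excerpt. The names are the script's own, but the bodies
# below are only a sketch of the behaviour the merge loop appears to assume:
# find_subtitle scans forward from a remembered index and returns the text of
# the cue covering the current interval (plus the index it stopped at), and
# join_lines stacks the two texts. The real helpers may differ.
def find_subtitle(subtitle, from_t, to_t, lo=0):
    # Assumes pyvtt-style items with .start, .end and .text attributes.
    i = lo
    while i < len(subtitle):
        if subtitle[i].start >= to_t:
            break
        if subtitle[i].start <= from_t and to_t <= subtitle[i].end:
            return subtitle[i].text, i
        i += 1
    return "", lo


def join_lines(text_a, text_b):
    # Keep whichever text is non-empty; stack both when both are present.
    if len(text_a) and len(text_b):
        return "%s\n%s" % (text_a, text_b)
    return text_a + text_b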
def main(options):
    # Ensure ffmpeg is around
    if not run_ffmpeg(['-version']):
        log.error("ffmpeg needs to be available to strip audio from the video file.")
        exit(1)

    with NamedTemporaryFile(delete=True) as vid_file:
        log.info("Downloading %s - this might take a while." % options.vid_url)
        response = get(options.vid_url, stream=True)

        total_length = response.headers.get("content-length")
        if total_length is None:  # no content length header
            log.info("Unknown length - can't predict how long this will take.")
            vid_file.write(response.content)
        else:
            bar = ProgressBar(max_value=int(total_length))
            dl = 0
            for data in response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
                dl += len(data)
                vid_file.write(data)
                vid_file.flush()
                bar.update(dl)

        log.info("Download done. Stripping audio.")
        (wav_file, wav_file_name) = mkstemp('.wav')
        result = run_ffmpeg([
            "-y", "-i", vid_file.name, "-vn", "-acodec", "pcm_s16le",
            "-ar", "16000", "-ac", "1", wav_file_name
        ])
        if not result:
            close(wav_file)
            log.error("ffmpeg failed. Bailing.")
            exit(1)

        fs, audio = wav.read(wav_file_name)
        close(wav_file)

        log.info("Will write VTT to %s" % options.output)

        # Make sure the WAV is to code...
        log.info("Loading up WAV file...")
        if fs != 16000:
            log.error("Only 16000hz WAV files are usable.")
            exit(1)

        total_samples = len(audio)
        duration_hours, duration_minutes, duration_seconds = sample_index_to_time(
            len(audio))
        log.info("Approximate duration: %d:%02d:%02d" %
                 (duration_hours, duration_minutes, duration_seconds))

        # Let's load up DeepSpeech and get it ready.
        log.info("Loading pre-trained DeepSpeech model...")
        root_model_dir = path.join(options.deepspeech_model_dir, MODEL_DIR)

        model = path.join(root_model_dir, MODEL_FILE)
        alphabet = path.join(root_model_dir, MODEL_ALPHABET)
        lang_model = path.join(root_model_dir, MODEL_LANG_MODEL)
        trie = path.join(root_model_dir, MODEL_TRIE)

        deepspeech = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
        log.info("Done loading model.")

        log.info("Loading language model...")
        deepspeech.enableDecoderWithLM(alphabet, lang_model, trie, LM_WEIGHT,
                                       WORD_COUNT_WEIGHT,
                                       VALID_WORD_COUNT_WEIGHT)
        log.info("Done loading language model.")

        playhead = 0
        out = WebVTTFile()

        bar = ProgressBar(max_value=total_samples)
        while playhead < (total_samples - 1):
            end_point = min(playhead + AUDIO_SEGMENT_SAMPLES, (total_samples - 1))
            segment = audio[playhead:end_point]

            inference = deepspeech.stt(segment, fs)
            log.debug("Inferred: %s" % inference)

            start_hours, start_minutes, start_seconds = sample_index_to_time(
                playhead)
            playhead = end_point
            end_hours, end_minutes, end_seconds = sample_index_to_time(playhead)

            if not inference or inference == "ah":
                continue

            for search, replace in INFERENCE_REPLACEMENTS.iteritems():
                inference = sub(r"\b" + search + r"\b", replace, inference)

            inference = fill(inference, width=MAX_CAPTION_WIDTH)

            start = WebVTTTime(start_hours, start_minutes, start_seconds)
            end = WebVTTTime(end_hours, end_minutes, end_seconds)

            item = WebVTTItem(0, start, end, inference)
            out.append(item)
            bar.update(playhead)

        out.clean_indexes()
        out.save(options.output, encoding="utf-8")
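# main() also depends on a sample_index_to_time helper that is not shown in
# this excerpt. Below is a minimal sketch of what it might look like, assuming
# the fixed 16 kHz sample rate the script enforces; the real helper may differ.
SAMPLE_RATE = 16000  # assumed constant; the script rejects non-16000 Hz WAVs


def sample_index_to_time(sample_index):
    # Convert an absolute sample index into (hours, minutes, seconds) at the
    # assumed 16 kHz rate.
    total_seconds = sample_index // SAMPLE_RATE
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return hours, minutes, seconds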