def vtt_to_df(fn):
    """
    Convert a WebVTT subtitle file to a DataFrame.

    args:
        fn - filepath to .vtt-file

    returns:
        DataFrame with one row per caption and the columns:
            time     - start time as a label such as '0h01m23s'
            start    - start time in whole seconds
            duration - (caption.end - caption.start) / 100000
            text     - caption text
    """
    # WebVTT files are UTF-8 by specification; do not depend on the locale.
    with open(fn, encoding='utf-8') as f:
        text = f.read()

    vtt = WebVTTReader().read(text)

    # strptime() without a date yields 1900-01-01, so subtracting that date
    # leaves just the offset from the beginning of the video.
    epoch = dt.datetime(1900, 1, 1)

    subtitles = []
    for caption in vtt.get_captions('en-US'):
        # Parse the start timestamp once and reuse it for both columns.
        start = dt.datetime.strptime(caption.format_start(), '%H:%M:%S.%f')
        subtitles.append({
            # Built by hand: the previous strftime pattern used '%m' (month)
            # where minutes were intended, and '%-H', which is not portable.
            'time': '{}h{:02d}m{:02d}s'.format(
                start.hour, start.minute, start.second),
            'start': int((start - epoch).total_seconds()),
            # NOTE(review): if caption.start/end are microseconds, this value
            # is in tenths of a second rather than seconds - confirm the
            # intended unit before changing the divisor.
            'duration': (caption.end - caption.start) / 100000,
            'text': caption.get_text(),
        })

    return pd.DataFrame(subtitles)
def getCaptions(url, progress_cb, so_far, task_weight):
    """
    Fetch the English captions of a video.

    args:
        url         - video URL handed to youtube_dl
        progress_cb - callable invoked once after the captions were parsed,
                      with ``so_far + task_weight`` for both arguments
        so_far      - progress accumulated before this task
        task_weight - progress contributed by this task

    returns:
        The captions read from the English WebVTT track, or an empty list
        when the video has no English subtitles (progress_cb is not called
        in that case).
    """
    ydl = youtube_dl.YoutubeDL({
        'writesubtitles': True,
        'allsubtitles': True,
        'writeautomaticsub': True,
    })
    with ydl:
        # download=False: only the metadata (including subtitle URLs) is needed.
        res = ydl.extract_info(url, download=False)

    # 'requested_subtitles' may be missing, None, or lack an 'en' entry;
    # treat all of these as "no English captions" instead of raising KeyError.
    english = (res.get('requested_subtitles') or {}).get('en')
    if not english:
        print('Youtube Video does not have any english captions')
        return []

    subtitle_url = english['url']
    print('Grabbing vtt file from ' + subtitle_url)
    response = requests.get(subtitle_url)

    # WebVTT is UTF-8 by specification; decoding as ASCII fails on the first
    # accented character or typographic quote.
    arr = WebVTTReader().read(response.content.decode('utf-8'))

    progress_cb(so_far + task_weight, so_far + task_weight)
    return arr.get_captions('en-US')
# NOTE(review): the statements down to `return new_caption_set` are the tail
# of a method whose `def` line lies outside this view; the 8-space indent is
# an assumption about the enclosing class/method - confirm against the file.
        # Build one new caption per original caption, carrying the translated
        # text taken from `match`.
        new_captions = []
        for s, sentence in enumerate(self.sentences):
            for c, caption in enumerate(sentence.captions):
                # `match` is indexed by [sentence index][caption index].
                trans = match[s][c]
                # Work on a deep copy so the original raw caption is not
                # modified when its nodes are replaced below.
                new_caption = deepcopy(caption.raw_caption)
                # Replace the caption content with a single text node holding
                # the whitespace-stripped translation.
                new_caption.nodes = [CaptionNode.create_text(trans.strip())]
                new_captions.append(new_caption)
                # print(f'"{caption.raw_text}"', f'"{trans}"')
        # The rebuilt captions are stored under the language key 'en'.
        new_caption_set = CaptionSet({'en': new_captions})
        return new_caption_set


# --- Script: rebuild a subtitle file from a translated text file ------------

# Read the source WebVTT file; its captions are stored under language 'de'.
input_file = Path("./sendung-vom-15112020-video-ut102~_type-webvtt.vtt")
read_srt = WebVTTReader().read(input_file.read_text('UTF-8'), lang='de')

# Wrap every raw caption in MyCaption and hand it to the sentence manager.
sentence_manager = SentenceManager()
for raw_caption in read_srt.get_captions('de'):
    caption = MyCaption(raw_caption)
    sentence_manager.add_caption(caption)

# Disabled earlier step. NOTE(review): presumably this produced ./output.txt,
# from which ./output_fixed.txt and ./translated.txt were derived - confirm.
# sentence_manager.finish()
# print(sentence_manager)
# sentence_manager.write_to_file(Path("./output.txt"))

# Pair the two text files up; the result is what new_caption_set_from_match
# consumes. NOTE(review): exact file format expected by
# match_translation_from_file is not visible here.
match = sentence_manager.match_translation_from_file(
    Path("./output_fixed.txt"), Path("./translated.txt"))

# Build the new caption set, serialise it as SRT, echo it to stdout and
# write it to ./translated.srt as UTF-8.
new_caption_set = sentence_manager.new_caption_set_from_match(match)
srt_output = SRTWriter().write(new_caption_set)
print(srt_output)
Path("./translated.srt").write_text(srt_output, 'UTF-8')