def test_windows1252(self):
    """Parse Windows-1252 content and check item count, EOL and decode errors."""
    # Read inside a ``with`` block so the fixture handle is closed right away
    # instead of lingering until garbage collection.
    # NOTE(review): assumes ``copen`` returns a file object that supports the
    # context-manager protocol (as ``codecs.open`` / ``io.open`` do) - confirm.
    with copen(self.windows_path, encoding='windows-1252') as vtt_handle:
        vtt_string = vtt_handle.read()

    vtt_file = from_string(vtt_string, encoding='windows-1252', eol='\r\n')
    self.assertEqual(len(vtt_file), 1332)
    self.assertEqual(vtt_file.eol, '\r\n')

    # Opening the UTF-8 fixture as ASCII is expected to fail while decoding.
    self.assertRaises(UnicodeDecodeError, vttopen, self.utf8_path,
                      encoding='ascii')
def test_compare_from_string_and_from_path(self):
    """Check that parsing from a path and from a string yields equal items."""
    # Close the fixture deterministically rather than leaking the handle.
    with codecs.open(self.utf8_path, encoding='utf_8') as utf8_handle:
        unicode_content = utf8_handle.read()

    file_items = list(pyvtt.open(self.utf8_path))
    string_items = list(pyvtt.from_string(unicode_content))

    # zip() stops at the shorter input, so without this check a parser that
    # dropped trailing items on one side would go unnoticed.
    self.assertEqual(len(file_items), len(string_items))
    for file_item, string_item in zip(file_items, string_items):
        self.assertEqual(str(file_item), str(string_item))
def test_utf8(self):
    """Parse the UTF-8 fixture and check the Windows fixture is not UTF-8."""
    # Close the fixture deterministically rather than leaking the handle.
    with codecs.open(self.utf8_path, encoding='utf_8') as utf8_handle:
        unicode_content = utf8_handle.read()
    self.assertEqual(len(pyvtt.from_string(unicode_content)), 1332)

    # Decoding is pinned to UTF-8 so the expected failure does not depend on
    # the platform's default locale encoding; the handle is closed afterwards.
    with open(self.windows_path, encoding='utf_8') as windows_handle:
        self.assertRaises(UnicodeDecodeError, windows_handle.read)
def vtt2bcc(path, threshold=0.1, word=True):
    """Convert WebVTT subtitles into a bcc caption dictionary.

    Args:
        path: Path to a WebVTT file. If nothing exists at that path, the
            value itself is parsed as WebVTT text. A falsy value is treated
            as the empty string.
        threshold: Duration in seconds used when merging in word mode. A
            word segment ending within ``threshold`` of the running start
            time is folded into the previous caption, and a plain cue
            shorter than ``threshold`` starts where the previous caption
            ended.
        word: When true, split cues on inline ``<timestamp><c>text</c>``
            markers. When false, emit one caption per cue using the last
            line of its tag-stripped text.

    Returns:
        A dict with styling keys and a ``"body"`` list of captions (each
        with ``from``, ``to``, ``location`` and ``content``), or ``{}``
        when no cues were parsed.
    """
    path = path if path else ""
    if os.path.exists(path):
        subs = pyvtt.open(path)
    else:
        subs = pyvtt.from_string(path)

    # Bail out early: indexing the last caption below would otherwise raise
    # IndexError before the empty result could be returned.
    if not subs:
        return {}

    caption_list = []
    if not word:
        caption_list = [{
            "from": sub.start.ordinal / 1000,
            "to": sub.end.ordinal / 1000,
            "location": 2,
            "content": sub.text_without_tags.split("\n")[-1],
        } for sub in subs]
    else:
        # NOTE: split the bcc body following the vtt word-timing markers.
        # Compiled once here instead of once per cue.
        word_timing = re.compile(r"<(.*?)><c>(.*?)</c>")
        for sub in subs:
            text = sub.text
            start = sub.start.ordinal / 1000
            end = sub.end.ordinal / 1000
            try:
                # Raises ValueError when the cue carries no inline tags,
                # which routes it to the plain-cue handling below.
                idx = text.index("<")
                pre_text = text[:idx]
                # NOTE(review): a cue that contains "<" but no word-timing
                # marker yields no matches and therefore adds no caption.
                for t_str, match in word_timing.findall(text):
                    pre_text += match
                    t = datetime.strptime(t_str, r"%H:%M:%S.%f")
                    # ``%f`` always yields microseconds, so divide by 10**6;
                    # scaling by the digit count breaks on leading zeros
                    # (".050" would become 0.5 instead of 0.05).
                    sec = (t.hour * 3600 + t.minute * 60 + t.second +
                           t.microsecond / 10**6)
                    final_text = pre_text.split("\n")[-1]
                    if caption_list and (
                            sec - start <= threshold
                            or caption_list[-1]["content"] == final_text):
                        caption_list[-1].update({
                            "to": sec,
                            "content": final_text,
                        })
                    else:
                        caption_list.append({
                            "from": start,
                            "to": sec,
                            "location": 2,
                            "content": final_text,
                        })
                    start = sec
            except ValueError:
                # No inline tags (or an unparsable timestamp): treat the
                # cue as a single caption built from its last text line.
                final_text = sub.text.split("\n")[-1]
                if caption_list and caption_list[-1]["content"] == final_text:
                    caption_list[-1].update({
                        "to": end,
                        "content": final_text,
                    })
                else:
                    if caption_list and end - start < threshold:
                        start = caption_list[-1]["to"]
                    caption_list.append({
                        "from": start,
                        "to": end,
                        "location": 2,
                        "content": final_text,
                    })

    # NOTE: avoid running past the end of the video.
    if caption_list:
        last = caption_list[-1]
        last["to"] = last["from"] + 0.1

    return {
        "font_size": 0.4,
        "font_color": "#FFFFFF",
        "background_alpha": 0.5,
        "background_color": "#9C27B0",
        "Stroke": "none",
        "body": caption_list,
    }