def get_subs(vtt_subs_path):
    """Load a WebVTT file and return its cues as plain dictionaries.

    The file is read as bytes and decoded with the encoding reported by
    ``utils.get_file_encoding`` before being handed to ``WebVTTReader``.
    Only the captions of the first language in the parsed set are used.

    Args:
        vtt_subs_path: Path of the WebVTT file to load.

    Returns:
        A list of dicts with the keys ``"text"``, ``"start"`` and ``"end"``.
        ``start`` and ``end`` are the cue values divided by 1,000,000
        (presumably microseconds converted to seconds -- TODO confirm
        against the caption library).
    """
    # Read raw bytes: a text-mode read returns ``str`` on Python 3, which
    # has no ``decode`` method, so the original text-mode read would fail.
    with open(vtt_subs_path, 'rb') as f:
        text = f.read().decode(utils.get_file_encoding(vtt_subs_path))

    vtt = WebVTTReader().read(text)
    vttsubs = vtt.get_captions(vtt.get_languages()[0])

    # Diagnostic output; written in the call form so it parses on both
    # Python 2 and Python 3 and prints the same text on each.
    print("vttsubs total: %i " % len(vttsubs))
    if vttsubs:  # guard: indexing an empty caption list would raise
        print(vttsubs[0].start)
        print(vttsubs[0].end)
        print(vttsubs[0].get_text())

    return [
        {
            "text": s.get_text(),
            "start": float(s.start) / 1000000,
            "end": float(s.end) / 1000000,
        }
        for s in vttsubs
    ]
def get_captions_from_output(self, output: str, language: str = 'en') -> 'pd.DataFrame':
    """Parse WebVTT text into a table of caption timings and content.

    Args:
        output: WebVTT document text to parse.
        language: Language code passed to the reader and used to select
            the captions; defaults to ``'en'``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``start``, ``end`` and
        ``content``, one row per caption. ``start``/``end`` are the first
        two items returned by ``self.get_time_from_caption``; ``content``
        is the last literal-``\\n``-separated segment of ``str(caption)``
        with apostrophes removed.
    """
    reader = WebVTTReader()
    starts, ends, contents = [], [], []
    for caption in reader.read(output, language).get_captions(language):
        caption_repr = str(caption)
        # str(caption) carries escaped newlines (a literal backslash + n);
        # the caption text is taken to be the final segment.
        content = caption_repr.split("\\n")[-1].replace("'", '')
        timestamp = self.get_time_from_caption(caption_repr)
        # Fill the columns directly. The previous implementation joined the
        # three fields with "," and split them again, which raised
        # IndexError as soon as the caption text itself contained a comma.
        starts.append(timestamp[0])
        ends.append(timestamp[1])
        contents.append(content)
    return pd.DataFrame({'start': starts, 'end': ends, 'content': contents})
class WebVTTReaderTestCase(unittest.TestCase):
    """Checks WebVTTReader detection, parsing and cue-text cleaning."""

    def setUp(self):
        # A fresh reader for every test.
        self.reader = WebVTTReader()

    def test_positive_answer_for_detection(self):
        """A WebVTT sample is detected as WebVTT."""
        detected = self.reader.detect(SAMPLE_WEBVTT)
        self.assertTrue(detected)

    def test_negative_answer_for_detection(self):
        """An SRT sample is not detected as WebVTT."""
        detected = self.reader.detect(SAMPLE_SRT)
        self.assertFalse(detected)

    def test_caption_length(self):
        """The WebVTT sample yields seven 'en-US' captions."""
        caption_set = self.reader.read(SAMPLE_WEBVTT)
        cues = caption_set.get_captions('en-US')
        self.assertEqual(len(cues), 7)

    def test_read_supports_multiple_languages(self):
        """Reading with lang='es' makes captions available under 'es'."""
        caption_set = self.reader.read(SAMPLE_WEBVTT, lang='es')
        spanish = caption_set.get_captions('es')
        self.assertIsNotNone(spanish)

    def test_proper_timestamps(self):
        """The third cue has the expected start and end values."""
        caption_set = self.reader.read(SAMPLE_WEBVTT)
        third_cue = caption_set.get_captions('en-US')[2]
        self.assertEqual(third_cue.start, 17000000)
        self.assertEqual(third_cue.end, 18752000)

    def test_webvtt_cue_components_removed_from_text(self):
        """Cue markup is removed and the voice tag becomes a 'Name:' prefix."""
        raw_cue = (
            "\n"  # the first line is skipped by the cleaner
            "<c vIntro><b>Wikipedia</b> is a great adventure. <i>It may have "
            "its shortcomings</i>, but it is<u> the largest</u> collective "
            "knowledge construction endevour</c> <ruby>base text <rt>"
            "annotation</rt></ruby> <v Audry><b>Yes</b>, indeed!"
        )
        expected = (
            "Wikipedia is a great adventure. It may have "
            "its shortcomings, but it is the largest collective "
            "knowledge construction endevour base text annotation"
            " Audry: Yes, indeed!"
        )
        cleaned = self.reader._clean(raw_cue)
        self.assertEqual(cleaned, expected)

    def test_empty_file(self):
        """Reading the empty sample raises CaptionReadNoCaptions."""
        read = WebVTTReader().read
        with self.assertRaises(CaptionReadNoCaptions):
            read(SAMPLE_WEBVTT_EMPTY)
def get_captions_from_output(self, output: str, url: str) -> str:
    """Turn WebVTT text into caption lines, optionally post-processed.

    Each 'en-US' caption is stringified, its literal ``\\n`` sequences are
    replaced by spaces, and it is passed to ``self.remove_time_from_caption``
    together with ``url``; a newline is appended to every result.

    Args:
        output: WebVTT document text to parse.
        url: Forwarded to ``remove_time_from_caption`` and
            ``process_captions``.

    Returns:
        The concatenated caption lines when ``self.search_query`` is the
        empty string, otherwise whatever ``self.process_captions`` returns
        for the list of lines and ``url``.
    """
    parsed = WebVTTReader().read(output)
    captions = [
        self.remove_time_from_caption(
            url, str(caption).replace(r'\n', " ")) + "\n"
        for caption in parsed.get_captions('en-US')
    ]
    if self.search_query != '':
        return self.process_captions(captions, url)
    return ''.join(captions)
def get_captions_from_output(self, output: str) -> str:
    """Flatten WebVTT text into one space-separated string of caption lines.

    Each 'en-US' caption is stringified, its literal ``\\n`` sequences are
    turned into real newlines, and it is passed through
    ``self.remove_time_from_caption``. The concatenated text is then split
    into lines, and any line equal to the most recently kept line is
    dropped (the comparison starts from the empty string, so leading empty
    lines are dropped as well).

    Args:
        output: WebVTT document text to parse.

    Returns:
        The kept lines joined by single spaces.
    """
    parsed = WebVTTReader().read(output)
    pieces = [
        self.remove_time_from_caption(str(caption).replace(r'\n', "\n"))
        for caption in parsed.get_captions('en-US')
    ]

    kept = []
    last_kept = ''
    for line in ''.join(pieces).split("\n"):
        if line == last_kept:
            continue
        kept.append(line)
        last_kept = line
    return ' '.join(kept)
class WebVTTReaderTestCase(unittest.TestCase):
    """Checks WebVTTReader detection, parsing, style removal and errors.

    The sample constants are decoded from UTF-8 before use.
    """

    def setUp(self):
        # A fresh reader for every test.
        self.reader = WebVTTReader()

    def test_positive_answer_for_detection(self):
        """A WebVTT sample is detected as WebVTT."""
        content = SAMPLE_WEBVTT.decode(u'utf-8')
        self.assertTrue(self.reader.detect(content))

    def test_negative_answer_for_detection(self):
        """An SRT sample is not detected as WebVTT."""
        content = SAMPLE_SRT.decode(u'utf-8')
        self.assertFalse(self.reader.detect(content))

    def test_caption_length(self):
        """The second WebVTT sample yields seven 'en-US' captions."""
        caption_set = self.reader.read(SAMPLE_WEBVTT_2.decode(u'utf-8'))
        cues = caption_set.get_captions(u'en-US')
        self.assertEqual(len(cues), 7)

    def test_read_supports_multiple_languages(self):
        """Reading with lang='es' makes captions available under 'es'."""
        content = SAMPLE_WEBVTT.decode(u'utf-8')
        caption_set = self.reader.read(content, lang=u'es')
        self.assertIsNotNone(caption_set.get_captions(u'es'))

    def test_proper_timestamps(self):
        """The third cue has the expected start and end values."""
        caption_set = self.reader.read(SAMPLE_WEBVTT.decode(u'utf-8'))
        third_cue = caption_set.get_captions(u'en-US')[2]
        self.assertEqual(third_cue.start, 17000000)
        self.assertEqual(third_cue.end, 18752000)

    def test_webvtt_cue_components_removed_from_text(self):
        """Cue markup is removed and the voice tag becomes a 'Name:' prefix."""
        raw_cue = (
            u"<c vIntro><b>Wikipedia</b> is a great adventure. <i>It may have "
            u"its shortcomings</i>, but it is<u> the largest</u> collective "
            u"knowledge construction endevour</c> <ruby>base text <rt>"
            u"annotation</rt></ruby> <v Audry><b>Yes</b>, indeed!"
        )
        expected = (
            u"Wikipedia is a great adventure. It may have "
            u"its shortcomings, but it is the largest collective "
            u"knowledge construction endevour base text annotation"
            u" Audry: Yes, indeed!"
        )
        self.assertEqual(self.reader._remove_styles(raw_cue), expected)

    def test_empty_file(self):
        """Reading the empty sample raises CaptionReadNoCaptions."""
        read = WebVTTReader().read
        with self.assertRaises(CaptionReadNoCaptions):
            read(SAMPLE_WEBVTT_EMPTY.decode(u'utf-8'))

    def test_not_ignoring_timing_errors(self):
        """With ignore_timing_errors=False, bad timings raise."""
        strict_read = WebVTTReader(ignore_timing_errors=False).read
        with self.assertRaises(CaptionReadSyntaxError):
            strict_read(
                u"\n"
                u"00:00:20,000 --> 00:00:10,000\n"
                u"foo bar baz"
            )

        strict_read = WebVTTReader(ignore_timing_errors=False).read
        with self.assertRaises(CaptionReadError):
            strict_read(
                u"00:00:20,000 --> 00:00:10,000\n"
                u"Start time is greater than end time.\n"
            )

        strict_read = WebVTTReader(ignore_timing_errors=False).read
        with self.assertRaises(CaptionReadError):
            strict_read(
                u"00:00:20,000 --> 00:00:30,000\n"
                u"Start times should be consecutive.\n"
                u"\n"
                u"00:00:10,000 --> 00:00:20,000\n"
                u"This cue starts before the previous one.\n"
            )

    def test_ignoring_timing_errors(self):
        """A default reader tolerates timing errors but not a broken stamp."""
        # Even if timing errors are ignored, this is worse
        read = WebVTTReader().read
        with self.assertRaises(CaptionReadSyntaxError):
            read(
                u"\nNOTE invalid cue stamp\n"
                u"00:00:20,000 --> \n"
                u"foo bar baz\n"
            )

        tolerated_inputs = (
            (u"\n"
             u"00:00:20,000 --> 00:00:10,000\n"
             u"Start time is greater than end time.\n"),
            (u"\n"
             u"00:00:20,000 --> 00:00:30,000\n"
             u"Start times should be consecutive.\n"
             u"\n"
             u"00:00:10,000 --> 00:00:20,000\n"
             u"This cue starts before the previous one.\n"),
        )
        for content in tolerated_inputs:
            try:
                WebVTTReader().read(content)
            except CaptionReadError:
                self.fail(u"Shouldn't raise CaptionReadError")

    def test_invalid_files(self):
        """Cues without text and strict-mode timing errors raise."""
        read = WebVTTReader().read
        with self.assertRaises(CaptionReadSyntaxError):
            read(
                u"\nNOTE Cues without text are invalid.\n"
                u"00:00:20,000 --> 00:00:30,000\n"
                u"\n"
                u"00:00:40,000 --> 00:00:50,000\n"
                u"foo bar baz\n"
            )

        strict_read = WebVTTReader(ignore_timing_errors=False).read
        with self.assertRaises(CaptionReadError):
            strict_read(
                u"00:00:20,000 --> 00:00:10,000\n"
                u"Start time is greater than end time."
            )

        strict_read = WebVTTReader(ignore_timing_errors=False).read
        with self.assertRaises(CaptionReadError):
            strict_read(
                u"00:00:20,000 --> 00:00:30,000\n"
                u"Start times should be consecutive.\n"
                u"\n"
                u"00:00:10,000 --> 00:00:20,000\n"
                u"This cue starts before the previous one.\n"
            )
class WebVTTReaderTestCase(unittest.TestCase):
    """Checks WebVTTReader detection, parsing, style removal and errors.

    The sample constants are decoded from UTF-8 before being passed to the
    reader.
    """

    def setUp(self):
        # A fresh reader for every test.
        self.reader = WebVTTReader()

    def test_positive_answer_for_detection(self):
        """A WebVTT sample is detected as WebVTT."""
        self.assertTrue(self.reader.detect(SAMPLE_WEBVTT.decode(u'utf-8')))

    def test_negative_answer_for_detection(self):
        """An SRT sample is not detected as WebVTT."""
        self.assertFalse(self.reader.detect(SAMPLE_SRT.decode(u'utf-8')))

    def test_caption_length(self):
        """The second WebVTT sample yields seven 'en-US' captions."""
        captions = self.reader.read(SAMPLE_WEBVTT_2.decode(u'utf-8'))
        self.assertEqual(len(captions.get_captions(u'en-US')), 7)

    def test_read_supports_multiple_languages(self):
        """Reading with lang='es' makes captions available under 'es'."""
        captions = self.reader.read(SAMPLE_WEBVTT.decode(u'utf-8'), lang=u'es')
        self.assertIsNotNone(captions.get_captions(u'es'))

    def test_proper_timestamps(self):
        """The third cue has the expected start and end values."""
        captions = self.reader.read(SAMPLE_WEBVTT.decode(u'utf-8'))
        cue = captions.get_captions(u'en-US')[2]
        self.assertEqual(cue.start, 17000000)
        self.assertEqual(cue.end, 18752000)

    def test_webvtt_cue_components_removed_from_text(self):
        """Cue markup is removed and the voice tag becomes a 'Name:' prefix."""
        result = self.reader._remove_styles(
            u"<c vIntro><b>Wikipedia</b> is a great adventure. <i>It may have "
            u"its shortcomings</i>, but it is<u> the largest</u> collective "
            u"knowledge construction endevour</c> <ruby>base text <rt>"
            u"annotation</rt></ruby> <v Audry><b>Yes</b>, indeed!"
        )
        expected = (
            u"Wikipedia is a great adventure. It may have "
            u"its shortcomings, but it is the largest collective "
            u"knowledge construction endevour base text annotation"
            u" Audry: Yes, indeed!"
        )
        self.assertEqual(result, expected)

    def test_empty_file(self):
        """Reading the empty sample raises CaptionReadNoCaptions."""
        self.assertRaises(
            CaptionReadNoCaptions,
            WebVTTReader().read, SAMPLE_WEBVTT_EMPTY.decode(u'utf-8'))

    def test_invalid_files(self):
        """Three malformed inputs each raise the expected read error."""
        # NOTE(review): the line breaks and indentation inside the
        # triple-quoted literals below were restored from a
        # whitespace-collapsed copy of this file -- confirm the exact
        # literal contents against version control.

        # A cue timing line with no cue text after it.
        self.assertRaises(
            CaptionReadSyntaxError,
            WebVTTReader().read,
            u"""
            NOTE Cues without text are invalid.
            00:00:20,000 --> 00:00:10,000
            """
        )
        # A cue whose start time is later than its end time.
        self.assertRaises(
            CaptionReadError,
            WebVTTReader().read,
            u"""
            00:00:20,000 --> 00:00:10,000
            Start time is greater than end time.
            """
        )
        # A second cue that starts before the first one.
        self.assertRaises(
            CaptionReadError,
            WebVTTReader().read,
            u"""
            00:00:20,000 --> 00:00:30,000
            Start times should be consecutive.

            00:00:10,000 --> 00:00:20,000
            This cue starts before the previous one.
            """
        )
class WebVTTReaderTestCase(unittest.TestCase):
    """Checks WebVTTReader detection, parsing, style removal and errors."""

    def setUp(self):
        # A fresh reader for every test.
        self.reader = WebVTTReader()

    def test_positive_answer_for_detection(self):
        """A WebVTT sample is detected as WebVTT."""
        self.assertTrue(self.reader.detect(SAMPLE_WEBVTT))

    def test_negative_answer_for_detection(self):
        """An SRT sample is not detected as WebVTT."""
        self.assertFalse(self.reader.detect(SAMPLE_SRT))

    def test_caption_length(self):
        """The second WebVTT sample yields seven 'en-US' captions."""
        captions = self.reader.read(SAMPLE_WEBVTT_2)
        self.assertEqual(len(captions.get_captions(u'en-US')), 7)

    def test_read_supports_multiple_languages(self):
        """Reading with lang='es' makes captions available under 'es'."""
        captions = self.reader.read(SAMPLE_WEBVTT, lang=u'es')
        self.assertIsNotNone(captions.get_captions(u'es'))

    def test_proper_timestamps(self):
        """The third cue has the expected start and end values."""
        captions = self.reader.read(SAMPLE_WEBVTT)
        cue = captions.get_captions(u'en-US')[2]
        self.assertEqual(cue.start, 17000000)
        self.assertEqual(cue.end, 18752000)

    def test_webvtt_cue_components_removed_from_text(self):
        """Cue markup is removed and the voice tag becomes a 'Name:' prefix."""
        result = self.reader._remove_styles(
            u"<c vIntro><b>Wikipedia</b> is a great adventure. <i>It may have "
            u"its shortcomings</i>, but it is<u> the largest</u> collective "
            u"knowledge construction endevour</c> <ruby>base text <rt>"
            u"annotation</rt></ruby> <v Audry><b>Yes</b>, indeed!"
        )
        expected = (
            u"Wikipedia is a great adventure. It may have "
            u"its shortcomings, but it is the largest collective "
            u"knowledge construction endevour base text annotation"
            u" Audry: Yes, indeed!"
        )
        self.assertEqual(result, expected)

    def test_empty_file(self):
        """Reading the empty sample raises CaptionReadNoCaptions."""
        self.assertRaises(
            CaptionReadNoCaptions,
            WebVTTReader().read, SAMPLE_WEBVTT_EMPTY)

    def test_not_ignoring_timing_errors(self):
        """With ignore_timing_errors=False, bad timings raise CaptionReadError."""
        self.assertRaises(
            CaptionReadError,
            WebVTTReader(ignore_timing_errors=False).read,
            (u"\n"
             u"00:00:20.000 --> 00:00:10.000\n"
             u"foo bar baz")
        )
        self.assertRaises(
            CaptionReadError,
            WebVTTReader(ignore_timing_errors=False).read,
            (u"00:00:20.000 --> 00:00:10.000\n"
             u"Start time is greater than end time.\n")
        )
        self.assertRaises(
            CaptionReadError,
            WebVTTReader(ignore_timing_errors=False).read,
            (u"00:00:20.000 --> 00:00:30.000\n"
             u"Start times should be consecutive.\n"
             u"\n"
             u"00:00:10.000 --> 00:00:20.000\n"
             u"This cue starts before the previous one.\n")
        )

    def test_ignoring_timing_errors(self):
        """A default reader tolerates timing errors but not syntax errors."""
        # Even if timing errors are ignored, this has to raise an exception
        self.assertRaises(
            CaptionReadSyntaxError,
            WebVTTReader().read,
            (u"\nNOTE invalid cue stamp\n"
             u"00:00:20.000 --> \n"
             u"foo bar baz\n")
        )
        # And this too
        self.assertRaises(
            CaptionReadSyntaxError,
            WebVTTReader().read,
            (u"\n00:00:20,000 --> 00:00:22,000\n"
             u"Note the comma instead of point.\n")
        )

        # Timing problems alone must not raise with the default reader.
        try:
            WebVTTReader().read(
                (u"\n"
                 u"00:00:20.000 --> 00:00:10.000\n"
                 u"Start time is greater than end time.\n")
            )
        except CaptionReadError:
            self.fail(u"Shouldn't raise CaptionReadError")

        try:
            WebVTTReader().read(
                (u"\n"
                 u"00:00:20.000 --> 00:00:30.000\n"
                 u"Start times should be consecutive.\n"
                 u"\n"
                 u"00:00:10.000 --> 00:00:20.000\n"
                 u"This cue starts before the previous one.\n")
            )
        except CaptionReadError:
            self.fail(u"Shouldn't raise CaptionReadError")

    def test_invalid_files(self):
        """Cues without text and strict-mode timing errors raise."""
        self.assertRaises(
            CaptionReadSyntaxError,
            WebVTTReader().read,
            (u"\nNOTE Cues without text are invalid.\n"
             u"00:00:20.000 --> 00:00:30.000\n"
             u"\n"
             u"00:00:40.000 --> 00:00:50.000\n"
             u"foo bar baz\n")
        )
        self.assertRaises(
            CaptionReadError,
            WebVTTReader(ignore_timing_errors=False).read,
            (u"00:00:20.000 --> 00:00:10.000\n"
             u"Start time is greater than end time.")
        )
        self.assertRaises(
            CaptionReadError,
            WebVTTReader(ignore_timing_errors=False).read,
            (u"00:00:20.000 --> 00:00:30.000\n"
             u"Start times should be consecutive.\n"
             u"\n"
             u"00:00:10.000 --> 00:00:20.000\n"
             u"This cue starts before the previous one.\n")
        )

    def test_zero_start(self):
        """The first cue of the zero-start sample starts at 0."""
        captions = self.reader.read(SAMPLE_WEBVTT_LAST_CUE_ZERO_START)
        cue = captions.get_captions(u'en-US')[0]
        # assertEqual, not the deprecated alias assertEquals, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(cue.start, 0)
class TestWebVTTReader:
    """pytest checks for WebVTTReader; samples arrive as fixtures."""

    def setup_method(self):
        # A fresh reader for every test.
        self.reader = WebVTTReader()

    def test_positive_answer_for_detection(self, sample_webvtt):
        """A WebVTT sample is detected as WebVTT."""
        detected = self.reader.detect(sample_webvtt)
        assert detected is True

    def test_negative_answer_for_detection(self, sample_srt):
        """An SRT sample is not detected as WebVTT."""
        detected = self.reader.detect(sample_srt)
        assert detected is False

    def test_caption_length(self, sample_webvtt_2):
        """The second WebVTT sample yields seven 'en-US' captions."""
        cues = self.reader.read(sample_webvtt_2).get_captions('en-US')
        assert len(cues) == 7

    def test_read_supports_multiple_languages(self, sample_webvtt):
        """Reading with lang='es' makes captions available under 'es'."""
        caption_set = self.reader.read(sample_webvtt, lang='es')
        assert caption_set.get_captions('es') is not None

    def test_proper_timestamps(self, sample_webvtt):
        """The third cue has the expected start and end values."""
        third_cue = self.reader.read(sample_webvtt).get_captions('en-US')[2]
        assert third_cue.start == 17000000
        assert third_cue.end == 18752000

    def test_webvtt_cue_components_removed_from_text(self):
        """Cue markup is removed and the voice tag becomes a 'Name:' prefix."""
        raw_cue = (
            "<c vIntro><b>Wikipedia</b> is a great adventure. <i>It may have "
            "its shortcomings</i>, but it is<u> the largest</u> collective "
            "knowledge construction endevour</c> <ruby>base text <rt>"
            "annotation</rt></ruby> <v Audry><b>Yes</b>, indeed!"
        )
        expected = (
            "Wikipedia is a great adventure. It may have "
            "its shortcomings, but it is the largest collective "
            "knowledge construction endevour base text annotation"
            " Audry: Yes, indeed!"
        )
        assert self.reader._remove_styles(raw_cue) == expected

    def test_empty_file(self, sample_webvtt_empty):
        """Reading the empty sample raises CaptionReadNoCaptions."""
        with pytest.raises(CaptionReadNoCaptions):
            WebVTTReader().read(sample_webvtt_empty)

    def test_not_ignoring_timing_errors(self):
        """With ignore_timing_errors=False, bad timings raise CaptionReadError."""
        # TODO: same assert with different arguments -> can be parametrized
        end_before_start_no_newline = (
            "\n"
            "00:00:20.000 --> 00:00:10.000\n"
            "foo bar baz"
        )
        end_before_start = (
            "00:00:20.000 --> 00:00:10.000\n"
            "Start time is greater than end time.\n"
        )
        out_of_order = (
            "00:00:20.000 --> 00:00:30.000\n"
            "Start times should be consecutive.\n"
            "\n"
            "00:00:10.000 --> 00:00:20.000\n"
            "This cue starts before the previous one.\n"
        )
        with pytest.raises(CaptionReadError):
            WebVTTReader(ignore_timing_errors=False).read(
                end_before_start_no_newline)
        with pytest.raises(CaptionReadError):
            WebVTTReader(ignore_timing_errors=False).read(end_before_start)
        with pytest.raises(CaptionReadError):
            WebVTTReader(ignore_timing_errors=False).read(out_of_order)

    def test_ignoring_timing_errors(self):
        """A default reader tolerates timing errors but not syntax errors."""
        # Even if timing errors are ignored, this has to raise an exception
        broken_stamp = (
            "\nNOTE invalid cue stamp\n00:00:20.000 --> \nfoo bar baz\n")
        with pytest.raises(CaptionReadSyntaxError):
            WebVTTReader().read(broken_stamp)

        # And this too
        comma_stamp = (
            "\n00:00:20,000 --> 00:00:22,000\n"
            "Note the comma instead of point.\n"
        )
        with pytest.raises(CaptionReadSyntaxError):
            WebVTTReader().read(comma_stamp)

        # TODO: at this point it can be split into 2 separate tests
        end_before_start = (
            "\n"
            "00:00:20.000 --> 00:00:10.000\n"
            "Start time is greater than end time.\n"
        )
        try:
            WebVTTReader().read(end_before_start)
        except CaptionReadError:
            pytest.fail("Shouldn't raise CaptionReadError")

        out_of_order = (
            "\n"
            "00:00:20.000 --> 00:00:30.000\n"
            "Start times should be consecutive.\n"
            "\n"
            "00:00:10.000 --> 00:00:20.000\n"
            "This cue starts before the previous one.\n"
        )
        try:
            WebVTTReader().read(out_of_order)
        except CaptionReadError:
            pytest.fail("Shouldn't raise CaptionReadError")

    def test_invalid_files(self):
        """Strict-mode timing errors raise CaptionReadError."""
        end_before_start = (
            "00:00:20.000 --> 00:00:10.000\n"
            "Start time is greater than end time."
        )
        out_of_order = (
            "00:00:20.000 --> 00:00:30.000\n"
            "Start times should be consecutive.\n"
            "\n"
            "00:00:10.000 --> 00:00:20.000\n"
            "This cue starts before the previous one.\n"
        )
        with pytest.raises(CaptionReadError):
            WebVTTReader(ignore_timing_errors=False).read(end_before_start)
        with pytest.raises(CaptionReadError):
            WebVTTReader(ignore_timing_errors=False).read(out_of_order)

    def test_zero_start(self, sample_webvtt_last_cue_zero_start):
        """The first cue of the zero-start sample starts at 0."""
        caption_set = self.reader.read(sample_webvtt_last_cue_zero_start)
        first_cue = caption_set.get_captions('en-US')[0]
        assert first_cue.start == 0

    def test_webvtt_empty_cue(self, sample_webvtt_empty_cue):
        """The empty-cue sample yields exactly one 'en-US' caption."""
        caption_set = self.reader.read(sample_webvtt_empty_cue)
        assert len(caption_set.get_captions('en-US')) == 1