def get_subs(vtt_subs_path):
    # Load a WebVTT file and return its captions as a list of plain dicts.
    #
    # vtt_subs_path: path of the WebVTT file to read.
    #
    # Returns a list of {"text", "start", "end"} dicts, one per caption of
    # the first language reported by the parsed caption set.  "start" and
    # "end" are the caption values divided by 1,000,000 (presumably
    # microseconds converted to seconds -- confirm against the reader).
    reader = WebVTTReader()

    # Read raw bytes: only bytes can be decoded, and binary mode behaves the
    # same on Python 2 and Python 3 (text mode + .decode() fails on Python 3).
    with open(vtt_subs_path, 'rb') as f:
        text = f.read().decode(utils.get_file_encoding(vtt_subs_path))

    vtt = reader.read(text)
    vttsubs = vtt.get_captions(vtt.get_languages()[0])

    # Single-argument print(...) is valid as a statement on Python 2 and as a
    # function call on Python 3, and emits the same text on both.
    print("vttsubs total: %i " % len(vttsubs))

    # Debug dump of the first caption; guarded so that an empty caption list
    # no longer raises IndexError.
    if vttsubs:
        print(vttsubs[0].start)
        print(vttsubs[0].end)
        print(vttsubs[0].get_text())

    return [
        {
            "text": s.get_text(),
            "start": float(s.start) / 1000000,
            "end": float(s.end) / 1000000,
        }
        for s in vttsubs
    ]
Beispiel #2
0
    def get_captions_from_output(self,
                                 output: str,
                                 language: str = 'en') -> 'pd.DataFrame':
        """Convert WebVTT text into a table of caption timings and contents.

        Args:
            output: WebVTT document text handed to ``WebVTTReader.read``.
            language: language code used both when reading and when
                selecting the captions to export.

        Returns:
            A DataFrame with the columns ``start``, ``end`` and ``content``,
            one row per caption.
        """
        reader = WebVTTReader()

        starts, ends, contents = [], [], []
        for caption in reader.read(output, language).get_captions(language):
            caption_repr = str(caption)
            # Keep what follows the last literal backslash-n sequence of the
            # caption's string form, with single quotes removed.
            content = caption_repr.split("\\n")[-1].replace("'", '')
            timestamp = self.get_time_from_caption(caption_repr)

            # Collect the columns directly.  The previous comma join followed
            # by a comma split broke (IndexError) as soon as a caption's text
            # itself contained a comma.
            starts.append(timestamp[0])
            ends.append(timestamp[1])
            contents.append(content)

        return pd.DataFrame({'start': starts, 'end': ends, 'content': contents})
Beispiel #3
0
class WebVTTReaderTestCase(unittest.TestCase):
    # Exercises WebVTTReader: format detection, parsing and text cleaning.

    def setUp(self):
        # Every test starts from its own reader instance.
        self.reader = WebVTTReader()

    def test_positive_answer_for_detection(self):
        detected = self.reader.detect(SAMPLE_WEBVTT)
        self.assertTrue(detected)

    def test_negative_answer_for_detection(self):
        detected = self.reader.detect(SAMPLE_SRT)
        self.assertFalse(detected)

    def test_caption_length(self):
        caption_set = self.reader.read(SAMPLE_WEBVTT)
        cues = caption_set.get_captions('en-US')
        self.assertEqual(len(cues), 7)

    def test_read_supports_multiple_languages(self):
        caption_set = self.reader.read(SAMPLE_WEBVTT, lang='es')
        cues = caption_set.get_captions('es')
        self.assertIsNotNone(cues)

    def test_proper_timestamps(self):
        caption_set = self.reader.read(SAMPLE_WEBVTT)
        third_cue = caption_set.get_captions('en-US')[2]
        self.assertEqual(third_cue.start, 17000000)
        self.assertEqual(third_cue.end, 18752000)

    def test_webvtt_cue_components_removed_from_text(self):
        styled_text = (
            "\n"  # the first line is skipped by the cleaner
            "<c vIntro><b>Wikipedia</b> is a great adventure. <i>It may have "
            "its shortcomings</i>, but it is<u> the largest</u> collective "
            "knowledge construction endevour</c> <ruby>base text <rt>"
            "annotation</rt></ruby> <v Audry><b>Yes</b>, indeed!"
        )
        result = self.reader._clean(styled_text)
        expected = (
            "Wikipedia is a great adventure. It may have "
            "its shortcomings, but it is the largest collective "
            "knowledge construction endevour base text annotation"
            " Audry: Yes, indeed!"
        )
        self.assertEqual(result, expected)

    def test_empty_file(self):
        # Reading a document without captions must be reported as an error.
        read = WebVTTReader().read
        with self.assertRaises(CaptionReadNoCaptions):
            read(SAMPLE_WEBVTT_EMPTY)
Beispiel #4
0
    def get_captions_from_output(self, output: str, url: str) -> str:
        """Extract the 'en-US' captions of a WebVTT document as text lines.

        Each caption's string form has its literal backslash-n sequences
        replaced by spaces, is passed through ``remove_time_from_caption``
        together with ``url`` and is terminated with a newline.  With an
        empty ``search_query`` the lines are returned concatenated;
        otherwise they are handed to ``process_captions``.
        """
        caption_set = WebVTTReader().read(output)

        lines = [
            self.remove_time_from_caption(
                url, str(entry).replace(r'\n', " ")) + "\n"
            for entry in caption_set.get_captions('en-US')
        ]

        if self.search_query != '':
            return self.process_captions(lines, url)

        return ''.join(lines)
Beispiel #5
0
    def get_captions_from_output(self, output: str) -> str:
        """Flatten the 'en-US' captions of a WebVTT document into one line.

        The caption texts (time information removed by
        ``remove_time_from_caption``) are concatenated and split into lines;
        a line equal to the one immediately before it is dropped, and the
        remaining lines are joined with single spaces.
        """
        reader = WebVTTReader()

        pieces = [
            self.remove_time_from_caption(str(entry).replace(r'\n', "\n"))
            for entry in reader.read(output).get_captions('en-US')
        ]

        kept = []
        last_seen = ''  # starting empty also discards leading blank lines
        for row in ''.join(pieces).split("\n"):
            if row != last_seen:
                kept.append(row)
            last_seen = row

        return ' '.join(kept)
Beispiel #6
0
class WebVTTReaderTestCase(unittest.TestCase):
    # Exercises WebVTTReader; the sample constants are decoded from UTF-8
    # before being handed to the reader.

    def setUp(self):
        # Every test starts from its own reader instance.
        self.reader = WebVTTReader()

    def test_positive_answer_for_detection(self):
        detected = self.reader.detect(SAMPLE_WEBVTT.decode(u'utf-8'))
        self.assertTrue(detected)

    def test_negative_answer_for_detection(self):
        detected = self.reader.detect(SAMPLE_SRT.decode(u'utf-8'))
        self.assertFalse(detected)

    def test_caption_length(self):
        caption_set = self.reader.read(SAMPLE_WEBVTT_2.decode(u'utf-8'))
        cues = caption_set.get_captions(u'en-US')
        self.assertEqual(len(cues), 7)

    def test_read_supports_multiple_languages(self):
        caption_set = self.reader.read(
            SAMPLE_WEBVTT.decode(u'utf-8'), lang=u'es')
        cues = caption_set.get_captions(u'es')
        self.assertIsNotNone(cues)

    def test_proper_timestamps(self):
        caption_set = self.reader.read(SAMPLE_WEBVTT.decode(u'utf-8'))
        third_cue = caption_set.get_captions(u'en-US')[2]
        self.assertEqual(third_cue.start, 17000000)
        self.assertEqual(third_cue.end, 18752000)

    def test_webvtt_cue_components_removed_from_text(self):
        styled_text = (
            u"<c vIntro><b>Wikipedia</b> is a great adventure. <i>It may have "
            u"its shortcomings</i>, but it is<u> the largest</u> collective "
            u"knowledge construction endevour</c> <ruby>base text <rt>"
            u"annotation</rt></ruby> <v Audry><b>Yes</b>, indeed!"
        )
        result = self.reader._remove_styles(styled_text)
        expected = (
            u"Wikipedia is a great adventure. It may have "
            u"its shortcomings, but it is the largest collective "
            u"knowledge construction endevour base text annotation"
            u" Audry: Yes, indeed!"
        )
        self.assertEqual(result, expected)

    def test_empty_file(self):
        # Reading a document without captions must be reported as an error.
        read = WebVTTReader().read
        empty_text = SAMPLE_WEBVTT_EMPTY.decode(u'utf-8')
        with self.assertRaises(CaptionReadNoCaptions):
            read(empty_text)

    def test_not_ignoring_timing_errors(self):
        # With ignore_timing_errors=False every input below must be rejected.
        read = WebVTTReader(ignore_timing_errors=False).read
        no_header_text = (u"\n"
                          u"00:00:20,000 --> 00:00:10,000\n"
                          u"foo bar baz")
        with self.assertRaises(CaptionReadSyntaxError):
            read(no_header_text)

        read = WebVTTReader(ignore_timing_errors=False).read
        reversed_times = (u"00:00:20,000 --> 00:00:10,000\n"
                          u"Start time is greater than end time.\n")
        with self.assertRaises(CaptionReadError):
            read(reversed_times)

        read = WebVTTReader(ignore_timing_errors=False).read
        out_of_order = (u"00:00:20,000 --> 00:00:30,000\n"
                        u"Start times should be consecutive.\n"
                        u"\n"
                        u"00:00:10,000 --> 00:00:20,000\n"
                        u"This cue starts before the previous one.\n")
        with self.assertRaises(CaptionReadError):
            read(out_of_order)

    def test_ignoring_timing_errors(self):
        # Even if timing errors are ignored, this is worse
        read = WebVTTReader().read
        broken_stamp = (u"\nNOTE invalid cue stamp\n"
                        u"00:00:20,000 --> \n"
                        u"foo bar baz\n")
        with self.assertRaises(CaptionReadSyntaxError):
            read(broken_stamp)

        # Pure timing problems must be tolerated by a default reader.
        reversed_times = (u"\n"
                          u"00:00:20,000 --> 00:00:10,000\n"
                          u"Start time is greater than end time.\n")
        try:
            WebVTTReader().read(reversed_times)
        except CaptionReadError:
            self.fail(u"Shouldn't raise CaptionReadError")

        out_of_order = (u"\n"
                        u"00:00:20,000 --> 00:00:30,000\n"
                        u"Start times should be consecutive.\n"
                        u"\n"
                        u"00:00:10,000 --> 00:00:20,000\n"
                        u"This cue starts before the previous one.\n")
        try:
            WebVTTReader().read(out_of_order)
        except CaptionReadError:
            self.fail(u"Shouldn't raise CaptionReadError")

    def test_invalid_files(self):
        read = WebVTTReader().read
        textless_cue = (u"\nNOTE Cues without text are invalid.\n"
                        u"00:00:20,000 --> 00:00:30,000\n"
                        u"\n"
                        u"00:00:40,000 --> 00:00:50,000\n"
                        u"foo bar baz\n")
        with self.assertRaises(CaptionReadSyntaxError):
            read(textless_cue)

        read = WebVTTReader(ignore_timing_errors=False).read
        reversed_times = (u"00:00:20,000 --> 00:00:10,000\n"
                          u"Start time is greater than end time.")
        with self.assertRaises(CaptionReadError):
            read(reversed_times)

        read = WebVTTReader(ignore_timing_errors=False).read
        out_of_order = (u"00:00:20,000 --> 00:00:30,000\n"
                        u"Start times should be consecutive.\n"
                        u"\n"
                        u"00:00:10,000 --> 00:00:20,000\n"
                        u"This cue starts before the previous one.\n")
        with self.assertRaises(CaptionReadError):
            read(out_of_order)
Beispiel #7
0
class WebVTTReaderTestCase(unittest.TestCase):
    # Exercises WebVTTReader; the sample constants are decoded from UTF-8
    # before being handed to the reader.

    def setUp(self):
        # Every test starts from its own reader instance.
        self.reader = WebVTTReader()

    def test_positive_answer_for_detection(self):
        detected = self.reader.detect(SAMPLE_WEBVTT.decode(u'utf-8'))
        self.assertTrue(detected)

    def test_negative_answer_for_detection(self):
        detected = self.reader.detect(SAMPLE_SRT.decode(u'utf-8'))
        self.assertFalse(detected)

    def test_caption_length(self):
        caption_set = self.reader.read(SAMPLE_WEBVTT_2.decode(u'utf-8'))
        cues = caption_set.get_captions(u'en-US')
        self.assertEqual(len(cues), 7)

    def test_read_supports_multiple_languages(self):
        caption_set = self.reader.read(
            SAMPLE_WEBVTT.decode(u'utf-8'), lang=u'es')
        cues = caption_set.get_captions(u'es')
        self.assertIsNotNone(cues)

    def test_proper_timestamps(self):
        caption_set = self.reader.read(SAMPLE_WEBVTT.decode(u'utf-8'))
        third_cue = caption_set.get_captions(u'en-US')[2]
        self.assertEqual(third_cue.start, 17000000)
        self.assertEqual(third_cue.end, 18752000)

    def test_webvtt_cue_components_removed_from_text(self):
        styled_text = (
            u"<c vIntro><b>Wikipedia</b> is a great adventure. <i>It may have "
            u"its shortcomings</i>, but it is<u> the largest</u> collective "
            u"knowledge construction endevour</c> <ruby>base text <rt>"
            u"annotation</rt></ruby> <v Audry><b>Yes</b>, indeed!"
        )
        result = self.reader._remove_styles(styled_text)
        expected = (
            u"Wikipedia is a great adventure. It may have "
            u"its shortcomings, but it is the largest collective "
            u"knowledge construction endevour base text annotation"
            u" Audry: Yes, indeed!"
        )
        self.assertEqual(result, expected)

    def test_empty_file(self):
        # Reading a document without captions must be reported as an error.
        read = WebVTTReader().read
        empty_text = SAMPLE_WEBVTT_EMPTY.decode(u'utf-8')
        with self.assertRaises(CaptionReadNoCaptions):
            read(empty_text)

    def test_invalid_files(self):
        # The literals keep their in-source indentation on purpose: it is
        # part of the text handed to the reader.
        read = WebVTTReader().read
        textless_cue = u"""
            NOTE Cues without text are invalid.

            00:00:20,000 --> 00:00:10,000
            """
        with self.assertRaises(CaptionReadSyntaxError):
            read(textless_cue)

        read = WebVTTReader().read
        reversed_times = u"""
            00:00:20,000 --> 00:00:10,000
            Start time is greater than end time.
            """
        with self.assertRaises(CaptionReadError):
            read(reversed_times)

        read = WebVTTReader().read
        out_of_order = u"""
            00:00:20,000 --> 00:00:30,000
            Start times should be consecutive.

            00:00:10,000 --> 00:00:20,000
            This cue starts before the previous one.
            """
        with self.assertRaises(CaptionReadError):
            read(out_of_order)
Beispiel #8
0
class WebVTTReaderTestCase(unittest.TestCase):
    # Exercises WebVTTReader: format detection, caption parsing, style
    # removal and the handling of timing and syntax errors.

    def setUp(self):
        # Every test starts from its own reader instance.
        self.reader = WebVTTReader()

    def test_positive_answer_for_detection(self):
        self.assertTrue(self.reader.detect(SAMPLE_WEBVTT))

    def test_negative_answer_for_detection(self):
        self.assertFalse(self.reader.detect(SAMPLE_SRT))

    def test_caption_length(self):
        captions = self.reader.read(SAMPLE_WEBVTT_2)
        self.assertEqual(len(captions.get_captions(u'en-US')), 7)

    def test_read_supports_multiple_languages(self):
        captions = self.reader.read(SAMPLE_WEBVTT, lang=u'es')
        self.assertIsNotNone(captions.get_captions(u'es'))

    def test_proper_timestamps(self):
        captions = self.reader.read(SAMPLE_WEBVTT)
        cue = captions.get_captions(u'en-US')[2]
        self.assertEqual(cue.start, 17000000)
        self.assertEqual(cue.end, 18752000)

    def test_webvtt_cue_components_removed_from_text(self):
        # Cue markup must be stripped; the <v Audry> voice tag is expected
        # to come out as the "Audry: " prefix.
        result = self.reader._remove_styles(
            u"<c vIntro><b>Wikipedia</b> is a great adventure. <i>It may have "
            u"its shortcomings</i>, but it is<u> the largest</u> collective "
            u"knowledge construction endevour</c> <ruby>base text <rt>"
            u"annotation</rt></ruby> <v Audry><b>Yes</b>, indeed!"
        )
        expected = (
            u"Wikipedia is a great adventure. It may have "
            u"its shortcomings, but it is the largest collective "
            u"knowledge construction endevour base text annotation"
            u" Audry: Yes, indeed!"
        )
        self.assertEqual(result, expected)

    def test_empty_file(self):
        self.assertRaises(
            CaptionReadNoCaptions,
            WebVTTReader().read, SAMPLE_WEBVTT_EMPTY)

    def test_not_ignoring_timing_errors(self):
        # With ignore_timing_errors=False every input below must be rejected.
        self.assertRaises(
            CaptionReadError,
            WebVTTReader(ignore_timing_errors=False).read,
            (u"\n"
             u"00:00:20.000 --> 00:00:10.000\n"
             u"foo bar baz")
        )

        self.assertRaises(
            CaptionReadError,
            WebVTTReader(ignore_timing_errors=False).read,
            (u"00:00:20.000 --> 00:00:10.000\n"
             u"Start time is greater than end time.\n")
        )

        self.assertRaises(
            CaptionReadError,
            WebVTTReader(ignore_timing_errors=False).read,
            (u"00:00:20.000 --> 00:00:30.000\n"
             u"Start times should be consecutive.\n"
             u"\n"
             u"00:00:10.000 --> 00:00:20.000\n"
             u"This cue starts before the previous one.\n")
        )

    def test_ignoring_timing_errors(self):
        # Even if timing errors are ignored, this has to raise an exception
        self.assertRaises(
            CaptionReadSyntaxError,
            WebVTTReader().read,
            (u"\nNOTE invalid cue stamp\n"
             u"00:00:20.000 --> \n"
             u"foo bar baz\n")
        )

        # And this too
        self.assertRaises(
            CaptionReadSyntaxError,
            WebVTTReader().read,
            (u"\n00:00:20,000 --> 00:00:22,000\n"
             u"Note the comma instead of point.\n")
        )

        # Pure timing problems, on the other hand, must be tolerated by a
        # default reader.
        try:
            WebVTTReader().read(
                (u"\n"
                 u"00:00:20.000 --> 00:00:10.000\n"
                 u"Start time is greater than end time.\n")
            )
        except CaptionReadError:
            self.fail(u"Shouldn't raise CaptionReadError")

        try:
            WebVTTReader().read(
                (u"\n"
                 u"00:00:20.000 --> 00:00:30.000\n"
                 u"Start times should be consecutive.\n"
                 u"\n"
                 u"00:00:10.000 --> 00:00:20.000\n"
                 u"This cue starts before the previous one.\n")
            )
        except CaptionReadError:
            self.fail(u"Shouldn't raise CaptionReadError")

    def test_invalid_files(self):
        self.assertRaises(
            CaptionReadSyntaxError,
            WebVTTReader().read,
            (u"\nNOTE Cues without text are invalid.\n"
             u"00:00:20.000 --> 00:00:30.000\n"
             u"\n"
             u"00:00:40.000 --> 00:00:50.000\n"
             u"foo bar baz\n")
        )

        self.assertRaises(
            CaptionReadError,
            WebVTTReader(ignore_timing_errors=False).read,
            (u"00:00:20.000 --> 00:00:10.000\n"
             u"Start time is greater than end time.")
        )

        self.assertRaises(
            CaptionReadError,
            WebVTTReader(ignore_timing_errors=False).read,
            (u"00:00:20.000 --> 00:00:30.000\n"
             u"Start times should be consecutive.\n"
             u"\n"
             u"00:00:10.000 --> 00:00:20.000\n"
             u"This cue starts before the previous one.\n")
        )

    def test_zero_start(self):
        captions = self.reader.read(SAMPLE_WEBVTT_LAST_CUE_ZERO_START)
        cue = captions.get_captions(u'en-US')[0]
        # assertEqual, not the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(cue.start, 0)
Beispiel #9
0
class TestWebVTTReader:
    # Pytest suite for WebVTTReader; sample documents arrive as fixtures.

    def setup_method(self):
        # Every test starts from its own reader instance.
        self.reader = WebVTTReader()

    def test_positive_answer_for_detection(self, sample_webvtt):
        detected = self.reader.detect(sample_webvtt)
        assert detected is True

    def test_negative_answer_for_detection(self, sample_srt):
        detected = self.reader.detect(sample_srt)
        assert detected is False

    def test_caption_length(self, sample_webvtt_2):
        caption_set = self.reader.read(sample_webvtt_2)
        cues = caption_set.get_captions('en-US')

        assert len(cues) == 7

    def test_read_supports_multiple_languages(self, sample_webvtt):
        caption_set = self.reader.read(sample_webvtt, lang='es')
        cues = caption_set.get_captions('es')

        assert cues is not None

    def test_proper_timestamps(self, sample_webvtt):
        caption_set = self.reader.read(sample_webvtt)
        third_cue = caption_set.get_captions('en-US')[2]

        assert third_cue.start == 17000000
        assert third_cue.end == 18752000

    def test_webvtt_cue_components_removed_from_text(self):
        styled_text = (
            "<c vIntro><b>Wikipedia</b> is a great adventure. <i>It may have "
            "its shortcomings</i>, but it is<u> the largest</u> collective "
            "knowledge construction endevour</c> <ruby>base text <rt>"
            "annotation</rt></ruby> <v Audry><b>Yes</b>, indeed!"
        )
        result = self.reader._remove_styles(styled_text)
        expected = (
            "Wikipedia is a great adventure. It may have "
            "its shortcomings, but it is the largest collective "
            "knowledge construction endevour base text annotation"
            " Audry: Yes, indeed!"
        )
        assert result == expected

    def test_empty_file(self, sample_webvtt_empty):
        # Reading a document without captions must be reported as an error.
        with pytest.raises(CaptionReadNoCaptions):
            WebVTTReader().read(sample_webvtt_empty)

    def test_not_ignoring_timing_errors(self):
        # todo: same assert w/ different arguments -> this can be parametrized;
        no_header_text = (
            "\n"
            "00:00:20.000 --> 00:00:10.000\n"
            "foo bar baz"
        )
        with pytest.raises(CaptionReadError):
            WebVTTReader(ignore_timing_errors=False).read(no_header_text)

        reversed_times = (
            "00:00:20.000 --> 00:00:10.000\n"
            "Start time is greater than end time.\n"
        )
        with pytest.raises(CaptionReadError):
            WebVTTReader(ignore_timing_errors=False).read(reversed_times)

        out_of_order = (
            "00:00:20.000 --> 00:00:30.000\n"
            "Start times should be consecutive.\n"
            "\n"
            "00:00:10.000 --> 00:00:20.000\n"
            "This cue starts before the previous one.\n"
        )
        with pytest.raises(CaptionReadError):
            WebVTTReader(ignore_timing_errors=False).read(out_of_order)

    def test_ignoring_timing_errors(self):
        # Even if timing errors are ignored, this has to raise an exception
        broken_stamp = (
            "\nNOTE invalid cue stamp\n00:00:20.000 --> \nfoo bar baz\n"
        )
        with pytest.raises(CaptionReadSyntaxError):
            WebVTTReader().read(broken_stamp)

        # And this too
        comma_stamp = (
            "\n00:00:20,000 --> 00:00:22,000\n"
            "Note the comma instead of point.\n"
        )
        with pytest.raises(CaptionReadSyntaxError):
            WebVTTReader().read(comma_stamp)

        # todo: at this point it can be split into 2 separate tests
        reversed_times = (
            "\n"
            "00:00:20.000 --> 00:00:10.000\n"
            "Start time is greater than end time.\n"
        )
        try:
            WebVTTReader().read(reversed_times)
        except CaptionReadError:
            pytest.fail("Shouldn't raise CaptionReadError")

        out_of_order = (
            "\n"
            "00:00:20.000 --> 00:00:30.000\n"
            "Start times should be consecutive.\n"
            "\n"
            "00:00:10.000 --> 00:00:20.000\n"
            "This cue starts before the previous one.\n"
        )
        try:
            WebVTTReader().read(out_of_order)
        except CaptionReadError:
            pytest.fail("Shouldn't raise CaptionReadError")

    def test_invalid_files(self):
        reversed_times = (
            "00:00:20.000 --> 00:00:10.000\n"
            "Start time is greater than end time."
        )
        with pytest.raises(CaptionReadError):
            WebVTTReader(ignore_timing_errors=False).read(reversed_times)

        out_of_order = (
            "00:00:20.000 --> 00:00:30.000\n"
            "Start times should be consecutive.\n"
            "\n"
            "00:00:10.000 --> 00:00:20.000\n"
            "This cue starts before the previous one.\n"
        )
        with pytest.raises(CaptionReadError):
            WebVTTReader(ignore_timing_errors=False).read(out_of_order)

    def test_zero_start(self, sample_webvtt_last_cue_zero_start):
        caption_set = self.reader.read(sample_webvtt_last_cue_zero_start)
        first_cue = caption_set.get_captions('en-US')[0]

        assert first_cue.start == 0

    def test_webvtt_empty_cue(self, sample_webvtt_empty_cue):
        caption_set = self.reader.read(sample_webvtt_empty_cue)

        assert 1 == len(caption_set.get_captions('en-US'))