Ejemplo n.º 1
0
def vtt_to_df(fn):
    """
    Convert a WebVTT subtitle file to a DataFrame.

    args:
        fn - filepath to .vtt-file

    returns:
        DataFrame with one row per 'en-US' caption and the columns
        'time' (cue start rendered like '0h00m20s'), 'start' (cue start
        in whole seconds), 'duration' ((end - start) / 100000) and 'text'
    """

    with open(fn) as f:
        text = f.read()

    vtt = WebVTTReader().read(text)

    # strptime() places a bare time of day on 1900-01-01, so subtracting
    # that date leaves the offset from the start of the file.
    day_zero = dt.datetime(1900, 1, 1)

    subtitles = []
    for caption in vtt.get_captions('en-US'):
        started = dt.datetime.strptime(caption.format_start(), '%H:%M:%S.%f')
        subtitles.append({
            # Built by hand: the earlier strftime pattern used '%m' (month)
            # for the minutes and the glibc-only '%-H' for the hours.
            'time': '{}h{:02d}m{:02d}s'.format(
                started.hour, started.minute, started.second),
            'start': int((started - day_zero).total_seconds()),
            # NOTE(review): if caption.start/end are microseconds this
            # divisor yields tenths of a second, not seconds - confirm the
            # intended unit before changing it.
            'duration': (caption.end - caption.start) / 100000,
            'text': caption.get_text(),
        })

    return pd.DataFrame(subtitles)
Ejemplo n.º 2
0
    def test_invalid_files(self):
        """Expect read() to raise for a text-less cue and for badly timed cues."""
        # The first cue stamp is followed directly by a blank line.
        textless_cue = (
            u"\nNOTE Cues without text are invalid.\n"
            u"00:00:20,000 --> 00:00:30,000\n"
            u"\n"
            u"00:00:40,000 --> 00:00:50,000\n"
            u"foo bar baz\n"
        )
        default_reader = WebVTTReader()
        with self.assertRaises(CaptionReadSyntaxError):
            default_reader.read(textless_cue)

        backwards_cue = (
            u"00:00:20,000 --> 00:00:10,000\n"
            u"Start time is greater than end time."
        )
        strict_reader = WebVTTReader(ignore_timing_errors=False)
        with self.assertRaises(CaptionReadError):
            strict_reader.read(backwards_cue)

        unordered_cues = (
            u"00:00:20,000 --> 00:00:30,000\n"
            u"Start times should be consecutive.\n"
            u"\n"
            u"00:00:10,000 --> 00:00:20,000\n"
            u"This cue starts before the previous one.\n"
        )
        strict_reader = WebVTTReader(ignore_timing_errors=False)
        with self.assertRaises(CaptionReadError):
            strict_reader.read(unordered_cues)
Ejemplo n.º 3
0
    def test_ignoring_timing_errors(self):
        """Expect syntax errors to raise while timing errors pass by default."""
        # Even if timing errors are ignored, this has to raise an exception
        missing_end_stamp = (
            "\nNOTE invalid cue stamp\n00:00:20.000 --> \nfoo bar baz\n")
        with pytest.raises(CaptionReadSyntaxError):
            WebVTTReader().read(missing_end_stamp)

        # And this too
        comma_stamps = ("\n00:00:20,000 --> 00:00:22,000\n"
                        "Note the comma instead of point.\n")
        with pytest.raises(CaptionReadSyntaxError):
            WebVTTReader().read(comma_stamps)

        # todo: at this point it can be split into 2 separate tests
        tolerated_inputs = (
            # end time precedes start time
            ("\n"
             "00:00:20.000 --> 00:00:10.000\n"
             "Start time is greater than end time.\n"),
            # second cue starts before the first one
            ("\n"
             "00:00:20.000 --> 00:00:30.000\n"
             "Start times should be consecutive.\n"
             "\n"
             "00:00:10.000 --> 00:00:20.000\n"
             "This cue starts before the previous one.\n"),
        )
        for text in tolerated_inputs:
            try:
                WebVTTReader().read(text)
            except CaptionReadError:
                pytest.fail("Shouldn't raise CaptionReadError")
Ejemplo n.º 4
0
    def test_not_ignoring_timing_errors(self):
        """Expect a reader built with ignore_timing_errors=False to raise."""
        # (expected exception, input text) pairs, checked in order
        expectations = (
            (CaptionReadSyntaxError,
             u"\n"
             u"00:00:20,000 --> 00:00:10,000\n"
             u"foo bar baz"),
            (CaptionReadError,
             u"00:00:20,000 --> 00:00:10,000\n"
             u"Start time is greater than end time.\n"),
            (CaptionReadError,
             u"00:00:20,000 --> 00:00:30,000\n"
             u"Start times should be consecutive.\n"
             u"\n"
             u"00:00:10,000 --> 00:00:20,000\n"
             u"This cue starts before the previous one.\n"),
        )
        for expected_error, text in expectations:
            # a fresh reader per case, created outside the assertion scope
            strict_reader = WebVTTReader(ignore_timing_errors=False)
            with self.assertRaises(expected_error):
                strict_reader.read(text)
Ejemplo n.º 5
0
    def get_captions_from_output(self,
                                 output: str,
                                 language: str = 'en') -> 'pd.DataFrame':
        """Parse WebVTT text into a table of caption timings and content.

        Args:
            output: WebVTT text handed to ``WebVTTReader.read``.
            language: language code used both for reading and for
                selecting the captions.

        Returns:
            DataFrame with the columns 'start', 'end' and 'content', one
            row per caption.
        """
        reader = WebVTTReader()

        starts, ends, contents = [], [], []
        for caption in reader.read(output, language).get_captions(language):
            rendered = str(caption)
            timestamp = self.get_time_from_caption(rendered)
            starts.append(timestamp[0])
            ends.append(timestamp[1])
            # Keep what follows the last literal backslash-n in the rendered
            # caption and drop single quotes.
            contents.append(rendered.split("\\n")[-1].replace("'", ''))

        # The columns are collected directly. Joining each row with "," and
        # splitting it again raised IndexError as soon as the caption text
        # itself contained a comma.
        return pd.DataFrame({'start': starts, 'end': ends, 'content': contents})
Ejemplo n.º 6
0
    def test_ignoring_timing_errors(self):
        """Expect a syntax error to raise while timing errors pass by default."""
        # Even if timing errors are ignored, this is worse
        missing_end_stamp = (
            u"\nNOTE invalid cue stamp\n"
            u"00:00:20,000 --> \n"
            u"foo bar baz\n"
        )
        default_reader = WebVTTReader()
        with self.assertRaises(CaptionReadSyntaxError):
            default_reader.read(missing_end_stamp)

        tolerated_inputs = (
            # end time precedes start time
            (u"\n"
             u"00:00:20,000 --> 00:00:10,000\n"
             u"Start time is greater than end time.\n"),
            # second cue starts before the first one
            (u"\n"
             u"00:00:20,000 --> 00:00:30,000\n"
             u"Start times should be consecutive.\n"
             u"\n"
             u"00:00:10,000 --> 00:00:20,000\n"
             u"This cue starts before the previous one.\n"),
        )
        for text in tolerated_inputs:
            try:
                WebVTTReader().read(text)
            except CaptionReadError:
                self.fail(u"Shouldn't raise CaptionReadError")
Ejemplo n.º 7
0
    def test_ignoring_timing_errors(self):
        """Expect syntax errors to raise while timing errors pass by default."""
        # Even if timing errors are ignored, this has to raise an exception
        missing_end_stamp = (
            "\nNOTE invalid cue stamp\n"
            "00:00:20.000 --> \n"
            "foo bar baz\n"
        )
        default_reader = WebVTTReader()
        with self.assertRaises(CaptionReadSyntaxError):
            default_reader.read(missing_end_stamp)

        # And this too
        comma_stamps = (
            "\n00:00:20,000 --> 00:00:22,000\n"
            "Note the comma instead of point.\n"
        )
        default_reader = WebVTTReader()
        with self.assertRaises(CaptionReadSyntaxError):
            default_reader.read(comma_stamps)

        tolerated_inputs = (
            # end time precedes start time
            ("\n"
             "00:00:20.000 --> 00:00:10.000\n"
             "Start time is greater than end time.\n"),
            # second cue starts before the first one
            ("\n"
             "00:00:20.000 --> 00:00:30.000\n"
             "Start times should be consecutive.\n"
             "\n"
             "00:00:10.000 --> 00:00:20.000\n"
             "This cue starts before the previous one.\n"),
        )
        for text in tolerated_inputs:
            try:
                WebVTTReader().read(text)
            except CaptionReadError:
                self.fail("Shouldn't raise CaptionReadError")
def get_subs(vtt_subs_path):
    """Read a WebVTT file and return its cues as plain dictionaries.

    The captions of the first language reported by the parsed caption set
    are used. A short summary of the result is printed for debugging.

    args:
        vtt_subs_path - path to the .vtt file

    returns:
        list of dicts with the keys "text", "start" and "end"; start and
        end are the cue values divided by 1000000
    """
    # Local import: io.open decodes while reading on Python 2 and 3 alike,
    # replacing the Python 2-only str.decode() on a text-mode read.
    import io

    reader = WebVTTReader()

    encoding = utils.get_file_encoding(vtt_subs_path)
    with io.open(vtt_subs_path, 'r', encoding=encoding) as f:
        text = f.read()

    vtt = reader.read(text)
    vttsubs = vtt.get_captions(vtt.get_languages()[0])
    #vttsubs = pyvtt.WebVTTFile.open(vtt_subs_path)

    # Single-argument print(...) behaves the same as a statement (Python 2)
    # and as a function call (Python 3).
    print("vttsubs total: %i " % len(vttsubs))

    # The preview of the first cue is skipped when there are no cues.
    if vttsubs:
        print(vttsubs[0].start)
        print(vttsubs[0].end)
        print(vttsubs[0].get_text())

    return [
        {
            "text": s.get_text(),
            "start": float(s.start) / 1000000,
            "end": float(s.end) / 1000000,
        }
        for s in vttsubs
    ]
    def test_invalid_files(self):
        """Expect a reader built with ignore_timing_errors=False to raise."""
        badly_timed_inputs = (
            # end time precedes start time
            (u"00:00:20.000 --> 00:00:10.000\n"
             u"Start time is greater than end time."),
            # second cue starts before the first one
            (u"00:00:20.000 --> 00:00:30.000\n"
             u"Start times should be consecutive.\n"
             u"\n"
             u"00:00:10.000 --> 00:00:20.000\n"
             u"This cue starts before the previous one.\n"),
        )
        for text in badly_timed_inputs:
            # a fresh reader per case, created outside the assertion scope
            strict_reader = WebVTTReader(ignore_timing_errors=False)
            with self.assertRaises(CaptionReadError):
                strict_reader.read(text)
Ejemplo n.º 10
0
    def test_invalid_files(self):
        """Expect a reader built with ignore_timing_errors=False to raise."""
        badly_timed_inputs = (
            # end time precedes start time
            ("00:00:20.000 --> 00:00:10.000\n"
             "Start time is greater than end time."),
            # second cue starts before the first one
            ("00:00:20.000 --> 00:00:30.000\n"
             "Start times should be consecutive.\n"
             "\n"
             "00:00:10.000 --> 00:00:20.000\n"
             "This cue starts before the previous one.\n"),
        )
        for text in badly_timed_inputs:
            with pytest.raises(CaptionReadError):
                WebVTTReader(ignore_timing_errors=False).read(text)
Ejemplo n.º 11
0
    def get_captions_from_output(self, output: str, url: str) -> str:
        """Convert WebVTT text into processed caption text.

        Every 'en-US' caption is rendered with ``str()``, has its literal
        backslash-n sequences replaced by spaces, is passed through
        ``self.remove_time_from_caption`` and gets a trailing newline.

        Returns the concatenated captions when ``self.search_query`` is
        empty, otherwise whatever ``self.process_captions`` returns.
        """
        parsed = WebVTTReader().read(output)

        captions = [
            self.remove_time_from_caption(
                url, str(caption).replace(r'\n', " ")) + "\n"
            for caption in parsed.get_captions('en-US')
        ]

        if self.search_query != '':
            return self.process_captions(captions, url)

        return ''.join(captions)
Ejemplo n.º 12
0
    def test_webvtt_to_microdvd_conversion(self, sample_microdvd,
                                           sample_webvtt):
        """Read the WebVTT sample and compare the MicroDVD output with its sample."""
        parsed = WebVTTReader().read(sample_webvtt)
        microdvd = MicroDVDWriter().write(parsed)

        # the writer is expected to hand back text
        assert isinstance(microdvd, str)
        self.assert_microdvd_equals(sample_microdvd, microdvd)
Ejemplo n.º 13
0
    def test_webvtt_to_webvtt_conversion(self, sample_webvtt_from_webvtt,
                                         sample_webvtt):
        """Read the WebVTT sample and compare the rewritten WebVTT with its sample."""
        parsed = WebVTTReader().read(sample_webvtt)
        rewritten = WebVTTWriter().write(parsed)

        # the writer is expected to hand back text
        assert isinstance(rewritten, str)
        self.assert_webvtt_equals(sample_webvtt_from_webvtt, rewritten)
Ejemplo n.º 14
0
    def test_positioning_is_kept(self,
                                 sample_webvtt_from_dfxp_with_positioning):
        """Expect a read/write round trip to reproduce the sample unchanged."""
        source = sample_webvtt_from_dfxp_with_positioning
        parsed = WebVTTReader().read(source)
        rewritten = WebVTTWriter().write(parsed)

        assert source == rewritten
Ejemplo n.º 15
0
 def test_webvtt_to_dfxp_conversion(self):
     """Convert the UTF-8 decoded WebVTT sample to DFXP and compare with the sample."""
     reader = WebVTTReader()
     parsed = reader.read(SAMPLE_WEBVTT.decode(u'utf-8'))
     dfxp = DFXPWriter().write(parsed)
     # the writer is expected to return unicode text
     self.assertTrue(isinstance(dfxp, unicode))
     # styling and span differences are excluded from the comparison
     self.assertDFXPEquals(SAMPLE_DFXP_UNICODE, dfxp,
                           ignore_styling=True, ignore_spans=True)
Ejemplo n.º 16
0
    def get_captions_from_output(self, output: str) -> str:
        """Flatten WebVTT text into one line without repeated neighbours.

        Every 'en-US' caption is rendered with ``str()``, has its literal
        backslash-n sequences turned into real newlines and is passed
        through ``self.remove_time_from_caption``. The combined text is
        split into lines, any line equal to the line directly before it is
        dropped, and the remaining lines are joined with single spaces.
        """
        parsed = WebVTTReader().read(output)

        pieces = [
            self.remove_time_from_caption(str(caption).replace(r'\n', "\n"))
            for caption in parsed.get_captions('en-US')
        ]

        kept = []
        previous = ''
        for line in ''.join(pieces).split("\n"):
            # compare with the preceding line only, not with all earlier ones
            if line != previous:
                kept.append(line)
            previous = line

        return ' '.join(kept)
 def test_webvtt_to_dfxp_conversion(self):
     """Convert the WebVTT sample to DFXP and compare with the DFXP sample."""
     parsed = WebVTTReader().read(SAMPLE_WEBVTT)
     dfxp = DFXPWriter().write(parsed)
     # the writer is expected to return text (six.text_type)
     self.assertTrue(isinstance(dfxp, six.text_type))
     # styling and span differences are excluded from the comparison
     self.assertDFXPEquals(SAMPLE_DFXP, dfxp,
                           ignore_styling=True, ignore_spans=True)
Ejemplo n.º 18
0
def convert_vtt_to_srt(dir):
    """Convert every ``*.vtt`` file directly inside *dir* to SRT.

    For each WebVTT file a sibling file with the ``.srt`` extension is
    written as UTF-8, after which the WebVTT file is deleted.

    args:
        dir - directory searched (non-recursively) for ``*.vtt`` files
    """
    # Local import: io.open handles the UTF-8 decoding/encoding on both
    # Python 2 and Python 3.
    import io

    for vtt_file in glob.glob(os.path.join(dir, "*.vtt")):
        with io.open(vtt_file, 'r', encoding='UTF-8') as vtt:
            vttsub = vtt.read()

        # Convert before creating the output file, so a failing conversion
        # neither leaves an empty .srt behind nor leaks the input handle.
        srtsub = SRTWriter().write(WebVTTReader().read(vttsub))

        srt_path = os.path.splitext(vtt_file)[0] + '.srt'
        with io.open(srt_path, 'w', encoding='UTF-8') as srt:
            srt.write(srtsub)

        os.remove(vtt_file)
Ejemplo n.º 19
0
    def test_webvtt_to_dfxp_conversion(self, sample_dfxp, sample_webvtt):
        """Convert the WebVTT sample to DFXP and compare with the DFXP sample."""
        parsed = WebVTTReader().read(sample_webvtt)
        dfxp = DFXPWriter().write(parsed)

        # the writer is expected to hand back text
        assert isinstance(dfxp, str)
        # styling and span differences are excluded from the comparison
        self.assert_dfxp_equals(
            sample_dfxp, dfxp, ignore_styling=True, ignore_spans=True)
Ejemplo n.º 20
0
    def test_not_ignoring_timing_errors(self):
        """Expect a reader built with ignore_timing_errors=False to raise."""
        # todo: same assert w/ different arguments -> this can be parametrized;
        badly_timed_inputs = (
            ("\n"
             "00:00:20.000 --> 00:00:10.000\n"
             "foo bar baz"),
            ("00:00:20.000 --> 00:00:10.000\n"
             "Start time is greater than end time.\n"),
            ("00:00:20.000 --> 00:00:30.000\n"
             "Start times should be consecutive.\n"
             "\n"
             "00:00:10.000 --> 00:00:20.000\n"
             "This cue starts before the previous one.\n"),
        )
        for text in badly_timed_inputs:
            with pytest.raises(CaptionReadError):
                WebVTTReader(ignore_timing_errors=False).read(text)
Ejemplo n.º 21
0
def get(content_id, lang):
    """Download a transcript as WebVTT and save it as an SRT file.

    The transcript list is requested from ``api_url``; the link found for
    *lang* is rewritten to its WebVTT variant, downloaded, converted to SRT
    and written to ``<content_id>.<lang>.srt`` as UTF-8 with CRLF line
    endings.

    args:
        content_id - identifier sent as the 'content_id' query parameter;
                     also used as the output file name prefix
        lang - element name looked up below //transcripts; also part of
               the output file name

    returns:
        0

    raises:
        IndexError - when the response lists no transcript for *lang*
    """
    # Local import: io.open writes text with an explicit encoding on both
    # Python 2 and Python 3.
    import io

    args = {
        'content_id': content_id
    }
    CClist = requests.get(api_url, params=args)
    links = etree.HTML(CClist.content).xpath(
        '//transcripts/' + lang + '/text()')
    if not links:
        # same exception type as the former bare [0] lookup, with context
        raise IndexError(
            'no %s transcript listed for content %s' % (lang, content_id))
    CClink = links[0].replace('captions', 'captions_webvtt').replace('smi', 'vtt')

    origCC = requests.get(CClink)
    srtCC = SRTWriter().write(WebVTTReader().read(origCC.text))

    # newline='' keeps the explicit '\r\n' sequences from being translated
    # a second time; the with-block closes the file even if write() fails.
    with io.open(content_id + '.' + lang + '.srt', 'w',
                 encoding='utf-8', newline='') as srt_file:
        srt_file.write(srtCC.replace('\n', '\r\n'))
    return 0
Ejemplo n.º 22
0
def getCaptions(url, progress_cb, so_far, task_weight):
    """Fetch the English WebVTT subtitles of a video and parse them.

    args:
        url - video URL handed to youtube_dl
        progress_cb - callable invoked as
                      progress_cb(so_far + task_weight, so_far + task_weight)
                      once the subtitles have been parsed
        so_far - progress value accumulated before this task
        task_weight - progress value contributed by this task

    returns:
        the 'en-US' captions of the parsed subtitle file, or an empty list
        when the video has no English subtitles
    """
    ydl = youtube_dl.YoutubeDL({
        'writesubtitles': True,
        'allsubtitles': True,
        'writeautomaticsub': True
    })
    with ydl:
        res = ydl.extract_info(url, download=False)

        # .get() covers both a missing subtitle table and a table without
        # an 'en' entry, which previously raised KeyError.
        english = (res.get('requested_subtitles') or {}).get('en')
        if not english:
            # This message used to sit after the return and never ran.
            print('Youtube Video does not have any english captions')
            return []

        print('Grabbing vtt file from ' + english['url'])
        response = requests.get(english['url'], stream=True)
        payload = b''.join(response.iter_content(1024))

        # WebVTT files are UTF-8 encoded; decoding as ASCII failed on any
        # non-ASCII caption. 'utf-8-sig' also drops an optional leading BOM.
        arr = WebVTTReader().read(payload.decode('utf-8-sig'))
        progress_cb(so_far + task_weight, so_far + task_weight)
        return arr.get_captions('en-US')
Ejemplo n.º 23
0
class WebVTTReaderTestCase(unittest.TestCase):
    """Detection, parsing and text-cleaning checks for WebVTTReader."""

    def setUp(self):
        # every test gets its own reader instance
        self.reader = WebVTTReader()

    def test_positive_answer_for_detection(self):
        detected = self.reader.detect(SAMPLE_WEBVTT)
        self.assertTrue(detected)

    def test_negative_answer_for_detection(self):
        detected = self.reader.detect(SAMPLE_SRT)
        self.assertFalse(detected)

    def test_caption_length(self):
        caption_set = self.reader.read(SAMPLE_WEBVTT)
        english = caption_set.get_captions('en-US')
        self.assertEqual(len(english), 7)

    def test_read_supports_multiple_languages(self):
        caption_set = self.reader.read(SAMPLE_WEBVTT, lang='es')
        spanish = caption_set.get_captions('es')
        self.assertIsNotNone(spanish)

    def test_proper_timestamps(self):
        caption_set = self.reader.read(SAMPLE_WEBVTT)
        third_cue = caption_set.get_captions('en-US')[2]
        self.assertEqual(third_cue.start, 17000000)
        self.assertEqual(third_cue.end, 18752000)

    def test_webvtt_cue_components_removed_from_text(self):
        marked_up = (
            "\n"  # the first line is skipped by the cleaner
            "<c vIntro><b>Wikipedia</b> is a great adventure. <i>It may have "
            "its shortcomings</i>, but it is<u> the largest</u> collective "
            "knowledge construction endevour</c> <ruby>base text <rt>"
            "annotation</rt></ruby> <v Audry><b>Yes</b>, indeed!"
        )
        result = self.reader._clean(marked_up)
        expected = (
            "Wikipedia is a great adventure. It may have "
            "its shortcomings, but it is the largest collective "
            "knowledge construction endevour base text annotation"
            " Audry: Yes, indeed!"
        )
        self.assertEqual(result, expected)

    def test_empty_file(self):
        # the reader is created outside the assertion scope
        fresh_reader = WebVTTReader()
        with self.assertRaises(CaptionReadNoCaptions):
            fresh_reader.read(SAMPLE_WEBVTT_EMPTY)
Ejemplo n.º 24
0
 def test_empty_file(self):
     """Expect read() to raise CaptionReadNoCaptions for the empty sample."""
     # reader and decoded input are prepared outside the assertion scope
     reader = WebVTTReader()
     empty_text = SAMPLE_WEBVTT_EMPTY.decode(u'utf-8')
     with self.assertRaises(CaptionReadNoCaptions):
         reader.read(empty_text)
Ejemplo n.º 25
0
 def setUp(self):
     """Create the WebVTTReader stored on ``self.reader`` before each test."""
     fresh_reader = WebVTTReader()
     self.reader = fresh_reader
 def test_positioning_is_kept(self):
     """Expect a read/write round trip to reproduce the sample unchanged."""
     source = SAMPLE_WEBVTT_FROM_DFXP_WITH_POSITIONING
     parsed = WebVTTReader().read(source)
     rewritten = WebVTTWriter().write(parsed)
     self.assertEqual(source, rewritten)
Ejemplo n.º 27
0
class WebVTTReaderTestCase(unittest.TestCase):
    """WebVTTReader checks run against UTF-8 decoded byte-string samples."""

    def setUp(self):
        # every test gets its own reader instance
        self.reader = WebVTTReader()

    def test_positive_answer_for_detection(self):
        webvtt_text = SAMPLE_WEBVTT.decode(u'utf-8')
        self.assertTrue(self.reader.detect(webvtt_text))

    def test_negative_answer_for_detection(self):
        srt_text = SAMPLE_SRT.decode(u'utf-8')
        self.assertFalse(self.reader.detect(srt_text))

    def test_caption_length(self):
        caption_set = self.reader.read(SAMPLE_WEBVTT_2.decode(u'utf-8'))
        english = caption_set.get_captions(u'en-US')
        self.assertEqual(len(english), 7)

    def test_read_supports_multiple_languages(self):
        webvtt_text = SAMPLE_WEBVTT.decode(u'utf-8')
        caption_set = self.reader.read(webvtt_text, lang=u'es')
        self.assertIsNotNone(caption_set.get_captions(u'es'))

    def test_proper_timestamps(self):
        caption_set = self.reader.read(SAMPLE_WEBVTT.decode(u'utf-8'))
        third_cue = caption_set.get_captions(u'en-US')[2]
        self.assertEqual(third_cue.start, 17000000)
        self.assertEqual(third_cue.end, 18752000)

    def test_webvtt_cue_components_removed_from_text(self):
        marked_up = (
            u"<c vIntro><b>Wikipedia</b> is a great adventure. <i>It may have "
            u"its shortcomings</i>, but it is<u> the largest</u> collective "
            u"knowledge construction endevour</c> <ruby>base text <rt>"
            u"annotation</rt></ruby> <v Audry><b>Yes</b>, indeed!"
        )
        result = self.reader._remove_styles(marked_up)
        expected = (
            u"Wikipedia is a great adventure. It may have "
            u"its shortcomings, but it is the largest collective "
            u"knowledge construction endevour base text annotation"
            u" Audry: Yes, indeed!"
        )
        self.assertEqual(result, expected)

    def test_empty_file(self):
        # reader and decoded input are prepared outside the assertion scope
        fresh_reader = WebVTTReader()
        empty_text = SAMPLE_WEBVTT_EMPTY.decode(u'utf-8')
        with self.assertRaises(CaptionReadNoCaptions):
            fresh_reader.read(empty_text)

    def test_invalid_files(self):
        # The indentation inside the triple-quoted samples is part of the
        # text handed to the reader.
        textless_cue = u"""
            NOTE Cues without text are invalid.

            00:00:20,000 --> 00:00:10,000
            """
        fresh_reader = WebVTTReader()
        with self.assertRaises(CaptionReadSyntaxError):
            fresh_reader.read(textless_cue)

        backwards_cue = u"""
            00:00:20,000 --> 00:00:10,000
            Start time is greater than end time.
            """
        fresh_reader = WebVTTReader()
        with self.assertRaises(CaptionReadError):
            fresh_reader.read(backwards_cue)

        unordered_cues = u"""
            00:00:20,000 --> 00:00:30,000
            Start times should be consecutive.

            00:00:10,000 --> 00:00:20,000
            This cue starts before the previous one.
            """
        fresh_reader = WebVTTReader()
        with self.assertRaises(CaptionReadError):
            fresh_reader.read(unordered_cues)
Ejemplo n.º 28
0
class WebVTTReaderTestCase(unittest.TestCase):
    """Detection, parsing, style-removal and error-handling checks for WebVTTReader."""

    def setUp(self):
        # every test gets its own reader instance
        self.reader = WebVTTReader()

    def test_positive_answer_for_detection(self):
        self.assertTrue(self.reader.detect(SAMPLE_WEBVTT))

    def test_negative_answer_for_detection(self):
        self.assertFalse(self.reader.detect(SAMPLE_SRT))

    def test_caption_length(self):
        caption_set = self.reader.read(SAMPLE_WEBVTT_2)
        english = caption_set.get_captions(u'en-US')
        self.assertEqual(len(english), 7)

    def test_read_supports_multiple_languages(self):
        caption_set = self.reader.read(SAMPLE_WEBVTT, lang=u'es')
        self.assertIsNotNone(caption_set.get_captions(u'es'))

    def test_proper_timestamps(self):
        caption_set = self.reader.read(SAMPLE_WEBVTT)
        third_cue = caption_set.get_captions(u'en-US')[2]
        self.assertEqual(third_cue.start, 17000000)
        self.assertEqual(third_cue.end, 18752000)

    def test_webvtt_cue_components_removed_from_text(self):
        marked_up = (
            u"<c vIntro><b>Wikipedia</b> is a great adventure. <i>It may have "
            u"its shortcomings</i>, but it is<u> the largest</u> collective "
            u"knowledge construction endevour</c> <ruby>base text <rt>"
            u"annotation</rt></ruby> <v Audry><b>Yes</b>, indeed!"
        )
        result = self.reader._remove_styles(marked_up)
        expected = (
            u"Wikipedia is a great adventure. It may have "
            u"its shortcomings, but it is the largest collective "
            u"knowledge construction endevour base text annotation"
            u" Audry: Yes, indeed!"
        )
        self.assertEqual(result, expected)

    def test_empty_file(self):
        # the reader is created outside the assertion scope
        fresh_reader = WebVTTReader()
        with self.assertRaises(CaptionReadNoCaptions):
            fresh_reader.read(SAMPLE_WEBVTT_EMPTY)

    def test_not_ignoring_timing_errors(self):
        """Expect a reader built with ignore_timing_errors=False to raise."""
        badly_timed_inputs = (
            (u"\n"
             u"00:00:20.000 --> 00:00:10.000\n"
             u"foo bar baz"),
            (u"00:00:20.000 --> 00:00:10.000\n"
             u"Start time is greater than end time.\n"),
            (u"00:00:20.000 --> 00:00:30.000\n"
             u"Start times should be consecutive.\n"
             u"\n"
             u"00:00:10.000 --> 00:00:20.000\n"
             u"This cue starts before the previous one.\n"),
        )
        for text in badly_timed_inputs:
            strict_reader = WebVTTReader(ignore_timing_errors=False)
            with self.assertRaises(CaptionReadError):
                strict_reader.read(text)

    def test_ignoring_timing_errors(self):
        """Expect syntax errors to raise while timing errors pass by default."""
        # Even if timing errors are ignored, this has to raise an exception
        missing_end_stamp = (
            u"\nNOTE invalid cue stamp\n"
            u"00:00:20.000 --> \n"
            u"foo bar baz\n"
        )
        default_reader = WebVTTReader()
        with self.assertRaises(CaptionReadSyntaxError):
            default_reader.read(missing_end_stamp)

        # And this too
        comma_stamps = (
            u"\n00:00:20,000 --> 00:00:22,000\n"
            u"Note the comma instead of point.\n"
        )
        default_reader = WebVTTReader()
        with self.assertRaises(CaptionReadSyntaxError):
            default_reader.read(comma_stamps)

        tolerated_inputs = (
            # end time precedes start time
            (u"\n"
             u"00:00:20.000 --> 00:00:10.000\n"
             u"Start time is greater than end time.\n"),
            # second cue starts before the first one
            (u"\n"
             u"00:00:20.000 --> 00:00:30.000\n"
             u"Start times should be consecutive.\n"
             u"\n"
             u"00:00:10.000 --> 00:00:20.000\n"
             u"This cue starts before the previous one.\n"),
        )
        for text in tolerated_inputs:
            try:
                WebVTTReader().read(text)
            except CaptionReadError:
                self.fail(u"Shouldn't raise CaptionReadError")

    def test_invalid_files(self):
        """Expect read() to raise for a text-less cue and for badly timed cues."""
        textless_cue = (
            u"\nNOTE Cues without text are invalid.\n"
            u"00:00:20.000 --> 00:00:30.000\n"
            u"\n"
            u"00:00:40.000 --> 00:00:50.000\n"
            u"foo bar baz\n"
        )
        default_reader = WebVTTReader()
        with self.assertRaises(CaptionReadSyntaxError):
            default_reader.read(textless_cue)

        backwards_cue = (
            u"00:00:20.000 --> 00:00:10.000\n"
            u"Start time is greater than end time."
        )
        strict_reader = WebVTTReader(ignore_timing_errors=False)
        with self.assertRaises(CaptionReadError):
            strict_reader.read(backwards_cue)

        unordered_cues = (
            u"00:00:20.000 --> 00:00:30.000\n"
            u"Start times should be consecutive.\n"
            u"\n"
            u"00:00:10.000 --> 00:00:20.000\n"
            u"This cue starts before the previous one.\n"
        )
        strict_reader = WebVTTReader(ignore_timing_errors=False)
        with self.assertRaises(CaptionReadError):
            strict_reader.read(unordered_cues)

    def test_zero_start(self):
        caption_set = self.reader.read(SAMPLE_WEBVTT_LAST_CUE_ZERO_START)
        first_cue = caption_set.get_captions(u'en-US')[0]
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(first_cue.start, 0)
    def test_cue_settings_are_kept(self):
        """Expect a read/write round trip to reproduce the sample unchanged."""
        source = SAMPLE_WEBVTT_WITH_CUE_SETTINGS
        parsed = WebVTTReader().read(source)
        rewritten = WebVTTWriter().write(parsed)

        self.assertEqual(source, rewritten)
Ejemplo n.º 30
0
 def setUp(self):
     """Create the WebVTTReader stored on ``self.reader`` before each test."""
     fresh_reader = WebVTTReader()
     self.reader = fresh_reader
 def test_empty_cues_are_deleted(self):
     """Compare the rewritten empty-cue sample with its expected WebVTT output."""
     parsed = WebVTTReader().read(SAMPLE_WEBVTT_EMPTY_CUE)
     rewritten = WebVTTWriter().write(parsed)
     self.assertEqual(SAMPLE_WEBVTT_FROM_EMPTY_CUE, rewritten)
Ejemplo n.º 32
0
 def setUpClass(cls):
     """Parse the UTF-8 decoded WebVTT sample and store it on ``cls.captions``."""
     reader = WebVTTReader()
     decoded_sample = SAMPLE_WEBVTT.decode(u'utf-8')
     cls.captions = reader.read(decoded_sample)
 def test_webvtt_to_srt_conversion(self):
     """Convert the WebVTT sample to SRT and compare with the SRT sample."""
     parsed = WebVTTReader().read(SAMPLE_WEBVTT)
     srt = SRTWriter().write(parsed)
     # the writer is expected to return text (six.text_type)
     self.assertTrue(isinstance(srt, six.text_type))
     self.assertSRTEquals(SAMPLE_SRT, srt)
Ejemplo n.º 34
0
class WebVTTReaderTestCase(unittest.TestCase):
    """WebVTTReader checks run against UTF-8 decoded byte-string samples."""

    def setUp(self):
        # every test gets its own reader instance
        self.reader = WebVTTReader()

    def test_positive_answer_for_detection(self):
        webvtt_text = SAMPLE_WEBVTT.decode(u'utf-8')
        self.assertTrue(self.reader.detect(webvtt_text))

    def test_negative_answer_for_detection(self):
        srt_text = SAMPLE_SRT.decode(u'utf-8')
        self.assertFalse(self.reader.detect(srt_text))

    def test_caption_length(self):
        caption_set = self.reader.read(SAMPLE_WEBVTT_2.decode(u'utf-8'))
        english = caption_set.get_captions(u'en-US')
        self.assertEqual(len(english), 7)

    def test_read_supports_multiple_languages(self):
        webvtt_text = SAMPLE_WEBVTT.decode(u'utf-8')
        caption_set = self.reader.read(webvtt_text, lang=u'es')
        self.assertIsNotNone(caption_set.get_captions(u'es'))

    def test_proper_timestamps(self):
        caption_set = self.reader.read(SAMPLE_WEBVTT.decode(u'utf-8'))
        third_cue = caption_set.get_captions(u'en-US')[2]
        self.assertEqual(third_cue.start, 17000000)
        self.assertEqual(third_cue.end, 18752000)

    def test_webvtt_cue_components_removed_from_text(self):
        marked_up = (
            u"<c vIntro><b>Wikipedia</b> is a great adventure. <i>It may have "
            u"its shortcomings</i>, but it is<u> the largest</u> collective "
            u"knowledge construction endevour</c> <ruby>base text <rt>"
            u"annotation</rt></ruby> <v Audry><b>Yes</b>, indeed!"
        )
        result = self.reader._remove_styles(marked_up)
        expected = (
            u"Wikipedia is a great adventure. It may have "
            u"its shortcomings, but it is the largest collective "
            u"knowledge construction endevour base text annotation"
            u" Audry: Yes, indeed!"
        )
        self.assertEqual(result, expected)

    def test_empty_file(self):
        # reader and decoded input are prepared outside the assertion scope
        fresh_reader = WebVTTReader()
        empty_text = SAMPLE_WEBVTT_EMPTY.decode(u'utf-8')
        with self.assertRaises(CaptionReadNoCaptions):
            fresh_reader.read(empty_text)

    def test_not_ignoring_timing_errors(self):
        """Expect a reader built with ignore_timing_errors=False to raise."""
        # (expected exception, input text) pairs, checked in order
        expectations = (
            (CaptionReadSyntaxError,
             u"\n"
             u"00:00:20,000 --> 00:00:10,000\n"
             u"foo bar baz"),
            (CaptionReadError,
             u"00:00:20,000 --> 00:00:10,000\n"
             u"Start time is greater than end time.\n"),
            (CaptionReadError,
             u"00:00:20,000 --> 00:00:30,000\n"
             u"Start times should be consecutive.\n"
             u"\n"
             u"00:00:10,000 --> 00:00:20,000\n"
             u"This cue starts before the previous one.\n"),
        )
        for expected_error, text in expectations:
            strict_reader = WebVTTReader(ignore_timing_errors=False)
            with self.assertRaises(expected_error):
                strict_reader.read(text)

    def test_ignoring_timing_errors(self):
        """Expect a syntax error to raise while timing errors pass by default."""
        # Even if timing errors are ignored, this is worse
        missing_end_stamp = (
            u"\nNOTE invalid cue stamp\n"
            u"00:00:20,000 --> \n"
            u"foo bar baz\n"
        )
        default_reader = WebVTTReader()
        with self.assertRaises(CaptionReadSyntaxError):
            default_reader.read(missing_end_stamp)

        tolerated_inputs = (
            # end time precedes start time
            (u"\n"
             u"00:00:20,000 --> 00:00:10,000\n"
             u"Start time is greater than end time.\n"),
            # second cue starts before the first one
            (u"\n"
             u"00:00:20,000 --> 00:00:30,000\n"
             u"Start times should be consecutive.\n"
             u"\n"
             u"00:00:10,000 --> 00:00:20,000\n"
             u"This cue starts before the previous one.\n"),
        )
        for text in tolerated_inputs:
            try:
                WebVTTReader().read(text)
            except CaptionReadError:
                self.fail(u"Shouldn't raise CaptionReadError")

    def test_invalid_files(self):
        """Expect read() to raise for a text-less cue and for badly timed cues."""
        textless_cue = (
            u"\nNOTE Cues without text are invalid.\n"
            u"00:00:20,000 --> 00:00:30,000\n"
            u"\n"
            u"00:00:40,000 --> 00:00:50,000\n"
            u"foo bar baz\n"
        )
        default_reader = WebVTTReader()
        with self.assertRaises(CaptionReadSyntaxError):
            default_reader.read(textless_cue)

        backwards_cue = (
            u"00:00:20,000 --> 00:00:10,000\n"
            u"Start time is greater than end time."
        )
        strict_reader = WebVTTReader(ignore_timing_errors=False)
        with self.assertRaises(CaptionReadError):
            strict_reader.read(backwards_cue)

        unordered_cues = (
            u"00:00:20,000 --> 00:00:30,000\n"
            u"Start times should be consecutive.\n"
            u"\n"
            u"00:00:10,000 --> 00:00:20,000\n"
            u"This cue starts before the previous one.\n"
        )
        strict_reader = WebVTTReader(ignore_timing_errors=False)
        with self.assertRaises(CaptionReadError):
            strict_reader.read(unordered_cues)