def test_properly_converts_timing(self):
    # Read the alternative-timing-formats fixture and compare the start/end
    # values of the first two 'en-US' captions with the expected numbers.
    parsed = DFXPReader().read(DFXP_WITH_ALTERNATIVE_TIMING_FORMATS)
    captions = parsed.get_captions('en-US')
    expected_spans = [(1900000, 3050000), (4000000, 5200000)]
    for position, (start, end) in enumerate(expected_spans):
        self.assertEqual(captions[position].start, start)
        self.assertEqual(captions[position].end, end)
def test_properly_converts_frametiming(
        self, sample_dfxp_with_frame_timing):
    # Only the first 'en-US' caption of the frame-timing fixture is checked.
    parsed = DFXPReader().read(sample_dfxp_with_frame_timing)
    first_caption = parsed.get_captions('en-US')[0]
    assert first_caption.end == 12233333
    assert first_caption.start == 9666666
def setUpClass(cls):
    # Parse each sample document once and keep the results on the class.
    # Three of the samples are decoded from UTF-8 before being read; the
    # unicode sample is handed to the reader as-is.
    decoded_sample = SAMPLE_DFXP.decode(u'utf-8')
    cls.captions = DFXPReader().read(decoded_sample)

    decoded_utf8_sample = SAMPLE_DFXP_UTF8.decode(u'utf-8')
    cls.captions_utf8 = DFXPReader().read(decoded_utf8_sample)

    cls.captions_unicode = DFXPReader().read(SAMPLE_DFXP_UNICODE)

    decoded_plain_sample = SAMPLE_DFXP_WITHOUT_REGION_AND_STYLE.decode(
        u'utf-8')
    cls.captions_without_style_and_region = DFXPReader().read(
        decoded_plain_sample)
def test_merge_concurrent_captions(self, dfxp_with_concurrent_captions):
    # The fixture yields 5 'en-US' captions before merging and 3 afterwards.
    original_set = DFXPReader().read(dfxp_with_concurrent_captions)
    captions_before = original_set.get_captions('en-US')

    merged_set = merge_concurrent_captions(original_set)
    captions_after = merged_set.get_captions('en-US')

    assert len(captions_before) == 5
    assert len(captions_after) == 3
def test_merge_concurrent_captions(self):
    # The fixture yields 5 'en-US' captions before merging and 3 afterwards.
    original_set = DFXPReader().read(DFXP_WITH_CONCURRENT_CAPTIONS)
    captions_before = original_set.get_captions('en-US')
    self.assertEqual(len(captions_before), 5)

    merged_set = merge_concurrent_captions(original_set)
    captions_after = merged_set.get_captions('en-US')
    self.assertEqual(len(captions_after), 3)
def test_proper_xml_entity_escaping(
        self, sample_dfxp_with_escaped_apostrophe):
    # The first node of the first 'en-US' caption must hold the expected
    # text, and the same text must appear in what DFXPWriter produces.
    parsed = DFXPReader().read(sample_dfxp_with_escaped_apostrophe)
    first_caption = parsed.get_captions('en-US')[0]
    node_text = first_caption.nodes[0].content
    assert node_text == "<< \"Andy's Caf\xe9 & Restaurant\" this way"

    written = DFXPWriter().write(parsed)
    assert "<< \"Andy's Café & Restaurant\" this way" in written
def test_properly_converts_timing(self):
    # Compare the start/end values of the first two 'en-US' captions from
    # the alternative-timing-formats fixture with the expected numbers.
    parsed = DFXPReader().read(DFXP_WITH_ALTERNATIVE_TIMING_FORMATS)
    captions = parsed.get_captions('en-US')
    expected_spans = ((1900000, 3050000), (4000000, 5200000))
    for position, (start, end) in enumerate(expected_spans):
        self.assertEqual(captions[position].start, start)
        self.assertEqual(captions[position].end, end)
def test_individual_timings_of_captions_with_matching_timespec_are_kept(
        self):
    # All three 'en-US' captions in the fixture are expected to report the
    # same (start, end) pair.
    parsed = DFXPReader().read(
        SAMPLE_DFXP_MULTIPLE_CAPTIONS_WITH_THE_SAME_TIMING)
    expected_timings = [(9209000, 12312000) for _ in range(3)]
    actual_timings = []
    for caption in parsed.get_captions('en-US'):
        actual_timings.append((caption.start, caption.end))
    self.assertEqual(expected_timings, actual_timings)
def test_proper_xml_entity_escaping(self):
    # The first node of the first 'en-US' caption must hold the expected
    # text, and the same text must appear in what DFXPWriter produces.
    parsed = DFXPReader().read(DFXP_WITH_ESCAPED_APOSTROPHE)
    first_caption = parsed.get_captions(u'en-US')[0]
    node_text = first_caption.nodes[0].content
    self.assertEqual(
        node_text, u"<< \"Andy's Caf\xe9 & Restaurant\" this way")

    written = DFXPWriter().write(parsed)
    self.assertIn(u"<< \"Andy's Café & Restaurant\" this way", written)
def test_individual_timings_of_captions_with_matching_timespec_are_kept(
        self):
    # Captions sharing one timespec must each keep that (start, end) pair.
    parsed = DFXPReader().read(
        SAMPLE_DFXP_MULTIPLE_CAPTIONS_WITH_THE_SAME_TIMING)
    shared_span = (9209000, 12312000)
    expected_timings = [shared_span, shared_span, shared_span]
    actual_timings = []
    for caption in parsed.get_captions('en-US'):
        actual_timings.append((caption.start, caption.end))
    self.assertEqual(expected_timings, actual_timings)
def test_properly_converts_timing_with_frames_with_30fps_default(self):
    # Check start/end of the first three 'en-US' captions, in order.
    parsed = DFXPReader().read(SAMPLE_DFXP_FRAME_TIMING_DEFAULT_30FPS)
    captions = parsed.get_captions('en-US')
    expected_spans = [
        (8100000, 12190000),
        (258066666, 8168190000),
        (43208966666, 50412190000),
    ]
    for position, (start, end) in enumerate(expected_spans):
        self.assertEqual(captions[position].start, start)
        self.assertEqual(captions[position].end, end)
def test_properly_converts_timing_with_frames_with_25fps(self):
    # Check start/end of the first three 'en-US' captions, in order.
    parsed = DFXPReader().read(SAMPLE_DFXP_FRAME_TIMING_25FPS)
    captions = parsed.get_captions('en-US')
    expected_spans = [
        (8120000, 12190000),
        (258080000, 8168190000),
        (43208960000, 50412190000),
    ]
    for position, (start, end) in enumerate(expected_spans):
        self.assertEqual(captions[position].start, start)
        self.assertEqual(captions[position].end, end)
def test_individual_timings_of_captions_with_matching_timespec_are_kept(
        self, sample_dfxp_multiple_captions_with_the_same_timing):
    # All three 'en-US' captions in the fixture are expected to report the
    # same (start, end) pair.
    parsed = DFXPReader().read(
        sample_dfxp_multiple_captions_with_the_same_timing)
    expected_timings = [(9209000, 12312000) for _ in range(3)]
    actual_timings = []
    for caption in parsed.get_captions('en-US'):
        actual_timings.append((caption.start, caption.end))
    assert expected_timings == actual_timings
def test_properly_converts_timing(
        self, sample_dfxp_with_alternative_timing_formats):
    # Compare the start/end values of the first two 'en-US' captions with
    # the expected numbers.
    parsed = DFXPReader().read(sample_dfxp_with_alternative_timing_formats)
    captions = parsed.get_captions('en-US')
    expected_spans = [(1900000, 3050000), (4000000, 5200000)]
    for position, (start, end) in enumerate(expected_spans):
        assert captions[position].start == start
        assert captions[position].end == end
def test_proper_xml_entity_escaping(self):
    # Reading must yield the expected text in the first node of the first
    # 'en-US' caption; writing the set back out must contain that same text.
    parsed = DFXPReader().read(DFXP_WITH_ESCAPED_APOSTROPHE)
    first_node = parsed.get_captions(u'en-US')[0].nodes[0]
    self.assertEqual(
        first_node.content,
        u"<< \"Andy's Caf\xe9 & Restaurant\" this way")

    written = DFXPWriter().write(parsed)
    self.assertIn(u"<< \"Andy's Café & Restaurant\" this way", written)
def test_properly_converts_timing_with_frames_with_29_97fps_multiplier(
        self):
    # Check start/end of the first three 'en-US' captions, in order.
    parsed = DFXPReader().read(SAMPLE_DFXP_FRAME_TIMING_MULTIPLIER_29_97FPS)
    captions = parsed.get_captions('en-US')
    expected_spans = [
        (8100100, 12190000),
        (258066733, 8168190000),
        (43208967633, 50412190000),
    ]
    for position, (start, end) in enumerate(expected_spans):
        self.assertEqual(captions[position].start, start)
        self.assertEqual(captions[position].end, end)
def test_individual_layouts_of_captions_with_matching_timespec_are_kept(
        self):
    # Each caption's serialized layout must match its own expected tuple,
    # even though the captions share a timespec.
    parsed = DFXPReader().read(
        SAMPLE_DFXP_MULTIPLE_CAPTIONS_WITH_THE_SAME_TIMING)
    percent = u'%'
    alignment = (u'center', u'bottom')
    expected_layouts = [
        (((10, percent), (10, percent)), None, None, alignment),
        (((40, percent), (40, percent)), None, None, alignment),
        (((10, percent), (70, percent)), None, None, alignment),
    ]
    actual_layouts = []
    for caption in parsed.get_captions('en-US'):
        actual_layouts.append(caption.layout_info.serialized())
    self.assertEqual(expected_layouts, actual_layouts)
def test_individual_texts_of_captions_with_matching_timespec_are_kept(
        self):
    # The first node of each 'en-US' caption must carry its own text.
    parsed = DFXPReader().read(
        SAMPLE_DFXP_MULTIPLE_CAPTIONS_WITH_THE_SAME_TIMING)
    expected_texts = [
        u'Some text here',
        u'Some text there',
        u'Caption texts are everywhere!',
    ]
    actual_texts = []
    for caption in parsed.get_captions("en-US"):
        actual_texts.append(caption.nodes[0].content)
    self.assertEqual(expected_texts, actual_texts)
def test_individual_layouts_of_captions_with_matching_timespec_are_kept(
        self):
    # Each caption's serialized layout must match its own expected tuple,
    # even though the captions share a timespec.
    parsed = DFXPReader().read(
        SAMPLE_DFXP_MULTIPLE_CAPTIONS_WITH_THE_SAME_TIMING)
    percent = UnitEnum.PERCENT
    alignment = (
        HorizontalAlignmentEnum.CENTER, VerticalAlignmentEnum.BOTTOM)
    expected_layouts = [
        (((10, percent), (10, percent)), None, None, alignment),
        (((40, percent), (40, percent)), None, None, alignment),
        (((10, percent), (70, percent)), None, None, alignment),
    ]
    actual_layouts = []
    for caption in parsed.get_captions('en-US'):
        actual_layouts.append(caption.layout_info.serialized())
    self.assertEqual(expected_layouts, actual_layouts)
def test_individual_texts_of_captions_with_matching_timespec_are_kept(
        self):
    # The first node of each 'en-US' caption must carry its own text.
    parsed = DFXPReader().read(
        SAMPLE_DFXP_MULTIPLE_CAPTIONS_WITH_THE_SAME_TIMING)
    expected_texts = [
        'Some text here',
        'Some text there',
        'Caption texts are everywhere!',
    ]
    actual_texts = []
    for caption in parsed.get_captions("en-US"):
        actual_texts.append(caption.nodes[0].content)
    self.assertEqual(expected_texts, actual_texts)
def test_offset_time(self):
    # Offset-time strings and the integers _translate_time must return
    # for them (e.g. "1s" -> 1000000).
    reader = DFXPReader()
    conversions = [
        (1, "0.001ms"),
        (2000, "2ms"),
        (1000000, "1s"),
        (1234567, "1.234567s"),
        (180000000, "3m"),
        (14400000000, "4h"),
    ]
    for expected, offset in conversions:
        assert expected == reader._translate_time(offset)

    # Tick values are not supported
    with pytest.raises(InvalidInputError):
        reader._translate_time("2.3t")
def test_individual_texts_of_captions_with_matching_timespec_are_kept(
        self, sample_dfxp_multiple_captions_with_the_same_timing):
    # The first node of each 'en-US' caption must carry its own text.
    parsed = DFXPReader().read(
        sample_dfxp_multiple_captions_with_the_same_timing)
    expected_texts = [
        'Some text here',
        'Some text there',
        'Caption texts are everywhere!',
    ]
    actual_texts = []
    for caption in parsed.get_captions("en-US"):
        actual_texts.append(caption.nodes[0].content)
    assert expected_texts == actual_texts
def test_dfxp_to_webvtt_preserves_proper_alignment(self):
    # This failed at one point when the CaptionSet had node breaks with
    # different positioning. It was fixed both at the DFXPReader AND the
    # WebVTTWriter.
    #
    # The WebVTT output written from the conflicting-alignment fixture must
    # equal the stored expected document.
    caption_set = DFXPReader().read(DFXP_STYLE_REGION_ALIGN_CONFLICT)
    results = WebVTTWriter().write(caption_set)
    self.assertEqual(WEBVTT_FROM_DFXP_WITH_CONFLICTING_ALIGN, results)
def test_break_node_positioning_is_ignored(
        self, webvtt_from_dfxp_with_conflicting_align,
        dfxp_style_region_align_conflict):
    # The WebVTT written from the conflicting-alignment fixture must equal
    # the expected WebVTT fixture.
    parsed = DFXPReader().read(dfxp_style_region_align_conflict)
    webvtt_output = WebVTTWriter().write(parsed)
    assert webvtt_from_dfxp_with_conflicting_align == webvtt_output
def test_dfxp_with_positioning_to_webvtt_conversion(self):
    # Convert the positioned DFXP sample to WebVTT using explicit video
    # dimensions; the output must be a str and match the expected sample.
    caption_set = DFXPReader().read(SAMPLE_DFXP_WITH_POSITIONING)
    results = WebVTTWriter(
        video_width=VIDEO_WIDTH, video_height=VIDEO_HEIGHT
    ).write(caption_set)
    # assertIsInstance reports the actual type when the check fails.
    self.assertIsInstance(results, str)
    self.assertWebVTTEquals(
        SAMPLE_WEBVTT_FROM_DFXP_WITH_POSITIONING_AND_STYLE, results)
def test_default_region_p_tags(self):
    # Every <p> element in the written DFXP must carry the default region
    # id in its "region" attribute.
    caption_set = DFXPReader().read(SAMPLE_DFXP)
    result = DFXPWriter().write(caption_set)

    soup = BeautifulSoup(result, u'xml')
    for p in soup.find_all(u'p'):
        self.assertEqual(p.attrs.get(u'region'), DFXP_DEFAULT_REGION_ID)
def test_default_styling_p_tags(self):
    # Every <p> element in the written DFXP must have style="p".
    caption_set = DFXPReader().read(SAMPLE_DFXP)
    result = DFXPWriter().write(caption_set)

    soup = BeautifulSoup(result, u'xml')
    for p in soup.find_all(u'p'):
        self.assertEqual(p.attrs.get(u'style'), 'p')
def test_legacy_convert(self):
    # Read with read_invalid_positioning enabled, write with the legacy
    # writer, and compare against the stored expected output.
    reader = DFXPReader(read_invalid_positioning=True)
    parsed = reader.read(SAMPLE_DFXP_FOR_LEGACY_WRITER_INPUT)
    converted = LegacyDFXPWriter().write(parsed)
    self.assertEqual(converted, SAMPLE_DFXP_FOR_LEGACY_WRITER_OUTPUT)
def load_subtitles(self, video_id, langs=('ru',)):
    """Download automatic subtitles as TTML and convert them to WebVTT.

    For every language in ``langs`` for which ``subs_exists`` returns a
    falsy value, youtube-dl is asked for the automatic subtitles of the
    video in TTML format (the media itself is not downloaded). If the
    resulting ``<video_id>.<lang>.ttml`` file exists in the video's
    directory, it is converted and written next to it as
    ``<video_id>.<lang>.vtt``.

    Args:
        video_id: YouTube video identifier used to build the watch URL and
            the subtitle file names.
        langs: iterable of subtitle language codes to fetch.
    """
    for lang in langs:
        if subs_exists(video_id, lang):
            continue

        print('Loading {} subtitles for {}'.format(lang, video_id))
        video_dir = get_dir(video_id)
        opts = {
            'writeautomaticsub': True,
            # Request only the language handled by this iteration; passing
            # the whole ``langs`` collection would re-request every
            # language on each pass through the loop.
            'subtitleslangs': [lang],
            'subtitlesformat': 'ttml',
            'nooverwrites': True,
            'skip_download': True,
            'outtmpl': join(video_dir, video_id + '.ttml'),
        }
        with youtube_dl.YoutubeDL(opts) as ytdl:
            ytdl.download(
                ['https://www.youtube.com/watch?v={}'.format(video_id)])

        # WebVTT captions from YouTube contain duplicate phrases with
        # overlapping time segments, which is inconvenient to work with.
        # That is why subtitles are first downloaded in TTML format and
        # then converted to WebVTT.
        subs_path_ttml = join(video_dir, video_id + '.' + lang + '.ttml')
        subs_path_vtt = join(video_dir, video_id + '.' + lang + '.vtt')
        if exists(subs_path_ttml):
            print('converting subtitles')
            with open(subs_path_ttml, encoding='utf-8') as f:
                subs = DFXPReader().read(f.read())
            with open(subs_path_vtt, 'w', encoding='utf-8') as f:
                f.write(WebVTTWriter().write(subs))
def test_is_relativized(self):
    # Absolute positioning settings (e.g. px) are converted to percentages
    # when the writer is given the video dimensions.
    decoded_sample = SAMPLE_DFXP_WITH_POSITIONING.decode('utf-8')
    parsed = DFXPReader().read(decoded_sample)
    writer = DFXPWriter(video_width=VIDEO_WIDTH, video_height=VIDEO_HEIGHT)
    relativized = writer.write(parsed)
    self.assertEqual(relativized, SAMPLE_DFXP_WITH_RELATIVIZED_POSITIONING)
def test_dfxp_to_webvtt_conversion(
        self, sample_webvtt_from_dfxp, sample_dfxp):
    # The converted output must be a str and match the expected WebVTT.
    parsed = DFXPReader().read(sample_dfxp)
    webvtt_output = WebVTTWriter().write(parsed)
    assert isinstance(webvtt_output, str)
    self.assert_webvtt_equals(sample_webvtt_from_dfxp, webvtt_output)
def test_dfxp_to_webvtt_adds_explicit_size(
        self, sample_webvtt_output_long_cue, sample_dfxp_long_cue):
    # The long-cue DFXP fixture must convert to exactly the expected WebVTT.
    parsed = DFXPReader().read(sample_dfxp_long_cue)
    webvtt_output = WebVTTWriter().write(parsed)
    assert isinstance(webvtt_output, str)
    assert sample_webvtt_output_long_cue == webvtt_output
def test_fit_to_screen(self):
    # Check whether caption width and height are explicitly set and
    # recalculate them if necessary. This prevents long captions from
    # being cut out of the screen.
    parsed = DFXPReader().read(SAMPLE_DFXP_LONG_CUE)
    written = DFXPWriter().write(parsed)
    self.assertEqual(written, SAMPLE_DFXP_LONG_CUE_FIT_TO_SCREEN)
def test_caption_error_for_invalid_positioning_values(
        self, sample_dfxp_invalid_positioning_value_template):
    # Fill the template's ``origin`` placeholder with a malformed value;
    # reading the result must raise CaptionReadSyntaxError.
    template = sample_dfxp_invalid_positioning_value_template
    broken_document = template.format(origin="px 5px")
    with pytest.raises(CaptionReadSyntaxError):
        DFXPReader().read(broken_document)
def test_offset_time(self):
    # Offset-time strings and the integers _translate_time must return
    # for them (e.g. "1s" -> 1000000).
    reader = DFXPReader()
    conversions = [
        (1, "0.001ms"),
        (2000, "2ms"),
        (1000000, "1s"),
        (1234567, "1.234567s"),
        (180000000, "3m"),
        (14400000000, "4h"),
    ]
    for expected, offset in conversions:
        self.assertEqual(expected, reader._translate_time(offset))

    # Tick values are not supported
    with self.assertRaises(InvalidInputError):
        reader._translate_time("2.3t")
def test_caption_length(self):
    # The sample document must produce 7 'en-US' captions.
    captions = DFXPReader().read(SAMPLE_DFXP)

    self.assertEqual(7, len(captions.get_captions(u"en-US")))
def test_empty_cue(self):
    # The second 'en-US' entry from the empty-cue sample must compare equal
    # to an empty list.
    caption_set = DFXPReader().read(SAMPLE_DFXP_EMPTY_CUE)
    caps = caption_set.get_captions('en-US')
    self.assertEqual(caps[1], [])
def test_caption_length(self):
    # The decoded sample document must produce 8 'en-US' captions.
    captions = DFXPReader().read(SAMPLE_DFXP.decode(u'utf-8'))

    self.assertEqual(8, len(captions.get_captions(u"en-US")))
def test_proper_pcc_format(self):
    # The reader's result must expose exactly the keys "captions" and
    # "styles", with 7 entries under captions["en-US"].
    captions = DFXPReader().read(SAMPLE_DFXP)

    self.assertEqual({"captions", "styles"}, set(captions.keys()))
    self.assertEqual(7, len(captions["captions"]["en-US"]))
def test_proper_timestamps(self):
    # The third 'en-US' caption must have the expected start and end values.
    captions = DFXPReader().read(SAMPLE_DFXP.decode(u'utf-8'))
    paragraph = captions.get_captions(u"en-US")[2]

    self.assertEqual(17000000, paragraph.start)
    self.assertEqual(18752000, paragraph.end)
def test_invalid_markup_is_properly_handled(self):
    # Reading the syntax-error sample must still yield 2 'en-US' captions.
    captions = DFXPReader().read(SAMPLE_DFXP_SYNTAX_ERROR.decode(u'utf-8'))

    self.assertEqual(2, len(captions.get_captions(u"en-US")))