def test_linebreaks(self):
    # Generate TXT for a set whose first subtitle carries raw <br /> markup
    # (hence escape=False) and compare it with the TXT_LINEBREAKS fixture.
    subtitle_set = SubtitleSet('en')
    subtitle_set.append_subtitle(
        0, 1000, 'line 1<br />line 2<br />line 3', escape=False)
    subtitle_set.append_subtitle(1000, 200, 'second sub')
    rendered = unicode(TXTGenerator(subtitle_set))
    self.assertEqual(rendered, TXT_LINEBREAKS)
def setUp(self):
    # Ten back-to-back one-second English subtitles. Each text carries
    # *italics*, **bold** and ">>" sequences.
    self.subs = SubtitleSet(language_code='en')
    for index in range(0, 10):
        start_ms = index * 1000
        self.subs.append_subtitle(
            from_ms=start_ms,
            to_ms=start_ms + 1000,
            content="%s - and *italics* and **bold** and >>." % index)
class JSONParser(BaseTextParser):
    """Parser for subtitles serialized as a JSON list of objects.

    Each object is read through its ``position``, ``start``, ``end`` and
    ``text`` keys; entries are ordered by ``position`` before being added.
    """

    file_type = 'json'

    def __init__(self, input_string, pattern, language=None, flags=[],
                 eager_parse=True):
        """Store the raw input and initialize the base parser.

        ``flags`` is accepted for signature compatibility but is not
        forwarded: the base class always receives an empty flag list.
        The default list is never mutated.
        """
        self.input_string = input_string
        self.pattern = pattern
        self.language = language
        super(JSONParser, self).__init__(input_string, pattern,
                                         language=language, flags=[],
                                         eager_parse=eager_parse)

    def to_internal(self):
        """Parse the JSON input once and return the resulting SubtitleSet.

        Raises:
            SubtitleParserError: if the input is not valid JSON.
        """
        if not hasattr(self, 'sub_set'):
            try:
                data = json.loads(self.input_string)
            except ValueError:
                raise SubtitleParserError("Invalid JSON data provided.")
            # Build into a local and cache it only on success, so a failed
            # parse never leaves an empty or partial set behind that a later
            # call would silently return.
            sub_set = SubtitleSet(self.language)
            # Sort by the ``position`` key
            for sub in sorted(data, key=lambda k: k['position']):
                sub_set.append_subtitle(sub['start'], sub['end'], sub['text'])
            self.sub_set = sub_set
        return self.sub_set
def test_replace_multiple_lines_with_single(self):
    set_1 = SubtitleSet.from_list('en', [
        (0, 1000, "Hey 1"),
        (1000, 2000, "Hey 2"),
        (2000, 3000, "Hey 3"),
        (3000, 4000, "Hey 4"),
    ])
    set_2 = SubtitleSet.from_list('en', [
        (0, 1000, "Hey 1"),
        (1000, 3000, "Hey 2 and 3"),
        (3000, 4000, "Hey 4"),
    ])
    result = diff(set_1, set_2)
    self.assertEqual(result['changed'], True)
    # for both time_change and text_changed, we calculate them as follows:
    # there are 7 total subs. 4 of those are matches and 2 in set_1 were
    # replaced with 1 in set_2. So the change amount is 3/7.
    self.assertAlmostEqual(result['time_changed'], 3/7.0)
    self.assertAlmostEqual(result['text_changed'], 3/7.0)
    self.assertEqual(len(result['subtitle_data']), 4)
    # check the lines that haven't changed
    self.check_unchanged_subtitle_data(result, set_1, set_2, 0, 3)
    # Rows 1 and 2 hold the two set_1 subs that were replaced: the first is
    # paired with the merged set_2 sub, the second with an empty line.
    line1 = result['subtitle_data'][1]
    self.assertEqual(line1['time_changed'], True)
    self.assertEqual(line1['text_changed'], True)
    self.assertEqual(line1['subtitles'][0], set_1[1])
    self.assertEqual(line1['subtitles'][1], set_2[1])
    line2 = result['subtitle_data'][2]
    self.assertEqual(line2['time_changed'], True)
    self.assertEqual(line2['text_changed'], True)
    self.assertEqual(line2['subtitles'][0], set_1[2])
    self.assertEqual(line2['subtitles'][1], self.empty_line())
def test_simple_replace(self):
    set_1 = SubtitleSet.from_list('en', [
        (0, 1000, "Hey 1"),
        (1000, 2000, "Hey 2"),
        (2000, 3000, "Hey 3"),
        (3000, 4000, "Hey 4"),
    ])
    set_2 = SubtitleSet.from_list('en', [
        (0, 1000, "Hey 1"),
        (1000, 2000, "Hey New 2"),
        (2000, 3000, "Hey 3"),
        (3000, 4000, "Hey 4"),
    ])
    result = diff(set_1, set_2)
    self.assertEqual(result['changed'], True)
    self.assertAlmostEqual(result['time_changed'], 0)
    # for text_changed, we calculate as follows: there are 8 total subs.
    # 6 of those are matches and 1 is different in both sets. So 2/8.0
    # has been changed.
    self.assertAlmostEqual(result['text_changed'], 2/8.0)
    self.assertEqual(len(result['subtitle_data']), 4)
    # check the lines that haven't changed
    self.check_unchanged_subtitle_data(result, set_1, set_2, 0, 2, 3)
    # check the line whose text was replaced (timing is untouched)
    replaced_sub_data = result['subtitle_data'][1]
    self.assertEqual(replaced_sub_data['time_changed'], False)
    self.assertEqual(replaced_sub_data['text_changed'], True)
    self.assertEqual(replaced_sub_data['subtitles'][0], set_1[1])
    self.assertEqual(replaced_sub_data['subtitles'][1], set_2[1])
def to_internal(self):
    """Parse the YouTube XML input once and return the SubtitleSet.

    Every child of the root element becomes one subtitle. ``start`` and
    ``dur`` are read as seconds and converted to milliseconds. When ``dur``
    is missing the subtitle lasts until the next one starts, and the last
    subtitle falls back to three seconds.

    Raises:
        SubtitleParserError: wraps any error raised while parsing,
            including the "No subs" error for a document without children.
    """
    if not hasattr(self, 'sub_set'):
        try:
            # Build into a local and cache it only on success, so a failed
            # parse never leaves a partial set for later calls to return.
            sub_set = SubtitleSet(self.language)
            xml = etree.fromstring(self.input_string.encode('utf-8'))
            total_items = len(xml)
            for i, item in enumerate(xml):
                start = int(float(item.get('start')) * 1000)
                # ``dur`` is an XML attribute, not a Python attribute, so it
                # has to be looked up with get(); hasattr() never saw it.
                dur = item.get('dur')
                if dur is not None:
                    duration = int(float(dur) * 1000)
                elif i + 1 < total_items:
                    # youtube sometimes omits the duration attribute
                    # in this case we're displaying until the next sub
                    # starts
                    next_item = xml[i + 1]
                    duration = int(float(next_item.get('start')) * 1000) - start
                else:
                    # hardcode the last sub duration at 3 seconds
                    duration = 3000
                end = start + duration
                text = item.text and unescape_html(item.text) or u''
                sub_set.append_subtitle(start, end, text)
            if not total_items:
                raise ValueError("No subs")
        except Exception as e:
            raise SubtitleParserError(original_error=e)
        self.sub_set = sub_set
    return self.sub_set
def merge_subtitles(cls, subtitle_sets, initial_ttml=None):
    """Merge several subtitle sets into one serialized TTML document.

    A new ``div`` tagged with the set's ``xml:lang`` is appended to the body
    for every subtitle set, and that set's own body divs are moved into it.
    The document-level ``xml:lang`` is blanked.

    Args:
        subtitle_sets: sized collection of subtitle sets to merge.
        initial_ttml: optional TTML root element to append to. When omitted,
            an empty SubtitleSet's tree is used with its default div removed.

    Raises:
        TypeError: if ``subtitle_sets`` is empty.
        ValueError: if the document has no ``body`` element.
    """
    if len(subtitle_sets) == 0:
        raise TypeError("DFXPGenerator.merge_subtitles: No subtitles given")

    if initial_ttml is not None:
        tt = initial_ttml
        body = tt.find(TTML + 'body')
    else:
        tt = SubtitleSet('').as_etree_node()
        body = tt.find(TTML + 'body')
        body.remove(body.find(TTML + 'div'))
    if body is None:
        raise ValueError("no body tag")

    # The per-language divs carry the language, so the root is left blank.
    tt.set(XML + 'lang', '')
    for subtitle_set in subtitle_sets:
        source_root = subtitle_set.as_etree_node()
        source_lang = source_root.get(XML + 'lang')
        wrapper = etree.SubElement(body, TTML + 'div')
        wrapper.set(XML + 'lang', source_lang)
        source_divs = source_root.find(TTML + 'body').findall(TTML + 'div')
        wrapper.extend(source_divs)

    utils.indent_ttml(tt)
    return etree.tostring(tt)
def test_create_translation_dependent_on_dependent(self):
    # Start a French translation based on the session's language, save one
    # subtitle, then fetch it back through the widget's drop-down entry.
    test_utils.invalidate_widget_video_cache.run_original_for_test()
    request = RequestMockup(self.user_0)
    session = create_two_sub_dependent_session(request)

    editing = rpc.start_editing(
        request, session.video.video_id, 'fr',
        base_language_code=session.language.language_code)
    session_pk = editing['session_pk']
    original = SubtitleSet('en', editing['original_subtitles']['subtitles'])
    self.assertEqual(3, len(original))

    rpc.finished_subtitles(request, session_pk,
                           create_subtitle_set().to_xml())

    widget = rpc.show_widget(request, VIDEO_URL, False)
    french_entries = [entry for entry in widget['drop_down_contents']
                      if entry['language'] == 'fr']
    french = french_entries[0]
    fetched = rpc.fetch_subtitles(request, session.video.video_id,
                                  french['pk'])
    french_subs = SubtitleSet('fr', fetched['subtitles'])

    self.assertEqual(1, len(french_subs))
    first = french_subs[0]
    self.assertEqual('hey you 0', first.text)
    self.assertEqual(0, first.start_time)
    self.assertEqual(1000, first.end_time)
def test_insert(self):
    set_1 = SubtitleSet.from_list('en', [
        (0, 1000, "Hey 1"),
        (1000, 2000, "Hey 2"),
        (2000, 3000, "Hey 3"),
        (3000, 4000, "Hey 4"),
    ])
    set_2 = SubtitleSet.from_list('en', [
        (0, 1000, "Hey 1"),
        (500, 800, "Hey 1.5"),
        (1000, 2000, "Hey 2"),
        (2000, 3000, "Hey 3"),
        (3000, 4000, "Hey 4"),
    ])
    result = diff(set_1, set_2)
    self.assertEqual(result['changed'], True)
    # for both time_change and text_changed, we calculate them as follows:
    # there are 9 total subs. 8 of those are matches and 1 is new in
    # set_2. So the change amount is 1/9
    self.assertAlmostEqual(result['time_changed'], 1/9.0)
    self.assertAlmostEqual(result['text_changed'], 1/9.0)
    self.assertEqual(len(result['subtitle_data']), 5)
    # check the lines that haven't changed
    self.check_unchanged_subtitle_data(result, set_1, set_2, 0, 2, 3, 4)
    # check the line that was inserted: set_1 contributes an empty line
    insert_sub_data = result['subtitle_data'][1]
    self.assertEqual(insert_sub_data['time_changed'], True)
    self.assertEqual(insert_sub_data['text_changed'], True)
    self.assertEqual(insert_sub_data['subtitles'][0], self.empty_line())
    self.assertEqual(insert_sub_data['subtitles'][1], set_2[1])
def forwards(self, orm):
    """Give versions with blank ``serialized_subtitles`` an empty set.

    Each affected SubtitleVersion gets the compressed XML of an empty
    SubtitleSet in its own language and a ``subtitle_count`` of 0.
    """
    blank_versions = orm['subtitles.SubtitleVersion'].objects.filter(
        serialized_subtitles='')
    for version in chunkediter(blank_versions):
        empty_set = SubtitleSet(version.language_code)
        version.serialized_subtitles = compress(empty_set.to_xml())
        version.subtitle_count = 0
        version.save()
def test_add_subtitles_with_complete_true_but_unsynced_subs(self):
    # Corner case: subtitles are added with complete=True although the
    # subtitles themselves have no timing. subtitles_published must not
    # be emitted.
    unsynced = SubtitleSet(language_code='en')
    unsynced.append_subtitle(None, None, 'content')
    pipeline.add_subtitles(self.video, 'en', unsynced, complete=True)
    assert_equal(self.subtitles_published_handler.call_count, 0)
def create_subtitle_set(number_of_subtitles=0, synced=True):
    """Return an English SubtitleSet filled with 'hey you N' subtitles.

    Note that ``number_of_subtitles + 1`` subtitles are created, so the
    default of 0 yields one subtitle. Synced subtitles are consecutive
    one-second cues; unsynced ones have ``None`` for both times.
    """
    subtitle_set = SubtitleSet('en')
    for index in xrange(0, number_of_subtitles + 1):
        if synced:
            start = index * 1000
            end = index * 1000 + 1000
        else:
            start = end = None
        subtitle_set.append_subtitle(start, end, 'hey you %s' % index)
    return subtitle_set
def setUp(self):
    # One subtitle set per language. The Spanish set gets a second
    # subtitle that opens a new paragraph.
    self.en_subs = SubtitleSet('en')
    self.en_subs.append_subtitle(1000, 1500, 'content')

    self.es_subs = SubtitleSet('es')
    self.es_subs.append_subtitle(1000, 1500, 'spanish content')
    self.es_subs.append_subtitle(2000, 2500, 'spanish content 2',
                                 new_paragraph=True)

    self.fr_subs = SubtitleSet('fr')
    self.fr_subs.append_subtitle(1000, 1500, 'french content')
def test_data_ordering(self):
    # diff() must keep its argument order inside each result row: index 0
    # is the set_1 side (no text for the third row), index 1 is set_2.
    shorter = SubtitleSet.from_list("en", [(0, 1000, "Hey 1")])
    longer_rows = [
        (0, 1000, "Hey 1"),
        (1200, 2000, "Hey 2"),
        (2000, 3000, "Hey 3"),
    ]
    longer = SubtitleSet.from_list("en", longer_rows)
    third_row = diff(shorter, longer)["subtitle_data"][2]["subtitles"]
    from_first, from_second = third_row[0], third_row[1]
    self.assertEqual(from_first.text, None)
    self.assertEqual(from_second.text, "Hey 3")
def test_calc_changes(self):
    # Only the text of the second subtitle differs: timing is unchanged and
    # the text change amount is 2/8.
    before_rows = [
        (0, 1000, "Hey 1"),
        (1000, 2000, "Hey 2"),
        (2000, 3000, "Hey 3"),
        (3000, 4000, "Hey 4"),
    ]
    after_rows = [
        (0, 1000, "Hey 1"),
        (1000, 2000, "Hey New 2"),
        (2000, 3000, "Hey 3"),
        (3000, 4000, "Hey 4"),
    ]
    before = SubtitleSet.from_list("en", before_rows)
    after = SubtitleSet.from_list("en", after_rows)
    text_changed, time_changed = calc_changes(before, after)
    self.assertAlmostEqual(time_changed, 0)
    self.assertAlmostEqual(text_changed, 2 / 8.0)
def test_unsynced_reflect_time_changes(self):
    # Adding one unsynced subtitle to a one-subtitle set is expected to
    # produce a time change amount of 1/3.
    synced_only = SubtitleSet.from_list('en', [(0, 1000, "Hey 1")])
    with_unsynced = SubtitleSet.from_list('en', [
        (0, 1000, "Hey 1"),
        (None, None, "Hey 2"),
    ])
    outcome = diff(synced_only, with_unsynced)
    self.assertAlmostEqual(outcome['time_changed'], 1/3.0)
def test_start_translating(self):
    test_utils.invalidate_widget_video_cache.run_original_for_test()
    request = RequestMockup(self.user_0)
    session = self._create_basic_version(request)
    sl_en = session.language
    # open translation dialog.
    response = rpc.start_editing(request, session.video.video_id, 'es',
                                 base_language_code=sl_en.language_code)
    session_pk = response['session_pk']
    subs = response['subtitles']
    self.assertEqual(True, response['can_edit'])
    self.assertEqual(0, subs['version'])
    self.assertEqual(0, len(SubtitleSet('es', subs['subtitles'])))
    rpc.finished_subtitles(request, session_pk,
                           create_subtitle_set().to_xml())
    video = models.Video.objects.get(id=session.video.id)
    translations = rpc.fetch_subtitles(request, video.video_id,
                                       video.subtitle_language('es').pk)
    subtitles = SubtitleSet('es', translations['subtitles'])
    self.assertEqual(1, len(subtitles))
    self.assertEqual('hey you 0', subtitles[0][2])
    language = video.subtitle_language('es')
    self.assertEqual(1, language.subtitleversion_set.full().count())
    self.assertEqual(language.get_translation_source_language_code(), 'en')
    version = language.get_tip()
    self.assertTrue('en' in version.get_lineage())

    # second editing round, saving three subtitles this time.
    # NOTE(review): the session_pk from the first start_editing call is
    # reused here rather than the one in the new response -- confirm this
    # is intentional.
    response = rpc.start_editing(request, session.video.video_id, 'es',
                                 base_language_code=sl_en.language_code)
    rpc.finished_subtitles(request, session_pk,
                           create_subtitle_set(2).to_xml())
    translations = rpc.fetch_subtitles(request, video.video_id,
                                       video.subtitle_language('es').pk)
    subtitles = SubtitleSet('es', translations['subtitles'])
    self.assertEqual(3, len(subtitles))
    self.assertEqual('hey you 0', subtitles[0][2])
    self.assertEqual('hey you 1', subtitles[1][2])
    self.assertEqual('hey you 2', subtitles[2][2])
    language = video.subtitle_language('es')
    self.assertEqual(2, language.subtitleversion_set.full().count())
    self.assertEqual(language.get_translation_source_language_code(), 'en')
    version = language.get_tip()
    self.assertTrue('en' in version.get_lineage())
def test_one_set_empty(self):
    # Diffing four subtitles against an empty set counts as a full change
    # for both text and timing.
    populated_rows = [
        (0, 1000, "Hey 1"),
        (1000, 2000, "Hey 2"),
        (2000, 3000, "Hey 3"),
        (3000, 4000, "Hey 4"),
    ]
    populated = SubtitleSet.from_list('en', populated_rows)
    empty = SubtitleSet('en')
    outcome = diff(populated, empty)
    self.assertEqual(outcome['changed'], True)
    self.assertEqual(outcome['text_changed'], 1.0)
    self.assertEqual(outcome['time_changed'], 1.0)
class YoutubeParser(BaseTextParser):
    """Parser for YouTube's XML subtitle format.

    Unlike the regex based parsers, this class does not compile a pattern
    and does not parse eagerly: parsing happens on the first call to
    ``to_internal`` or on first iteration.
    """

    file_type = 'youtube'

    def __init__(self, input_string, language_code):
        """Store the XML text and language. No parsing is done here."""
        self.language_code = language_code
        self._pattern = None
        self.input_string = input_string
        self.language = language_code

    def __iter__(self):
        """Yield the parsed subtitles, parsing first if necessary."""
        if not hasattr(self, 'sub_set'):
            self.to_internal()
        for sub in self.sub_set:
            yield sub

    def to_internal(self):
        """Parse the XML input once and return the resulting SubtitleSet.

        Every child of the root element becomes one subtitle. ``start`` and
        ``dur`` are read as seconds and converted to milliseconds. When
        ``dur`` is missing the subtitle lasts until the next one starts,
        and the last subtitle falls back to three seconds.

        Raises:
            SubtitleParserError: wraps any error raised while parsing,
                including the "No subs" error for a document without
                children.
        """
        if not hasattr(self, 'sub_set'):
            try:
                # Build into a local and cache it only on success, so a
                # failed parse never leaves a partial set behind.
                sub_set = SubtitleSet(self.language)
                xml = etree.fromstring(self.input_string.encode('utf-8'))
                total_items = len(xml)
                for i, item in enumerate(xml):
                    start = int(float(item.get('start')) * 1000)
                    # ``dur`` is an XML attribute, not a Python attribute,
                    # so it has to be looked up with get(); hasattr() never
                    # saw it.
                    dur = item.get('dur')
                    if dur is not None:
                        duration = int(float(dur) * 1000)
                    elif i + 1 < total_items:
                        # youtube sometimes omits the duration attribute
                        # in this case we're displaying until the next sub
                        # starts
                        next_item = xml[i + 1]
                        duration = int(float(next_item.get('start')) * 1000) - start
                    else:
                        # hardcode the last sub duration at 3 seconds
                        duration = 3000
                    end = start + duration
                    text = item.text and unescape_html(item.text) or u''
                    sub_set.append_subtitle(start, end, text)
                if not total_items:
                    raise ValueError("No subs")
            except Exception as e:
                raise SubtitleParserError(original_error=e)
            self.sub_set = sub_set
        return self.sub_set
def to_internal(self):
    """Convert the text items into a SubtitleSet once and return it.

    Newlines inside an item become ``<br/>`` markup, which is why the text
    is appended with ``escape=False``.

    Raises:
        SubtitleParserError: if no item contains non-whitespace text.
    """
    if not hasattr(self, 'sub_set'):
        # Build into a local and cache it only on success, so a rejected
        # input is rejected again on the next call instead of returning
        # the leftover set.
        sub_set = SubtitleSet(self.language)
        valid = False
        for item in self._result_iter():
            item['text'] = item['text'].replace("\n", '<br/>')
            # A single item with non-whitespace text validates the input.
            if not valid and ''.join(item['text'].split()):
                valid = True
            sub_set.append_subtitle(item['start'], item['end'],
                                    item['text'], escape=False)
        if not valid:
            raise SubtitleParserError("No subs")
        self.sub_set = sub_set
    return self.sub_set
def test_text_changes(self):
    # Only the text of the second subtitle differs between the two sets.
    before_rows = [
        (0, 1000, "Hey 1"),
        (1000, 2000, "Hey 2"),
        (2000, 3000, "Hey 3"),
        (3000, 4000, "Hey 4"),
    ]
    after_rows = [
        (0, 1000, "Hey 1"),
        (1000, 2000, "Hey 22"),
        (2000, 3000, "Hey 3"),
        (3000, 4000, "Hey 4"),
    ]
    before = SubtitleSet.from_list("en", before_rows)
    after = SubtitleSet.from_list("en", after_rows)
    outcome = diff(before, after)
    self.assertEqual(outcome["changed"], True)
    self.assertEqual(outcome["text_changed"], 1 / 4.0)
    self.assertEqual(outcome["time_changed"], 0)
    rows = outcome["subtitle_data"]
    self.assertEqual(len(rows), 4)
    # only sub #2 should have text changed
    for position, row in enumerate(rows):
        expect_text_change = (position == 1)
        self.assertEqual(row["text_changed"], expect_text_change)
def _add_subtitles(sub_lang, num_subs, video, translated_from=None):
    """Add ``num_subs`` generated subtitles to ``video`` via the pipeline.

    Subtitle ``i`` starts at ``i * 1000`` ms, lasts 800 ms and reads
    ``'hey jude i'``. When ``translated_from`` is given, its tip version
    is passed as the only parent of the new version.

    Returns whatever ``pipeline.add_subtitles`` returns.
    """
    subtitle_set = SubtitleSet(sub_lang.language_code)
    for i in xrange(0, num_subs):
        start_time = i * 1000
        # The end is relative to the start. ``i + 800`` placed the end
        # before the start for every subtitle after the first.
        end_time = start_time + 800
        subtitle_text = 'hey jude %s' % i
        subtitle_set.append_subtitle(start_time, end_time, subtitle_text)
    parents = []
    if translated_from:
        parents.append(translated_from.get_tip())
    return pipeline.add_subtitles(video, sub_lang.language_code,
                                  subtitle_set, parents=parents)
def test_data_ordering(self):
    # diff() must keep its argument order inside each result row: index 0
    # is the set_1 side (no text for the third row), index 1 is set_2.
    shorter = SubtitleSet.from_list('en', [(0, 1000, "Hey 1")])
    longer_rows = [
        (0, 1000, "Hey 1"),
        (1200, 2000, "Hey 2"),
        (2000, 3000, "Hey 3"),
    ]
    longer = SubtitleSet.from_list('en', longer_rows)
    third_row = diff(shorter, longer)['subtitle_data'][2]['subtitles']
    from_first, from_second = third_row[0], third_row[1]
    self.assertEqual(from_first.text, None)
    self.assertEqual(from_second.text, "Hey 3")
def test_calc_changes(self):
    # Only the text of the second subtitle differs: timing is unchanged and
    # the text change amount is 2/8.
    before_rows = [
        (0, 1000, "Hey 1"),
        (1000, 2000, "Hey 2"),
        (2000, 3000, "Hey 3"),
        (3000, 4000, "Hey 4"),
    ]
    after_rows = [
        (0, 1000, "Hey 1"),
        (1000, 2000, "Hey New 2"),
        (2000, 3000, "Hey 3"),
        (3000, 4000, "Hey 4"),
    ]
    before = SubtitleSet.from_list('en', before_rows)
    after = SubtitleSet.from_list('en', after_rows)
    text_changed, time_changed = calc_changes(before, after)
    self.assertAlmostEqual(time_changed, 0)
    self.assertAlmostEqual(text_changed, 2/8.0)
def test_log_in_then_save(self):
    # Start editing anonymously, log in, then save: the saved version must
    # be authored by the user who logged in.
    request_0 = RequestMockup(NotAuthenticatedUser())
    return_value = rpc.show_widget(request_0, VIDEO_URL, False)
    video_id = return_value['video_id']
    return_value = rpc.start_editing(
        request_0, video_id, 'en', original_language_code='en')
    session_pk = return_value['session_pk']
    sset = SubtitleSet('en')
    sset.append_subtitle(2300, 3400, 'hey')
    response = rpc.regain_lock(request_0, session_pk)
    self.assertEqual('ok', response['response'])
    request_0.user = self.user_0
    rpc.finished_subtitles(request_0, session_pk, sset.to_xml())
    sversion = sub_models.SubtitleVersion.objects.order_by('-pk')[0]
    # The original assigned 1 to subtitle_count, which checked nothing.
    # One subtitle was saved, so assert the count instead.
    self.assertEqual(1, sversion.subtitle_count)
    self.assertEqual(request_0.user.pk, sversion.author.pk)
def test_add_alternate_urls(self):
    # A video registered with an additional URL must resolve to the same
    # video_id when the widget is later opened through that other URL.
    test_utils.invalidate_widget_video_cache.run_original_for_test()
    url_0 = VIDEO_URL
    url_1 = 'http://ia700406.us.archive.org/16/items/PeopleOfHtml5-BruceLawsonmp4Version/PeopleOfHtml5-BruceLawson.mp4'
    request = RequestMockup(self.user_0)
    return_value = rpc.show_widget(request, url_0, False,
                                   additional_video_urls=[url_1])
    video_id = return_value['video_id']
    return_value = rpc.start_editing(request, video_id, 'en',
                                     original_language_code='en')
    session_pk = return_value['session_pk']
    rpc.finished_subtitles(request, session_pk,
                           create_subtitle_set().to_xml())
    return_value = rpc.show_widget(request, url_1, False,
                                   additional_video_urls=[url_0])
    self.assertEqual(video_id, return_value['video_id'])
    subs = rpc.fetch_subtitles(request, video_id,
                               return_value['drop_down_contents'][0]['pk'])
    self.assertEqual(1, len(SubtitleSet('en', subs['subtitles'])))
    return_value = rpc.show_widget(request, url_1, False)
    self.assertEqual(video_id, return_value['video_id'])
def _stack_version(sv, nsl):
    """Create a new-style version on ``nsl`` from the old version ``sv``.

    The old version's subtitles are added through the pipeline, carrying
    over title, description, author, creation time and visibility. ``sv``
    is then linked to the new version and marked as no longer needing sync.
    Any failure is logged with ``log_subtitle_error`` and re-raised.
    """
    from apps.subtitles import pipeline

    visibility = get_visibility_from_old_version(sv)
    subtitle_rows = _get_subtitles(sv)
    try:
        subtitle_rows = list(subtitle_rows)
        # Hand the pipeline a ready-made SubtitleSet; otherwise it would
        # apply escaping to the raw data and break it.
        subtitle_set = SubtitleSet.from_list(nsl.language_code, subtitle_rows)
        version_options = dict(
            title=sv.title,
            description=sv.description,
            parents=[],
            visibility=visibility,
            author=sv.user,
            created=sv.datetime_started)
        new_version = pipeline.add_subtitles(
            nsl.video, nsl.language_code, subtitle_set, **version_options)
    except:
        # Deliberately catches everything: the error is logged and then
        # propagated unchanged.
        log_subtitle_error(sv, subtitle_rows)
        raise

    sv.new_subtitle_version = new_version
    sv.needs_sync = False
    sv.save(tern_sync=True)
    log('SubtitleVersion', 'stacked', sv.pk, new_version.pk)
def __init__(self, input_string, language=None):
    """Build the subtitle set directly from the XML input.

    The set is created with ``normalize_time=True``.

    Raises:
        SubtitleParserError: if the XML cannot be parsed. The original
            error is passed along as the second argument.
    """
    try:
        self.subtitle_set = SubtitleSet(language, input_string,
                                        normalize_time=True)
    # ``as`` works on Python 2.6+ and 3, and matches the other handlers.
    except (XMLSyntaxError, ExpatError) as e:
        raise SubtitleParserError(
            "There was an error while we were parsing your xml", e)
def to_internal(self):
    """Parse the JSON input once and return the resulting SubtitleSet.

    The input is decoded as a list of objects, ordered by their
    ``position`` key, and each one is appended using its ``start``,
    ``end`` and ``text`` values.

    Raises:
        SubtitleParserError: if the input is not valid JSON.
    """
    if not hasattr(self, 'sub_set'):
        try:
            data = json.loads(self.input_string)
        except ValueError:
            raise SubtitleParserError("Invalid JSON data provided.")
        # Build into a local and cache it only on success, so a failed
        # parse never leaves an empty or partial set behind that a later
        # call would silently return.
        sub_set = SubtitleSet(self.language)
        # Sort by the ``position`` key
        for sub in sorted(data, key=lambda k: k['position']):
            sub_set.append_subtitle(sub['start'], sub['end'], sub['text'])
        self.sub_set = sub_set
    return self.sub_set
def to_internal(self):
    """Parse the regex matches once and return the resulting SubtitleSet.

    Each match is converted with ``_get_data``, its text is passed through
    ``get_markup`` and appended unescaped, together with the optional
    ``region`` of the item.

    Raises:
        SubtitleParserError: wraps any error raised while parsing,
            including the "No subs found" error when nothing matched.
    """
    if not hasattr(self, 'sub_set'):
        match = None
        try:
            # Build into a local and cache it only on success, so a failed
            # parse never leaves a partial set for later calls to return.
            sub_set = SubtitleSet(self.language)
            for match in self._matches:
                item = self._get_data(match.groupdict())
                text = self.get_markup(item['text'])
                sub_set.append_subtitle(
                    item['start'], item['end'], text,
                    region=item.get('region'), escape=False)
            # ``match`` is still None when the loop body never ran.
            if match is None:
                raise ValueError("No subs found")
        except Exception as e:
            raise SubtitleParserError(original_error=e)
        self.sub_set = sub_set
    return self.sub_set
def test_one_set_empty(self):
    # Diffing four subtitles against an empty set counts as a full change
    # for both text and timing.
    populated_rows = [
        (0, 1000, "Hey 1"),
        (1000, 2000, "Hey 2"),
        (2000, 3000, "Hey 3"),
        (3000, 4000, "Hey 4"),
    ]
    populated = SubtitleSet.from_list("en", populated_rows)
    empty = SubtitleSet("en")
    outcome = diff(populated, empty)
    self.assertEqual(outcome["changed"], True)
    self.assertEqual(outcome["text_changed"], 1.0)
    self.assertEqual(outcome["time_changed"], 1.0)
def test_fetch_subtitles(self):
    # Subtitles fetched for a freshly created basic version must load into
    # a SubtitleSet holding exactly one subtitle.
    request = RequestMockup(self.user_0)
    version = self._create_basic_version(request)
    fetched = rpc.fetch_subtitles(
        request, version.video.video_id, version.language.pk)
    subtitle_set = SubtitleSet('en', initial_data=fetched['subtitles'])
    self.assertEqual(1, len(subtitle_set))
class TXTParser(BaseTextParser):
    """Parser for plain text where blank lines separate subtitles.

    The input is split on double line breaks. Every chunk becomes one
    unsynced subtitle, with both times set to ``None``.
    """

    file_type = 'txt'
    _linebreak_re = re.compile(r"\n\n|\r\n\r\n|\r\r")

    def __init__(self, input_string, language=None,
                 linebreak_re=_linebreak_re, eager_parse=True):
        """Split the input into chunks and optionally parse right away.

        The base class initializer is not called, so no pattern is
        compiled for this parser.
        """
        self.language = language
        self.input_string = linebreak_re.split(input_string)
        if eager_parse:
            self.to_internal()

    def __len__(self):
        """Return the number of text chunks."""
        return len(self.input_string)

    def __nonzero__(self):
        """Return True when there is at least one text chunk."""
        return bool(self.input_string)

    def _result_iter(self):
        """Yield one untimed item dict per chunk, with tags stripped."""
        for item in self.input_string:
            output = {}
            output['start'] = None
            output['end'] = None
            output['text'] = utils.strip_tags(item)
            yield output

    def to_internal(self):
        """Convert the chunks into a SubtitleSet once and return it.

        Newlines inside a chunk are replaced with ``<br/>``.

        Raises:
            SubtitleParserError: if no chunk has non-whitespace text.
        """
        if not hasattr(self, 'sub_set'):
            # Build into a local and cache it only on success, so a
            # rejected input is rejected again on the next call instead of
            # returning the leftover set.
            sub_set = SubtitleSet(self.language)
            valid = False
            for item in self._result_iter():
                item['text'] = item['text'].replace("\n", '<br/>')
                if not valid and ''.join(item['text'].split()):
                    valid = True
                sub_set.append_subtitle(item['start'], item['end'],
                                        item['text'])
            if not valid:
                raise SubtitleParserError("No subs")
            self.sub_set = sub_set
        return self.sub_set
def test_edit_existing_original(self):
    # Editing an existing original language returns its single subtitle
    # and no 'original_subtitles' entry.
    request = RequestMockup(self.user_0)
    session = self._create_basic_version(request)
    language = sub_models.SubtitleLanguage.objects.get(
        pk=session.language.pk)
    # The widget is opened first; its response is not inspected.
    rpc.show_widget(request, VIDEO_URL, False)
    return_value = rpc.start_editing(request, session.video.video_id, 'en',
                                     subtitle_language_pk=language.pk)
    self.assertEqual(
        len(SubtitleSet('en', return_value['subtitles']['subtitles'])), 1)
    self.assertFalse('original_subtitles' in return_value)
def test_unsynced_generator(self):
    # Round-trip five unsynced subtitles through SBV: the times must come
    # back as None and be written out as the 9:59:59.990 placeholder.
    source = SubtitleSet('en')
    for number in xrange(0, 5):
        source.append_subtitle(None, None, "%s" % number)
    sbv_text = unicode(SBVGenerator(source))
    internal = SBVParser(sbv_text, 'en').to_internal()
    items = list(internal.subtitle_items())
    self.assertEqual(len(internal), 5)
    for item in items:
        self.assertEqual(item[0], None)
        self.assertEqual(item[1], None)
    generated = SBVGenerator(internal)
    self.assertEqual(generated.format_time(None), u'9:59:59.990')
    expected = (
        u'9:59:59.990,9:59:59.990\r\n0\r\n\r\n'
        u'9:59:59.990,9:59:59.990\r\n1\r\n\r\n'
        u'9:59:59.990,9:59:59.990\r\n2\r\n\r\n'
        u'9:59:59.990,9:59:59.990\r\n3\r\n\r\n'
        u'9:59:59.990,9:59:59.990\r\n4\r\n'
    )
    self.assertIn(expected, unicode(generated))
def test_unsynced_generator(self):
    # Round-trip five unsynced subtitles through DFXP: times stay None,
    # texts are preserved, and the nodes have no begin/end attributes.
    source = SubtitleSet('en')
    for number in xrange(0, 5):
        source.append_subtitle(None, None, "%s" % number)
    dfxp_text = unicode(DFXPGenerator(source))
    internal = DFXPParser(dfxp_text, 'en').to_internal()
    items = list(internal.subtitle_items())
    self.assertEqual(len(internal), 5)
    for position, item in enumerate(items):
        self.assertIsNone(item[0])
        self.assertIsNone(item[1])
        self.assertEqual(item[2], str(position))
    for node in internal.get_subtitles():
        for attribute in ('begin', 'end'):
            self.assertIsNone(get_attr(node, attribute))
def test_unsynced_generator(self):
    # Round-trip five unsynced subtitles through SBV: the times must come
    # back as None and be written out as the 9:59:59.000 placeholder.
    source = SubtitleSet('en')
    for number in xrange(0, 5):
        source.append_subtitle(None, None, "%s" % number)
    sbv_text = unicode(SBVGenerator(source, language='en'))
    internal = SBVParser(sbv_text, 'en').to_internal()
    items = list(internal.subtitle_items())
    self.assertEqual(len(internal), 5)
    for item in items:
        self.assertEqual(item[0], None)
        self.assertEqual(item[1], None)
    generated = SBVGenerator(internal)
    self.assertEqual(generated.format_time(None), u'9:59:59.000')
    expected = (
        u'9:59:59.000,9:59:59.000\r\n0\r\n\r\n'
        u'9:59:59.000,9:59:59.000\r\n1\r\n\r\n'
        u'9:59:59.000,9:59:59.000\r\n2\r\n\r\n'
        u'9:59:59.000,9:59:59.000\r\n3\r\n\r\n'
        u'9:59:59.000,9:59:59.000\r\n4\r\n'
    )
    self.assertIn(expected, unicode(generated))
def test_unsynced_generator(self):
    # Round-trip five unsynced subtitles through SBV: the times must come
    # back as None and be written out as the 9:59:59.000 placeholder.
    source = SubtitleSet("en")
    for number in xrange(0, 5):
        source.append_subtitle(None, None, "%s" % number)
    sbv_text = unicode(SBVGenerator(source, language="en"))
    internal = SBVParser(sbv_text, "en").to_internal()
    items = list(internal.subtitle_items())
    self.assertEqual(len(internal), 5)
    for item in items:
        self.assertEqual(item[0], None)
        self.assertEqual(item[1], None)
    generated = SBVGenerator(internal)
    self.assertEqual(generated.format_time(None), u"9:59:59.000")
    expected = (
        u"9:59:59.000,9:59:59.000\r\n0\r\n\r\n"
        u"9:59:59.000,9:59:59.000\r\n1\r\n\r\n"
        u"9:59:59.000,9:59:59.000\r\n2\r\n\r\n"
        u"9:59:59.000,9:59:59.000\r\n3\r\n\r\n"
        u"9:59:59.000,9:59:59.000\r\n4\r\n"
    )
    self.assertIn(expected, unicode(generated))
class TXTParser(BaseTextParser):
    """Parser for plain text where blank lines separate subtitles.

    The input is split on double line breaks. Every chunk becomes one
    unsynced subtitle, with both times set to ``None``.
    """

    file_type = 'txt'
    _linebreak_re = re.compile(r"\n\n|\r\n\r\n|\r\r")

    def __init__(self, input_string, language=None,
                 linebreak_re=_linebreak_re, eager_parse=True):
        """Split the input into chunks and optionally parse right away.

        The base class initializer is not called, so no pattern is
        compiled for this parser.
        """
        self.language = language
        self.input_string = linebreak_re.split(input_string)
        if eager_parse:
            self.to_internal()

    def __len__(self):
        """Return the number of text chunks."""
        return len(self.input_string)

    def __nonzero__(self):
        """Return True when there is at least one text chunk."""
        return bool(self.input_string)

    def _result_iter(self):
        """Yield one untimed item dict per chunk, with tags stripped."""
        for item in self.input_string:
            output = {}
            output['start'] = None
            output['end'] = None
            output['text'] = utils.strip_tags(item)
            yield output

    def to_internal(self):
        """Convert the chunks into a SubtitleSet once and return it.

        Newlines inside a chunk become ``<br/>`` markup, which is why the
        text is appended with ``escape=False``.

        Raises:
            SubtitleParserError: if no chunk has non-whitespace text.
        """
        if not hasattr(self, 'sub_set'):
            # Build into a local and cache it only on success, so a
            # rejected input is rejected again on the next call instead of
            # returning the leftover set.
            sub_set = SubtitleSet(self.language)
            valid = False
            for item in self._result_iter():
                item['text'] = item['text'].replace("\n", '<br/>')
                if not valid and ''.join(item['text'].split()):
                    valid = True
                sub_set.append_subtitle(item['start'], item['end'],
                                        item['text'], escape=False)
            if not valid:
                raise SubtitleParserError("No subs")
            self.sub_set = sub_set
        return self.sub_set
def to_internal(self):
    """Convert the parsed items into a SubtitleSet once and return it.

    Raises:
        SubtitleParserError: if no item contains non-whitespace text.
    """
    if not hasattr(self, 'sub_set'):
        # Build into a local and cache it only on success, so a rejected
        # input is rejected again on the next call instead of returning
        # the leftover set.
        sub_set = SubtitleSet(self.language)
        valid = False
        for item in self._result_iter():
            # A single item with non-whitespace text validates the input.
            if not valid and ''.join(item['text'].split()):
                valid = True
            sub_set.append_subtitle(item['start'], item['end'], item['text'])
        if not valid:
            raise SubtitleParserError("No subs")
        self.sub_set = sub_set
    return self.sub_set
def test_unsynced_generator(self):
    # Round-trip five unsynced subtitles through SRT: the times must come
    # back as None and be written out as the 99:59:59,999 placeholder.
    source = SubtitleSet('en')
    for number in xrange(0, 5):
        source.append_subtitle(None, None, "%s" % number)
    srt_text = unicode(SRTGenerator(source))
    internal = SRTParser(srt_text, 'en').to_internal()
    items = list(internal.subtitle_items())
    self.assertEqual(len(internal), 5)
    for item in items:
        self.assertEqual(item.start_time, None)
        self.assertEqual(item.end_time, None)
    generated = SRTGenerator(internal)
    self.assertEqual(generated.format_time(None), u'99:59:59,999')
    expected = (
        u'1\r\n99:59:59,999 --> 99:59:59,999\r\n0\r\n\r\n'
        u'2\r\n99:59:59,999 --> 99:59:59,999\r\n1\r\n\r\n'
        u'3\r\n99:59:59,999 --> 99:59:59,999\r\n2\r\n\r\n'
        u'4\r\n99:59:59,999 --> 99:59:59,999\r\n3\r\n\r\n'
        u'5\r\n99:59:59,999 --> 99:59:59,999\r\n4\r\n'
    )
    self.assertIn(expected, unicode(generated))
def test_text_changes(self):
    # Only the text of the second subtitle differs between the two sets.
    before_rows = [
        (0, 1000, "Hey 1"),
        (1000, 2000, "Hey 2"),
        (2000, 3000, "Hey 3"),
        (3000, 4000, "Hey 4"),
    ]
    after_rows = [
        (0, 1000, "Hey 1"),
        (1000, 2000, "Hey 22"),
        (2000, 3000, "Hey 3"),
        (3000, 4000, "Hey 4"),
    ]
    before = SubtitleSet.from_list('en', before_rows)
    after = SubtitleSet.from_list('en', after_rows)
    outcome = diff(before, after)
    self.assertEqual(outcome['changed'], True)
    self.assertEqual(outcome['text_changed'], 1/4.0)
    self.assertEqual(outcome['time_changed'], 0)
    rows = outcome['subtitle_data']
    self.assertEqual(len(rows), 4)
    # only sub #2 should have text changed
    for position, row in enumerate(rows):
        expect_text_change = (position == 1)
        self.assertEqual(row['text_changed'], expect_text_change)
def test_delete(self):
    set_1 = SubtitleSet.from_list(
        "en",
        [(0, 1000, "Hey 1"), (1000, 2000, "Hey 2"),
         (2000, 3000, "Hey 3"), (3000, 4000, "Hey 4")]
    )
    set_2 = SubtitleSet.from_list(
        "en",
        [(0, 1000, "Hey 1"), (2000, 3000, "Hey 3"), (3000, 4000, "Hey 4")]
    )
    result = diff(set_1, set_2)
    self.assertEqual(result["changed"], True)
    # for both time_change and text_changed, we calculate them as follows:
    # there are 7 total subs. 6 of those are matches and 1 was deleted
    # from set_1. So the change amount is 1/7
    self.assertAlmostEqual(result["time_changed"], 1 / 7.0)
    self.assertAlmostEqual(result["text_changed"], 1 / 7.0)
    self.assertEqual(len(result["subtitle_data"]), 4)
    # check the lines that haven't changed
    self.check_unchanged_subtitle_data(result, set_1, set_2, 0, 2, 3)
    # check the line that was deleted: set_2 contributes an empty line
    delete_sub_data = result["subtitle_data"][1]
    self.assertEqual(delete_sub_data["time_changed"], True)
    self.assertEqual(delete_sub_data["text_changed"], True)
    self.assertEqual(delete_sub_data["subtitles"][1], self.empty_line())
    self.assertEqual(delete_sub_data["subtitles"][0], set_1[1])
def test_time_changes(self):
    # Only the start time of the second subtitle differs between the sets.
    before_rows = [
        (0, 1000, "Hey 1"),
        (1000, 2000, "Hey 2"),
        (2000, 3000, "Hey 3"),
        (3000, 4000, "Hey 4"),
    ]
    after_rows = [
        (0, 1000, "Hey 1"),
        (1200, 2000, "Hey 2"),
        (2000, 3000, "Hey 3"),
        (3000, 4000, "Hey 4"),
    ]
    before = SubtitleSet.from_list('en', before_rows)
    after = SubtitleSet.from_list('en', after_rows)
    outcome = diff(before, after)
    self.assertEqual(outcome['changed'], True)
    self.assertEqual(outcome['time_changed'], 1/4.0)
    self.assertEqual(outcome['text_changed'], 0)
    rows = outcome['subtitle_data']
    self.assertEqual(len(rows), 4)
    # only sub #2 should have its time changed; no text changes anywhere
    for position, row in enumerate(rows):
        expect_time_change = (position == 1)
        self.assertEqual(row['time_changed'], expect_time_change)
        self.assertFalse(row['text_changed'])
def test_replace_single_line_with_multiple(self):
    set_1 = SubtitleSet.from_list(
        "en",
        [(0, 1000, "Hey 1"), (1000, 2000, "Hey 2"),
         (2000, 3000, "Hey 3"), (3000, 4000, "Hey 4")]
    )
    set_2 = SubtitleSet.from_list(
        "en",
        [
            (0, 1000, "Hey 1"),
            (1000, 1500, "Hey 2.1"),
            (1500, 2000, "Hey 2.2"),
            (2000, 3000, "Hey 3"),
            (3000, 4000, "Hey 4"),
        ],
    )
    result = diff(set_1, set_2)
    self.assertEqual(result["changed"], True)
    # for both time_change and text_changed, we calculate them as follows:
    # there are 9 total subs. 6 of those are matches and 1 in set 1 was
    # changed to 2 in set 2. So the change amount is 3/9.
    self.assertAlmostEqual(result["time_changed"], 3 / 9.0)
    self.assertAlmostEqual(result["text_changed"], 3 / 9.0)
    self.assertEqual(len(result["subtitle_data"]), 5)
    # check the lines that haven't changed
    self.check_unchanged_subtitle_data(result, set_1, set_2, 0, 3, 4)
    # The sub at index 1 in set_1 was replaced by the subs at indexes 1 and
    # 2 in set_2; the second of those is paired with an empty line.
    line1 = result["subtitle_data"][1]
    self.assertEqual(line1["time_changed"], True)
    self.assertEqual(line1["text_changed"], True)
    self.assertEqual(line1["subtitles"][0], set_1[1])
    self.assertEqual(line1["subtitles"][1], set_2[1])
    line2 = result["subtitle_data"][2]
    self.assertEqual(line2["time_changed"], True)
    self.assertEqual(line2["text_changed"], True)
    self.assertEqual(line2["subtitles"][0], self.empty_line())
    self.assertEqual(line2["subtitles"][1], set_2[2])
def to_internal(self):
    """Parse the regex matches once and return the resulting SubtitleSet.

    Each match is converted with ``_get_data`` and its text is passed
    through ``get_markup`` before being appended unescaped.

    Raises:
        SubtitleParserError: wraps any error raised while parsing,
            including the "No subs found" error when nothing matched.
    """
    if not hasattr(self, 'sub_set'):
        match = None
        try:
            # Build into a local and cache it only on success, so a failed
            # parse never leaves a partial set for later calls to return.
            sub_set = SubtitleSet(self.language)
            for match in self._matches:
                item = self._get_data(match.groupdict())
                text = self.get_markup(item['text'])
                sub_set.append_subtitle(item['start'], item['end'], text,
                                        escape=False)
            # ``match`` is still None when the loop body never ran.
            if match is None:
                raise ValueError("No subs found")
        except Exception as e:
            raise SubtitleParserError(original_error=e)
        self.sub_set = sub_set
    return self.sub_set
# Merges an English and a Spanish subtitle set (one subtitle each) with
# self.loader.dfxp_merge and compares the result with an expected TTML
# document that holds one xml:lang-tagged div per language.
# NOTE(review): utils.assert_long_text_equal presumably compares the whole
# text including whitespace, so keep the expected literal's line breaks and
# indentation exactly as in the real file -- confirm before reformatting.
def test_dfxp_merge(self): en_subs = SubtitleSet('en') es_subs = SubtitleSet('es') en_subs.append_subtitle(1000, 1500, 'content') es_subs.append_subtitle(1000, 1500, 'spanish content') result = self.loader.dfxp_merge([en_subs, es_subs]) utils.assert_long_text_equal(result, """\ <tt xmlns:tts="http://www.w3.org/ns/ttml#styling" xmlns:ttp="http://www.w3.org/ns/ttml#parameter" xmlns:ttm="http://www.w3.org/ns/ttml#metadata" xmlns="http://www.w3.org/ns/ttml" xml:lang=""> <head> <metadata> <ttm:title></ttm:title> <ttm:description></ttm:description> <ttm:copyright/> </metadata> <styling> <style xml:id="test-style" tts:color="white" tts:fontSize="18px"/> </styling> <layout> <region xml:id="bottom" style="test-style" tts:origin="0 80%" tts:extent="100% 20%"/> <region xml:id="top" style="test-style" tts:origin="0 0" tts:extent="100% 20%"/> </layout> </head> <body region="bottom"> <div xml:lang="en"> <div> <p begin="00:00:01.000" end="00:00:01.500">content</p> </div> </div> <div xml:lang="es"> <div> <p begin="00:00:01.000" end="00:00:01.500">spanish content</p> </div> </div> </body> </tt> """)
class BaseTextParser(object):
    """Base class for subtitle parsers driven by one regular expression.

    Every match of ``pattern`` in ``input_string`` is treated as one
    subtitle.  ``_get_data`` and ``get_markup`` are hooks that return their
    input unchanged here; ``to_internal`` expects ``_get_data`` to yield a
    mapping with ``start``, ``end`` and ``text`` keys.  ``__unicode__``
    reads a ``file_type`` attribute that this class does not define itself.
    """

    def __init__(self, input_string, pattern, language=None, flags=None,
                 eager_parse=True):
        '''
        If `eager_parse` is True will parse the subtitles right way,
        converting to our internal storage format, else only if you call
        `to_internal` directly (or `to`).

        `flags` is an optional iterable of `re` flag constants; they are
        OR-ed together before the pattern is compiled.

        Any errors during parsing will be of SubtitleParserError. Note that
        a file with no valid subs will be an error.
        '''
        self.input_string = input_string
        self.pattern = pattern
        self.language = language
        # re.compile() accepts a single integer ``flags`` argument, so
        # unpacking the list positionally only worked for zero or one flag.
        combined_flags = 0
        for flag in flags or ():
            combined_flags |= flag
        self._pattern = re.compile(pattern, combined_flags)
        if eager_parse:
            self.to_internal()

    def __iter__(self):
        return self._result_iter()

    def __len__(self):
        # Number of non-overlapping matches of the pattern in the input.
        return len(self._pattern.findall(self.input_string))

    def __nonzero__(self):
        # True when the pattern matches anywhere in the input.
        return bool(self._pattern.search(self.input_string))

    # Python 3 spelling of the truthiness hook.
    __bool__ = __nonzero__

    def _result_iter(self):
        """
        Iterate over one item per match, shaped like this:
            {
                'start': ...,
                'end': ...,
                'text': ...
            }
        NOTE(review): the original note here said start/end are in seconds
        and -1 when undefined; the actual values are whatever `_get_data`
        returns - confirm against the subclasses.
        """
        for item in self._matches:
            yield self._get_data(item.groupdict())

    def _get_data(self, match):
        """Hook: convert a match's ``groupdict()``; identity by default."""
        return match

    def _get_matches(self):
        """Return a fresh iterator over all pattern matches in the input."""
        return self._pattern.finditer(self.input_string)

    def __unicode__(self):
        return self.to(self.file_type)

    @classmethod
    def parse(cls, input_string, language=None):
        # NOTE(review): ``language`` is passed as the second positional
        # argument, which is ``pattern`` in this class's own ``__init__``;
        # presumably this is meant for subclasses whose constructor takes
        # ``(input_string, language)`` - confirm.
        return cls(input_string, language)

    def to(self, type):
        """Convert the parsed subtitles to format ``type`` via ``babelsubs.to``.

        If ``type`` is a list, its first element is used.
        """
        # Imported here rather than at module level, as in the original.
        from babelsubs import to
        if isinstance(type, list):
            type = type[0]
        return to(self.to_internal(), type, language=self.language)

    def to_internal(self):
        """Parse the input once and return the resulting ``SubtitleSet``.

        The result is cached on ``self.sub_set``.  Raises
        ``SubtitleParserError`` (wrapping the original exception) if parsing
        fails or if the pattern produced no matches.
        """
        if hasattr(self, 'sub_set'):
            return self.sub_set

        # Stays None only when the loop body never runs, i.e. no matches.
        match = None
        try:
            self.sub_set = SubtitleSet(self.language)
            for match in self._matches:
                item = self._get_data(match.groupdict())
                # fix me: support markup
                text = self.get_markup(item['text'])
                self.sub_set.append_subtitle(item['start'], item['end'], text, escape=False)
            if match is None:
                raise ValueError("No subs found")
        except Exception as e:
            # Drop the partially built set so a later call re-raises
            # instead of returning incomplete (or empty) subtitles from
            # the hasattr() shortcut above.
            self.__dict__.pop('sub_set', None)
            raise SubtitleParserError(original_error=e)
        return self.sub_set

    def get_markup(self, text):
        """Hook: convert raw subtitle text to markup; identity by default."""
        return text

    # Each access re-runs ``finditer`` and yields a new iterator.
    _matches = property(_get_matches)
def test_regions(self):
    # A subtitle appended with region="top" must produce a cue header that
    # carries the ``line:1`` setting after the timing.
    subtitle_set = SubtitleSet('en')
    subtitle_set.append_subtitle(0, 1000, "test", region="top")

    cue_generator = WEBVTTGenerator(subtitle_set)
    first_item = subtitle_set.subtitle_items()[0]
    header = cue_generator.format_cue_header(first_item)

    self.assertEqual(header, u'00:00:00.000 --> 00:00:01.000 line:1')
# DFXPMergeTest: checks DFXPGenerator.merge_subtitles on three SubtitleSets (en, es, fr) built in setUp. The Spanish set has a second subtitle appended with new_paragraph=True, which the expected output shows in its own inner <div>.
# test_dfxp_merge: merging without initial_ttml is expected to give a document whose <head> has the "amara-style" style and the "amara-subtitle-area" region, with one <div xml:lang="..."> per language in the body.
# test_merge_with_header: when initial_ttml is passed, the expected output keeps that document's <styling>/<layout> and a <body> without a region attribute, filled with the same per-language divs.
# NOTE(review): in this view the line breaks of the class, including those inside the triple-quoted XML literals, appear collapsed into spaces (only one newline survives); the exact whitespace of the literals matters to the comparison, so restore it from the original file rather than re-wrapping by hand.
class DFXPMergeTest(TestCase): def setUp(self): self.en_subs = SubtitleSet('en') self.es_subs = SubtitleSet('es') self.fr_subs = SubtitleSet('fr') self.en_subs.append_subtitle(1000, 1500, 'content') self.es_subs.append_subtitle(1000, 1500, 'spanish content') self.es_subs.append_subtitle(2000, 2500, 'spanish content 2', new_paragraph=True) self.fr_subs.append_subtitle(1000, 1500, 'french content') def test_dfxp_merge(self): result = DFXPGenerator.merge_subtitles( [self.en_subs, self.es_subs, self.fr_subs]) utils.assert_long_text_equal(result, """\ <tt xmlns="http://www.w3.org/ns/ttml" xmlns:tts="http://www.w3.org/ns/ttml#styling" xml:lang=""> <head> <metadata xmlns:ttm="http://www.w3.org/ns/ttml#metadata"> <ttm:title/> <ttm:description/> <ttm:copyright/> </metadata> <styling xmlns:tts="http://www.w3.org/ns/ttml#styling"> <style xml:id="amara-style" tts:color="white" tts:fontFamily="proportionalSansSerif" tts:fontSize="18px" tts:textAlign="center"/> </styling> <layout xmlns:tts="http://www.w3.org/ns/ttml#styling"> <region xml:id="amara-subtitle-area" style="amara-style" tts:extent="560px 62px" tts:padding="5px 3px" tts:backgroundColor="black" tts:displayAlign="after"/> </layout> </head> <body region="amara-subtitle-area"> <div xml:lang="en"> <div> <p begin="00:00:01.000" end="00:00:01.500">content</p> </div> </div> <div xml:lang="es"> <div> <p begin="00:00:01.000" end="00:00:01.500">spanish content</p> </div> <div> <p begin="00:00:02.000" end="00:00:02.500">spanish content 2</p> </div> </div> <div xml:lang="fr"> <div> <p begin="00:00:01.000" end="00:00:01.500">french content</p> </div> </div> </body> </tt> """) def test_merge_with_header(self): initial_ttml = etree.fromstring("""\ <tt xmlns="http://www.w3.org/ns/ttml" xmlns:tts="http://www.w3.org/ns/ttml#styling"> <head> <styling> <style xml:id="style" tts:color="foo" tts:fontSize="bar" /> </styling> <layout> <region xml:id="region" style="style" tts:extent="foo" tts:origin="bar" /> </layout> </head> <body /> 
</tt>""") result = DFXPGenerator.merge_subtitles( [self.en_subs, self.es_subs, self.fr_subs], initial_ttml=initial_ttml) utils.assert_long_text_equal(result, """\ <tt xmlns="http://www.w3.org/ns/ttml" xmlns:tts="http://www.w3.org/ns/ttml#styling" xml:lang=""> <head> <styling> <style xml:id="style" tts:color="foo" tts:fontSize="bar"/> </styling> <layout> <region xml:id="region" style="style" tts:extent="foo" tts:origin="bar"/> </layout> </head> <body> <div xml:lang="en"> <div> <p begin="00:00:01.000" end="00:00:01.500">content</p> </div> </div> <div xml:lang="es"> <div> <p begin="00:00:01.000" end="00:00:01.500">spanish content</p> </div> <div> <p begin="00:00:02.000" end="00:00:02.500">spanish content 2</p> </div> </div> <div xml:lang="fr"> <div> <p begin="00:00:01.000" end="00:00:01.500">french content</p> </div> </div> </body> </tt> """)
def test_span_around_newline(self):
    # An italic span that wraps nothing but a line break should come out as
    # an <i> element around a newline when the WEBVTT mappings are applied.
    subtitle_set = SubtitleSet('en')
    subtitle_set.append_subtitle(
        0,
        1000,
        'one<span fontStyle="italic"><br/></span>two',
        escape=False,
    )

    mapped_items = subtitle_set.subtitle_items(
        mappings=WEBVTTGenerator.MAPPINGS)
    first_item = mapped_items[0]

    self.assertEqual(first_item.text, 'one<i>\n</i>two')
def test_space_before_end_span(self):
    # The trailing space inside the first italic span must be kept inside
    # the resulting <i> element when the WEBVTT mappings are applied.
    markup = (
        '<span fontStyle="italic">one<br/>two </span>'
        'three'
        '<span fontStyle="italic">four.</span>'
    )
    subtitle_set = SubtitleSet('en')
    subtitle_set.append_subtitle(0, 1000, markup, escape=False)

    mapped_items = subtitle_set.subtitle_items(
        mappings=WEBVTTGenerator.MAPPINGS)
    first_item = mapped_items[0]

    self.assertEqual(first_item.text, '<i>one\ntwo </i>three<i>four.</i>')