def stream(cls, source_file, error_handling=ERROR_PASS):
    """
    stream(source_file, [error_handling])

    Lazily parse `source_file`, yielding each SubRipItem as soon as the
    lines that make it up have been read. Nothing is accumulated, so this
    works like a SAX-style parser for .srt data.

    `source_file` -> Any iterable that yields unicode strings, like a file
                     opened with `codecs.open()` or an array of unicode.

    A line that is empty once stripped closes the current item. When
    `SubRipItem.from_lines` raises `Error`, the raw text of the item is
    appended to the exception's args and the exception is passed to
    `cls._handle_error` together with `error_handling` and the index of
    the line that closed the item.

    Example:
        >>> import pysrt
        >>> import codecs
        >>> file = codecs.open('movie.srt', encoding='utf-8')
        >>> for sub in pysrt.stream(file):
        ...     sub.text += "\\nHello !"
        ...     print unicode(sub)
    """
    pending = []
    # The extra '\n' acts as a sentinel so the final item is flushed even
    # when the input does not end with a blank line.
    for index, line in enumerate(chain(source_file, '\n')):
        if line.strip():
            pending.append(line)
            continue

        # Blank line: take ownership of the collected lines and start over.
        source, pending = pending, []
        if not (source and all(source)):
            continue

        try:
            yield SubRipItem.from_lines(source)
        except Error as error:
            error.args += (''.join(source), )
            cls._handle_error(error, error_handling, index)
def open(cls, path='', encoding='utf-8', error_handling=ERROR_PASS, file_descriptor=None):
    """
    open([path, [encoding]])

    Encoding is set to utf-8 as default.

    Reads subtitle items from `path`, or from `file_descriptor` when one
    is supplied, and appends each parsed item to a new instance of `cls`.
    Items are separated by lines that are empty once stripped.
    """
    new_file = cls(path=path, encoding=encoding)
    # NOTE(review): no except/finally clause for this outer `try` and no
    # `return` are visible here -- the block appears to be truncated in
    # this view. Confirm against the complete file before editing.
    try:
        if file_descriptor is None:
            source_file = open(path, 'rU')
        else:
            source_file = file_descriptor
        string_buffer = StringIO()
        # The chained '\n' forces a final flush when the input does not
        # end with a blank line.
        for index, line in enumerate(chain(source_file, '\n')):
            if line.strip():
                string_buffer.write(line)
            else:
                # Blank line: decode what has been collected and parse it.
                string_buffer.seek(0)
                source = unicode(string_buffer.read(), new_file.encoding,errors='ignore')
                try:
                    new_item = SubRipItem.from_string(source)
                    new_file.append(new_item)
                except InvalidItem, error:
                    cls._handle_error(error, error_handling, path, index)
                finally:
                    # Reset the buffer whether or not parsing succeeded.
                    string_buffer.truncate(0)
def __write_subs(self, out_fd, in_path):
    """Write every caption read from `in_path` to `out_fd` as SubRip text.

    Captions come from ``self.__vtt_reader.read(in_path)`` and are numbered
    from 1 in the order they are yielded. Each caption's text is
    HTML-unescaped, and each rendered item is followed by a newline.
    """
    captions = self.__vtt_reader.read(in_path)
    for number, caption in enumerate(captions, start=1):
        begin = SubRipTime(0, 0, caption.start_in_seconds)
        finish = SubRipTime(0, 0, caption.end_in_seconds)
        text = html.unescape(caption.text)
        rendered = str(SubRipItem(number, begin, finish, text))
        out_fd.write("%s\n" % rendered)
def setUp(self):
    """Create a Verificator backed by a mocked Player and load a subtitle.

    The mock follows the Player spec and its ``get_filters()`` returns
    ``Defaults.DEFAULT_FILTERS``.
    """
    player = mock.Mock(
        spec=Player,
        **{u'get_filters.return_value': Defaults.DEFAULT_FILTERS}
    )
    self.verificator = Verificator(player)
    sample = SubRipItem(
        text=u"Hello! \n[emotions]\nWorld! It's...\n testing.")
    self.verificator.set_subtitle(sample)
def test_is_empty(self):
    """Check `is_empty()` for each sample subtitle text in the table."""
    expectations = {
        u'[emotions]': True,
        u' [ emotions ] ': True,
        u'PERSON: [ emotions ]': True,
        u'. . .': True,
        u'PERSON: Hi': False,
        u'[emotions] Hi': False,
    }
    for text, expected in expectations.iteritems():
        self.verificator.set_subtitle(SubRipItem(text=text))
        actual = self.verificator.is_empty()
        failure_note = u'On verifying of {0}.'.format(text)
        self.assertEqual(actual, expected, msg=failure_note)
def test_text_cleaning(self):
    """Check `get_etalon_words()` for each sample subtitle text."""
    expected_words = {
        u'Hello! \n[emotions]\nWorld.': [u'hello', u'world'],
        u'Hello \nworld!': [u'hello', u'world'],
        u"It's ...\n testing.": [u"it's", u'testing'],
        u"It's\ntesting!": [u"it's", u'testing'],
        u"Word": [u'word'],
    }
    for text, words in expected_words.iteritems():
        self.verificator.set_subtitle(SubRipItem(text=text))
        cleaned = self.verificator.get_etalon_words()
        failure_note = u'On cleaning of {0}'.format(text)
        self.assertEqual(cleaned, words, msg=failure_note)
def convert(directory, filename):
    """Convert ``<filename>.vtt`` inside `directory` to ``<filename>.srt``.

    Args:
        directory: Folder that holds the ``.vtt`` file; the ``.srt`` file is
            written to the same folder.
        filename: Base name of the subtitle file, without extension.

    Captions are numbered from 1 in the order the VTT reader yields them,
    and their text is HTML-unescaped before being written.
    """
    # Local import so the block does not depend on a module-level `import os`.
    import os

    # os.path.join picks the platform's separator instead of a hard-coded
    # backslash, so the paths are also valid outside Windows.
    vtt_filepath = os.path.join(directory, "%s.vtt" % filename)
    srt_filepath = os.path.join(directory, "%s.srt" % filename)

    # The context manager guarantees the output is flushed and closed, even
    # if reading or converting a caption raises part-way through.
    with open(srt_filepath, "w") as srt:
        for index, caption in enumerate(WebVTT().read(vtt_filepath), start=1):
            start = SubRipTime(0, 0, caption.start_in_seconds)
            end = SubRipTime(0, 0, caption.end_in_seconds)
            item = SubRipItem(index, start, end, html.unescape(caption.text))
            srt.write(str(item) + "\n")
def test_is_complete_with_punctuation(self):
    """Check that `is_complete()` accepts each answer for its question."""
    answers_by_question = {
        u'Hello!': u'hello',
        u'Hello...': u'hello',
        u'. . .Hello!': u'hello',
        u'Hello. . .': u'hello',
        u'[emotions]': u'',
        u'Hello [ emotions ] .': u'hello',
        u'PERSON: Hello, people. . .': u'hello people',
        u'PERSON:\nHello [ emotions ]\n, World ! ! !': u'Hello, world',
        u'Say: " Hello, World! "': u'say hello world',
        u'. . . World!': u'world',
    }
    for question, answer in answers_by_question.iteritems():
        self.verificator.set_subtitle(SubRipItem(text=question))
        completed = self.verificator.is_complete(answer)
        failure_note = u'On verifying of {0}.'.format(question)
        self.assertTrue(completed, msg=failure_note)
def convert_sub(vtt_file):
    """Convert a WebVTT file to an SRT file with the same base name.

    Args:
        vtt_file: Path to the input file; its extension must be ``.vtt``
            (compared case-insensitively).

    Raises:
        Exception: If `vtt_file` does not have a ``.vtt`` extension. A
            "Skipping ..." line is written to stderr first.

    Captions are numbered from 1 in the order the VTT reader yields them,
    and their text is HTML-unescaped before being written.
    """
    file_name, file_extension = os.path.splitext(vtt_file)
    if file_extension.lower() != ".vtt":
        sys.stderr.write("Skipping %s.\n" % vtt_file)
        raise Exception("VTT file could not be found.")

    # The context manager guarantees the output is flushed and closed, even
    # if reading or converting a caption raises part-way through.
    with open(file_name + ".srt", "w") as srt:
        for index, caption in enumerate(WebVTT().read(vtt_file), start=1):
            start = SubRipTime(0, 0, caption.start_in_seconds)
            end = SubRipTime(0, 0, caption.end_in_seconds)
            item = SubRipItem(index, start, end, html.unescape(caption.text))
            srt.write(str(item) + "\n")
def create_srt(split_df, cris_stt_df):
    """Build ``stt_converted.srt`` from a split CSV and a transcript sheet.

    Args:
        split_df: Path to a CSV file. Its ``wav_filename`` column is renamed
            to ``wav_name`` and used as the join key. The output file is
            written to the directory containing this CSV.
        cris_stt_df: Path to an Excel file that is joined on ``wav_name``.

    The merged rows must provide ``transcripts``, ``start`` and ``end``
    columns; ``start`` and ``end`` are passed to SubRipTime as milliseconds.
    A single trailing "." is removed from each transcript.
    """
    abs_path = os.path.dirname(split_df)
    df1 = pd.read_csv(split_df)
    df2 = pd.read_excel(cris_stt_df)
    df1.rename(columns={'wav_filename': 'wav_name'}, inplace=True)
    # This df3 contains all the info for srt creation
    df3 = pd.merge(df1, df2, how='inner', on='wav_name')
    print("Creating the srt:")
    new_srt = SubRipFile()
    for index, row in df3.iterrows():
        transcript = row['transcripts']
        # Float cells (presumably NaN for missing transcripts -- confirm)
        # become empty text. isinstance() also covers float subclasses such
        # as numpy.float64, which an exact type() comparison lets through
        # as the literal string "nan".
        text = "" if isinstance(transcript, float) else str(transcript)
        if text.endswith("."):
            text = text[:-1]
        new_srt.append(
            SubRipItem(index=index + 1,
                       start=SubRipTime(milliseconds=row['start']),
                       end=SubRipTime(milliseconds=row['end']),
                       text=text))
    new_srt.save(os.path.join(abs_path, "stt_converted.srt"))
    print("successfully written")
def open(cls, path='', encoding=None, error_handling=ERROR_PASS, file_descriptor=None, eol=None):
    """
    open([path, [encoding]])

    Parse the subtitles found at `path`, or in `file_descriptor` when one
    is given, and return them as a new instance of `cls`.

    If no `encoding` is provided it is obtained from
    `cls.detect_encoding()` (per the original documentation: detected from
    a byte order mark when present, utf-8 otherwise).

    If no `eol` is provided it is taken from `cls._extract_newline()`;
    the instance's `eol` is only set when a value is available.

    Items rejected with `InvalidItem` are passed to `cls._handle_error`
    together with `error_handling`, `path` and the line index.

    The source file is always closed before returning or raising, including
    a caller-supplied `file_descriptor`.
    """
    if file_descriptor is None:
        source_file = open(path, 'rU')
    else:
        source_file = file_descriptor

    # try/finally so the handle is released even when detection, parsing or
    # the error handler raises.
    try:
        encoding = encoding or cls.detect_encoding(source_file)
        # Reads now yield bytes transcoded from `encoding` to the class
        # default encoding.
        source_file = codecs.EncodedFile(source_file, cls.DEFAULT_ENCODING, encoding)

        new_file = cls(path=path, encoding=encoding)
        string_buffer = StringIO()
        # The chained '\n' forces a final flush when the input does not end
        # with a blank line.
        for index, line in enumerate(chain(source_file, '\n')):
            if line.strip():
                string_buffer.write(line)
                continue

            string_buffer.seek(0)
            source = string_buffer.read()
            if not source.strip():
                continue

            try:
                source = source.decode(cls.DEFAULT_ENCODING)
                new_item = SubRipItem.from_string(source)
                new_file.append(new_item)
            except InvalidItem as error:
                cls._handle_error(error, error_handling, path, index)
            finally:
                # Reset the buffer whether or not parsing succeeded.
                string_buffer.truncate(0)

        eol = eol or cls._extract_newline(source_file)
        if eol is not None:
            new_file.eol = eol
    finally:
        source_file.close()
    return new_file
def stream(cls, source_file, error_handling=ERROR_PASS):
    """
    stream(source_file, [error_handling])

    This method yields SubRipItem instances as soon as they have been
    parsed, without storing them. It is a kind of SAX parser for .srt files.

    `source_file` -> Any iterable that yields unicode strings, like a file
                     opened with `codecs.open()` or an array of unicode.

    When an item cannot be parsed, its raw text is appended to the
    exception's args and the exception is passed to `cls._handle_error`
    together with `error_handling` and the index of the closing line.

    Example:
        >>> from pysrt import SubRipFile
        >>> import codecs
        >>> file = codecs.open('movie.srt', encoding='utf-8')
        >>> for sub in SubRipFile.stream(file):
        ...     sub.text += "\\nHello !"
        ...     print unicode(sub)
    """
    string_buffer = []

    # weird bug workaround
    if hasattr(source_file, 'seek'):
        position = source_file.tell()
        # under Python 2.5 this call returns the second line of the file
        # instead of the first character. It's probably a buffering bug
        # in the codecs module. I've not found a better fix...
        source_file.read(1)
        source_file.seek(position)

    # The chained newline forces a final flush when the input does not end
    # with a blank line.
    for index, line in enumerate(chain(source_file, u'\n')):
        if line.strip():
            string_buffer.append(line)
        else:
            source = string_buffer
            string_buffer = []
            if source and all(source):
                try:
                    yield SubRipItem.from_lines(source)
                except Error as error:
                    error.args += (''.join(source), )
                    cls._handle_error(error, error_handling, index)
def stream(cls, source_file, error_handling=ERROR_PASS):
    """
    stream(source_file, [error_handling])

    This method yields SubRipItem instances as soon as they have been
    parsed, without storing them. It is a kind of SAX parser for .srt files.

    `source_file` -> Any iterable that yields unicode strings, like a file
                     opened with `codecs.open()` or an array of unicode.

    When an item cannot be parsed, its raw text is appended to the
    exception's args and the exception is passed to `cls._handle_error`
    together with `error_handling` and the index of the closing line.

    Example:
        >>> import pysrt
        >>> import codecs
        >>> file = codecs.open('movie.srt', encoding='utf-8')
        >>> for sub in pysrt.stream(file):
        ...     sub.text += "\\nHello !"
        ...     print unicode(sub)
    """
    string_buffer = []

    # weird bug workaround
    if hasattr(source_file, 'seek'):
        position = source_file.tell()
        # under Python 2.5 this call returns the second line of the file
        # instead of the first character. It's probably a buffering bug
        # in the codecs module. I've not found a better fix...
        source_file.read(1)
        source_file.seek(position)

    # The chained newline forces a final flush when the input does not end
    # with a blank line.
    for index, line in enumerate(chain(source_file, u'\n')):
        if line.strip():
            string_buffer.append(line)
        else:
            source = string_buffer
            string_buffer = []
            if source and all(source):
                try:
                    yield SubRipItem.from_lines(source)
                except Error as error:
                    error.args += (''.join(source), )
                    cls._handle_error(error, error_handling, index)
# Command-line entry: convert every FILE argument ending in .vtt to a .srt
# file with the same base name; other arguments are reported and skipped.
script = sys.argv[0]
args = sys.argv[1:]


def usage():
    """Return the one-line usage message for this script."""
    return "%s FILE...\n" % os.path.basename(script)


if len(args) < 1:
    sys.stderr.write(usage())
    sys.exit(1)

for arg in args:
    file_name, file_extension = os.path.splitext(arg)
    if file_extension.lower() != ".vtt":
        sys.stderr.write("Skipping %s.\n" % arg)
        continue

    # The context manager closes each output file before the next argument
    # is processed, instead of leaving one handle open per converted file.
    with open(file_name + ".srt", "w") as srt:
        for index, caption in enumerate(WebVTT().read(arg), start=1):
            start = SubRipTime(0, 0, caption.start_in_seconds)
            end = SubRipTime(0, 0, caption.end_in_seconds)
            item = SubRipItem(index, start, end, html.unescape(caption.text))
            srt.write(str(item) + "\n")
help="name of output file without extention (default: output)") parser.add_argument('--eol', type=str, default="###", help="End of line marker (default: ###)") args=parser.parse_args() subs = pysrt.open(args.file) after_subs = [] indx = 1 iterator = iter(subs) while True: try: sub = next(iterator) except StopIteration: break if sub.text[-len(args.eol):] != args.eol: after_subs.append(SubRipItem(indx, sub.start, sub.end, sub.text)) else: line2_sub = next(iterator) text = sub.text[:-len(args.eol)] + '\n' + line2_sub.text combined_sub = SubRipItem(indx, sub.start, line2_sub.end, text=text) after_subs.append(combined_sub) indx += 1 after = SubRipFile(items=after_subs) after.save(args.output + '.srt', encoding='utf-8')
def test_verify_answer_and_statistics(self):
    """Check `verify_answer()` on three subtitles, then the statistics.

    The subtitle loaded by the fixture is checked first, followed by two
    further questions. Two hints are then requested and the accumulated
    statistics are compared with the expected totals.
    """
    def all_right(text):
        # Every symbol of `text` is flagged as correct.
        return [(sym, True) for sym in text]

    def right_except(text, wrong):
        # Every symbol is flagged as correct except those equal to `wrong`.
        return [(sym, sym != wrong) for sym in text]

    def check(cases, template, *extra):
        # Verify each answer; `extra` supplies further format arguments.
        for src, result in cases.iteritems():
            returned = self.verificator.verify_answer(src)
            self.assertEqual(
                returned, result,
                msg=template.format(src, result, returned, *extra))

    msg = u"On verifying of {0}.\nExpected: {1}.\nReturned: {2}."
    check({
        u'hel': all_right(u'hel'),
        u'hello': all_right(u'Hello! '),
        u'hez': right_except(u'hez', u'z'),
        u'hezl': right_except(u'hez', u'z'),
        u'hezlo': right_except(u'hez', u'z'),
        u'hello ': all_right(u'Hello! '),
        u'tests': [(u't', False)],
        u'hello worl': all_right(u'Hello! worl'),
        u'hello world': all_right(u'Hello! World! '),
        u"Hello! World! It's testing": all_right(
            u"Hello! World! It's... testing. "),
        u'hello wordm': right_except(u'Hello! word', u'd'),
        u"Hello! World! it'": all_right(u"Hello! World! it'"),
        u"mello!": [(u'm', False)],
    }, msg)

    # From here on failure messages also name the question under test.
    msg = u'\n'.join([u'For question: {3}.', msg])

    question = u'PERSON: Hello, [ emotions ] world! ! !'
    self.verificator.clear_subtitle(complete=True)
    self.verificator.set_subtitle(SubRipItem(text=question))
    check({
        u'hel': all_right(u'hel'),
        u'per': [(u'p', False)],
        u'hello': all_right(u'Hello, '),
        u'hello world': all_right(u'Hello, world! '),
        u'Hello, world': all_right(u'Hello, world! '),
    }, msg, question)

    question = u'. . .Hello . . . world. . .'
    self.verificator.clear_subtitle(complete=True)
    self.verificator.set_subtitle(SubRipItem(text=question))
    check({
        u'hel': all_right(u'hel'),
        u'hello': all_right(u'.Hello '),
        u'Hello wor': all_right(u'.Hello wor'),
        u'Hello world': all_right(u'.Hello world. '),
        u"Hello world. it's": right_except(u'.Hello world. i', u'i'),
    }, msg, question)

    for ans in (u'hel', u'hello w'):
        self.verificator.hint(ans)
    self.verificator.clear_subtitle(complete=True)

    stats = self.verificator.get_statistics()
    self.assertTrue(stats.get(u'learning_time', 0) > 0)
    expected_stats = {
        u'completed_fragments': 3,
        u'hint_used': 2,
        u'mistakes': 8,
        u'total_chars': 108
    }
    for param, value in expected_stats.iteritems():
        self.assertEqual(stats.get(param), value,
                         msg=u'On verifying of {0}.'.format(param))