Example #1
0
    def stream(cls, source_file, error_handling=ERROR_PASS):
        """
        stream(source_file, [error_handling])

        Lazily parse `source_file` and yield one SubRipItem per subtitle
        block, as soon as the block is complete. Nothing is accumulated, so
        this works like a SAX-style parser for .srt files.

        `source_file` -> Any iterable that yields unicode strings, like a
            file opened with `codecs.open()` or an array of unicode.
        `error_handling` -> Passed through to `cls._handle_error()` when a
            block cannot be turned into an item.

        Example:
            >>> import pysrt
            >>> import codecs
            >>> file = codecs.open('movie.srt', encoding='utf-8')
            >>> for sub in pysrt.stream(file):
            ...     sub.text += "\nHello !"
            ...     print unicode(sub)
        """
        pending = []
        # The extra '\n' guarantees the last block is flushed even when the
        # input does not end with a blank line.
        for line_number, raw_line in enumerate(chain(source_file, '\n')):
            if raw_line.strip():
                pending.append(raw_line)
                continue

            # Blank line: the collected lines form one candidate block.
            block, pending = pending, []
            if not (block and all(block)):
                continue

            try:
                yield SubRipItem.from_lines(block)
            except Error as error:
                # Attach the offending text so the handler can report it.
                error.args += (''.join(block), )
                cls._handle_error(error, error_handling, line_number)
Example #2
0
    def open(cls, path='', encoding='utf-8', error_handling=ERROR_PASS,
             file_descriptor=None):
        """
        open([path, [encoding]])

        Encoding is set to utf-8 as default.
        """
        new_file = cls(path=path, encoding=encoding)

	try:
        	if file_descriptor is None:
            		source_file = open(path, 'rU')
        	else:
        	    	source_file = file_descriptor

        	string_buffer = StringIO()
        	for index, line in enumerate(chain(source_file, '\n')):
            		if line.strip():
                		string_buffer.write(line)
            		else:
                		string_buffer.seek(0)
				source = unicode(string_buffer.read(), new_file.encoding,errors='ignore')
                		try:
                    			new_item = SubRipItem.from_string(source)
                    			new_file.append(new_item)
                		except InvalidItem, error:
                    			cls._handle_error(error, error_handling, path, index)
                		finally:
                    			string_buffer.truncate(0)
Example #3
0
 def __write_subs(self, out_fd, in_path):
     # Read the captions found at in_path through the VTT reader and write
     # each one to out_fd as a SubRip item, numbered from 1 and followed by
     # a newline.
     captions = self.__vtt_reader.read(in_path)
     for number, caption in enumerate(captions, 1):
         begins = SubRipTime(0, 0, caption.start_in_seconds)
         finishes = SubRipTime(0, 0, caption.end_in_seconds)
         # Caption text may carry HTML entities; decode them first.
         plain_text = html.unescape(caption.text)
         entry = SubRipItem(number, begins, finishes, plain_text)
         out_fd.write("%s\n" % str(entry))
Example #4
0
    def stream(cls, source_file, error_handling=ERROR_PASS):
        """
        stream(source_file, [error_handling])

        Generator yielding SubRipItem instances one at a time, each as soon
        as its lines have been read, without keeping them around. It is a
        kind of SAX parser for .srt files.

        `source_file` -> Any iterable that yields unicode strings, like a
            file opened with `codecs.open()` or an array of unicode.
        `error_handling` -> Handed to `cls._handle_error()` together with
            the error and the current line index when parsing fails.

        Example:
            >>> import pysrt
            >>> import codecs
            >>> file = codecs.open('movie.srt', encoding='utf-8')
            >>> for sub in pysrt.stream(file):
            ...     sub.text += "\nHello !"
            ...     print unicode(sub)
        """
        collected = []
        # A trailing '\n' is chained on so the final block is always closed.
        for position, text_line in enumerate(chain(source_file, '\n')):
            is_separator = not text_line.strip()
            if not is_separator:
                collected.append(text_line)
                continue

            # Separator reached: take ownership of the block, reset buffer.
            item_lines = collected
            collected = []
            if item_lines and all(item_lines):
                try:
                    yield SubRipItem.from_lines(item_lines)
                except Error as error:
                    # Append the raw block text to the error for reporting.
                    error.args += (''.join(item_lines), )
                    cls._handle_error(error, error_handling, position)
Example #5
0
 def setUp(self):
     # Build a Player-shaped mock whose get_filters() returns the default
     # filters, wrap it in a Verificator and load a multi-line subtitle.
     player = mock.Mock(spec=Player)
     player.configure_mock(
         **{u'get_filters.return_value': Defaults.DEFAULT_FILTERS})
     self.verificator = Verificator(player)
     subtitle = SubRipItem(
         text=u"Hello! \n[emotions]\nWorld! It's...\n testing.")
     self.verificator.set_subtitle(subtitle)
Example #6
0
 def test_is_empty(self):
     # Subtitle text -> whether is_empty() is expected to report it empty.
     expectations = {
         u'[emotions]': True,
         u' [ emotions ] ': True,
         u'PERSON: [ emotions ]': True,
         u'. . .': True,
         u'PERSON: Hi': False,
         u'[emotions] Hi': False,
     }
     for subtitle_text, expected in expectations.iteritems():
         subtitle = SubRipItem(text=subtitle_text)
         self.verificator.set_subtitle(subtitle)
         actual = self.verificator.is_empty()
         failure_note = u'On verifying of {0}.'.format(subtitle_text)
         self.assertEqual(actual, expected, msg=failure_note)
Example #7
0
 def test_text_cleaning(self):
     # Subtitle text -> the word list get_etalon_words() should produce.
     expectations = {
         u'Hello! \n[emotions]\nWorld.': [u'hello', u'world'],
         u'Hello \nworld!': [u'hello', u'world'],
         u"It's ...\n testing.": [u"it's", u'testing'],
         u"It's\ntesting!": [u"it's", u'testing'],
         u"Word": [u'word'],
     }
     for subtitle_text, expected_words in expectations.iteritems():
         subtitle = SubRipItem(text=subtitle_text)
         self.verificator.set_subtitle(subtitle)
         words = self.verificator.get_etalon_words()
         failure_note = u'On cleaning of {0}'.format(subtitle_text)
         self.assertEqual(words, expected_words, msg=failure_note)
Example #8
0
def convert(directory, filename):
    """Convert ``<filename>.vtt`` in *directory* into ``<filename>.srt``.

    Every caption read from the WebVTT file is written to the .srt file as a
    SubRip item, numbered from 1, with HTML entities in the text unescaped.

    Args:
        directory: Folder holding the .vtt file; the .srt file is written
            next to it.
        filename: Base name of the subtitle file, without extension.
    """
    # Function-scope import: the file's import block is not visible here.
    import os

    # os.path.join picks the host's separator instead of a hard-coded "\\",
    # which only produced a valid path on Windows.
    vtt_filepath = os.path.join(directory, filename + ".vtt")
    srt_filepath = os.path.join(directory, filename + ".srt")

    # The context manager guarantees the output is flushed and closed, even
    # when reading or converting a caption raises.
    # NOTE(review): the file is opened with the platform default encoding,
    # as before; consider an explicit encoding if captions are non-ASCII.
    with open(srt_filepath, "w") as srt:
        captions = WebVTT().read(vtt_filepath)
        for index, caption in enumerate(captions, start=1):
            start = SubRipTime(0, 0, caption.start_in_seconds)
            end = SubRipTime(0, 0, caption.end_in_seconds)
            item = SubRipItem(index, start, end, html.unescape(caption.text))
            srt.write(str(item) + "\n")
Example #9
0
 def test_is_complete_with_punctuation(self):
     # Subtitle text -> a typed answer that is_complete() must accept.
     accepted_answers = {
         u'Hello!': u'hello',
         u'Hello...': u'hello',
         u'. . .Hello!': u'hello',
         u'Hello. . .': u'hello',
         u'[emotions]': u'',
         u'Hello [ emotions ] .': u'hello',
         u'PERSON: Hello, people. . .': u'hello people',
         u'PERSON:\nHello [ emotions ]\n, World ! ! !': u'Hello, world',
         u'Say: " Hello, World! "': u'say hello world',
         u'. . . World!': u'world',
     }
     for subtitle_text, typed in accepted_answers.iteritems():
         subtitle = SubRipItem(text=subtitle_text)
         self.verificator.set_subtitle(subtitle)
         verdict = self.verificator.is_complete(typed)
         failure_note = u'On verifying of {0}.'.format(subtitle_text)
         self.assertTrue(verdict, msg=failure_note)
Example #10
0
def convert_sub(vtt_file):
    """Convert a WebVTT subtitle file into a SubRip file next to it.

    The output path is *vtt_file* with its extension replaced by ``.srt``.
    Captions are numbered from 1 and HTML entities in their text are
    unescaped.

    Args:
        vtt_file: Path of the input file; its extension must be ``.vtt``
            (case-insensitive).

    Raises:
        Exception: If *vtt_file* does not have a ``.vtt`` extension. A
            "Skipping ..." line is written to stderr first.
    """
    file_name, file_extension = os.path.splitext(vtt_file)

    if file_extension.lower() != ".vtt":
        sys.stderr.write("Skipping %s.\n" % vtt_file)
        raise Exception("VTT file could not be found.")

    # The context manager closes (and therefore flushes) the output even if
    # reading or converting a caption fails part-way through.
    with open(file_name + ".srt", "w") as srt:
        for index, caption in enumerate(WebVTT().read(vtt_file), start=1):
            start = SubRipTime(0, 0, caption.start_in_seconds)
            end = SubRipTime(0, 0, caption.end_in_seconds)
            item = SubRipItem(index, start, end, html.unescape(caption.text))
            srt.write(str(item) + "\n")
Example #11
0
def create_srt(split_df, cris_stt_df):
    """Build ``stt_converted.srt`` from a split CSV and a transcript sheet.

    The CSV at *split_df* and the Excel file at *cris_stt_df* are
    inner-joined on ``wav_name`` (the CSV's ``wav_filename`` column is
    renamed first). Each merged row becomes one subtitle whose timing comes
    from the ``start``/``end`` columns, passed as milliseconds, and whose
    text is the ``transcripts`` value with one trailing "." removed.
    Values of type float in ``transcripts`` become empty text.

    The .srt file is saved in the directory containing *split_df*.
    """
    output_dir = os.path.dirname(split_df)
    splits = pd.read_csv(split_df)
    transcripts = pd.read_excel(cris_stt_df)
    splits.rename(columns={'wav_filename': 'wav_name'}, inplace=True)
    # Only rows present in both inputs carry everything an item needs.
    merged = pd.merge(splits, transcripts, how='inner', on='wav_name')

    print("Creating the srt:")
    subtitles = SubRipFile()
    for position, record in merged.iterrows():
        if type(record['transcripts']) != float:
            caption = str(record['transcripts'])
        else:
            # A float here is treated as "no transcript".
            caption = str("")
        if caption.endswith("."):
            caption = caption[:-1]
        entry = SubRipItem(index=position + 1,
                           start=SubRipTime(milliseconds=record['start']),
                           end=SubRipTime(milliseconds=record['end']),
                           text=caption)
        subtitles.append(entry)

    destination = os.path.join(output_dir, "stt_converted.srt")
    subtitles.save(destination)
    print("successfully written")
Example #12
0
    def open(cls, path='', encoding=None, error_handling=ERROR_PASS,
             file_descriptor=None, eol=None):
        """
        open([path, [encoding]])

        Parse a subtitle file and return a new instance of `cls` holding
        every item that could be parsed.

        `path` -> File to read. It is only opened (mode 'rU') when no
            `file_descriptor` is given; it is always passed to the new
            instance.
        `encoding` -> Encoding of the source. If you do not provide one, it
            is obtained from `cls.detect_encoding()` (per the original note:
            detected from a byte order mark, otherwise utf-8).
        `error_handling` -> Forwarded to `cls._handle_error()` whenever an
            item raises InvalidItem.
        `file_descriptor` -> Already opened file object to read from
            instead of opening `path`.
        `eol` -> End of line marker for the new instance. When not given,
            `cls._extract_newline()` is asked for it; the instance's `eol`
            is only set when the result is not None.

        `close()` is called on the source file object before returning; it
        is not reached if an exception propagates out of the parsing loop.
        """
        if file_descriptor is None:
            source_file = open(path, 'rU')
        else:
            source_file = file_descriptor

        # Detection needs the raw source, so it happens before wrapping.
        encoding = encoding or cls.detect_encoding(source_file)
        # From here on, lines read from source_file are byte strings
        # transcoded from `encoding` to cls.DEFAULT_ENCODING.
        source_file = codecs.EncodedFile(source_file, cls.DEFAULT_ENCODING,
                                         encoding)

        new_file = cls(path=path, encoding=encoding)
        string_buffer = StringIO()
        # The chained '\n' acts as a final blank line so the last item is
        # flushed even when the file does not end with one.
        for index, line in enumerate(chain(source_file, '\n')):
            if line.strip():
                string_buffer.write(line)
            else:
                # Blank line: everything buffered so far is one item.
                string_buffer.seek(0)
                source = string_buffer.read()
                if source.strip():
                    try:
                        try:
                            source = source.decode(cls.DEFAULT_ENCODING)
                            new_item = SubRipItem.from_string(source)
                            new_file.append(new_item)
                        except InvalidItem, error:
                            cls._handle_error(error, error_handling, path, index)
                    finally:
                        # Always reset the buffer, whether or not the item
                        # parsed, so the next item starts clean.
                        string_buffer.truncate(0)

        eol = eol or cls._extract_newline(source_file)
        if eol is not None:
            new_file.eol = eol
        # NOTE(review): this is called on the codecs wrapper even when the
        # caller supplied file_descriptor; presumably that closes the
        # caller's object too -- confirm this is intended.
        source_file.close()
        return new_file
Example #13
0
    def stream(cls, source_file, error_handling=ERROR_PASS):
        """
        stream(source_file, [error_handling])

        Lazily parse `source_file` and yield one SubRipItem per subtitle
        block, as soon as the block is complete. Nothing is accumulated, so
        this works like a SAX-style parser for .srt files.

        `source_file` -> Any iterable that yields unicode strings, like a
            file opened with `codecs.open()` or an array of unicode.
        `error_handling` -> Passed through to `cls._handle_error()` when a
            block cannot be turned into an item.

        Example:
            >>> from pysrt import SubRipFile
            >>> import codecs
            >>> file = codecs.open('movie.srt', encoding='utf-8')
            >>> for sub in SubRipFile.stream(file):
            ...     sub.text += "\nHello !"
            ...     print unicode(sub)
        """
        # Workaround kept from the original author: under Python 2.5 the
        # read was seen returning the second line of the file instead of
        # the first character (probably a buffering bug in the codecs
        # module). Reading one character and seeking back is the only fix
        # that was found.
        if hasattr(source_file, 'seek'):
            start_offset = source_file.tell()
            source_file.read(1)
            source_file.seek(start_offset)

        pending = []
        # The extra u'\n' guarantees the last block is flushed even when
        # the input does not end with a blank line.
        for line_number, raw_line in enumerate(chain(source_file, u'\n')):
            if raw_line.strip():
                pending.append(raw_line)
                continue

            # Blank line: the collected lines form one candidate block.
            block, pending = pending, []
            if not (block and all(block)):
                continue

            try:
                yield SubRipItem.from_lines(block)
            except Error, error:
                # Attach the offending text so the handler can report it.
                error.args += (''.join(block), )
                cls._handle_error(error, error_handling, line_number)
Example #14
0
    def stream(cls, source_file, error_handling=ERROR_PASS):
        """
        stream(source_file, [error_handling])

        Generator yielding SubRipItem instances one at a time, each as soon
        as its lines have been read, without keeping them around. It is a
        kind of SAX parser for .srt files.

        `source_file` -> Any iterable that yields unicode strings, like a
            file opened with `codecs.open()` or an array of unicode.
        `error_handling` -> Handed to `cls._handle_error()` together with
            the error and the current line index when parsing fails.

        Example:
            >>> import pysrt
            >>> import codecs
            >>> file = codecs.open('movie.srt', encoding='utf-8')
            >>> for sub in pysrt.stream(file):
            ...     sub.text += "\nHello !"
            ...     print unicode(sub)
        """
        # Workaround inherited from the original author: under Python 2.5
        # the read was observed to return the second line of the file
        # instead of the first character, probably because of a buffering
        # bug in the codecs module. Reading a single character and
        # rewinding was the only fix found.
        if hasattr(source_file, 'seek'):
            rewind_to = source_file.tell()
            source_file.read(1)
            source_file.seek(rewind_to)

        collected = []
        # A trailing u'\n' is chained on so the final block is always closed.
        for position, text_line in enumerate(chain(source_file, u'\n')):
            is_separator = not text_line.strip()
            if not is_separator:
                collected.append(text_line)
                continue

            # Separator reached: take ownership of the block, reset buffer.
            item_lines = collected
            collected = []
            if item_lines and all(item_lines):
                try:
                    yield SubRipItem.from_lines(item_lines)
                except Error, error:
                    # Append the raw block text to the error for reporting.
                    error.args += (''.join(item_lines), )
                    cls._handle_error(error, error_handling, position)
Example #15
0
# Command-line entry: convert every FILE argument ending in .vtt into a
# .srt file with the same base name.
script = sys.argv[0]
args = sys.argv[1:]


def usage():
    """Return the one-line usage message for this script."""
    return "%s FILE...\n" % os.path.basename(script)


if not args:
    sys.stderr.write(usage())
    sys.exit(1)

for arg in args:
    file_name, file_extension = os.path.splitext(arg)

    # Anything that is not a .vtt file is reported and skipped.
    if file_extension.lower() != ".vtt":
        sys.stderr.write("Skipping %s.\n" % arg)
        continue

    # The context manager closes (and flushes) each output file before the
    # next argument is processed, even if a conversion fails part-way.
    with open(file_name + ".srt", "w") as srt:
        for index, caption in enumerate(WebVTT().read(arg), start=1):
            start = SubRipTime(0, 0, caption.start_in_seconds)
            end = SubRipTime(0, 0, caption.end_in_seconds)
            item = SubRipItem(index, start, end, html.unescape(caption.text))
            srt.write(str(item) + "\n")
Example #16
0
                 help="name of output file without extention (default: output)")

    parser.add_argument('--eol', type=str, default="###",
                 help="End of line marker (default: ###)")
    
    args=parser.parse_args()

    subs = pysrt.open(args.file)
    after_subs = []

    indx = 1
    iterator = iter(subs)

    while True:
        try:
            sub = next(iterator)
        except StopIteration:
            break

        if sub.text[-len(args.eol):] != args.eol:
            after_subs.append(SubRipItem(indx, sub.start, sub.end, sub.text))
        else:
            line2_sub = next(iterator)
            text = sub.text[:-len(args.eol)] + '\n' + line2_sub.text
            combined_sub = SubRipItem(indx, sub.start, line2_sub.end, text=text)
            after_subs.append(combined_sub)
        indx += 1

    after = SubRipFile(items=after_subs)
    after.save(args.output + '.srt', encoding='utf-8')
Example #17
0
 def test_verify_answer_and_statistics(self):
     # Checks verify_answer() against three subtitles in turn, then checks
     # the statistics gathered across the whole run.
     #
     # Each `data` dict maps a typed answer to the expected return value of
     # verify_answer(): a list of (symbol, is_correct) pairs.

     # Phase 1: no subtitle is set here, so this relies on whatever
     # subtitle the verificator already holds (presumably the one loaded by
     # the test fixture -- confirm).
     data = {
         u'hel': [(sym, True) for sym in u'hel'],
         u'hello': [(sym, True) for sym in u'Hello! '],
         u'hez': [(sym, False if sym == u'z' else True) for sym in u'hez'],
         u'hezl': [(sym, False if sym == u'z' else True) for sym in u'hez'],
         u'hezlo': [(sym, False if sym == u'z' else True)
                    for sym in u'hez'],
         u'hello ': [(sym, True) for sym in u'Hello! '],
         u'tests': [(u't', False)],
         u'hello worl': [(sym, True) for sym in u'Hello! worl'],
         u'hello world': [(sym, True) for sym in u'Hello! World! '],
         u"Hello! World! It's testing": [
             (sym, True) for sym in u"Hello! World! It's... testing. "],
         u'hello wordm': [(sym, True if sym != u'd' else False)
                          for sym in u'Hello! word'],
         u"Hello! World! it'": [(sym, True)
                                for sym in u"Hello! World! it'"],
         u"mello!": [(u'm', False)],
     }
     msg = u"On verifying of {0}.\nExpected: {1}.\nReturned: {2}."
     for src, result in data.iteritems():
         returned = self.verificator.verify_answer(src)
         self.assertEqual(returned, result,
                          msg=msg.format(src, result, returned))
     # From here on the failure message also names the question ({3}).
     msg = u'\n'.join([u'For question: {3}.', msg])

     # Phase 2: a subtitle with a speaker prefix and a bracketed part.
     question = u'PERSON: Hello, [ emotions ] world! ! !'
     self.verificator.clear_subtitle(complete=True)
     self.verificator.set_subtitle(
         SubRipItem(text=question))
     data = {
         u'hel': [(sym, True) for sym in u'hel'],
         u'per': [(u'p', False)],
         u'hello': [(sym, True) for sym in u'Hello, '],
         u'hello world': [(sym, True) for sym in u'Hello, world! '],
         u'Hello, world': [(sym, True) for sym in u'Hello, world! '],
     }
     for src, result in data.iteritems():
         returned = self.verificator.verify_answer(src)
         self.assertEqual(returned, result,
                          msg=msg.format(src, result, returned, question))

     # Phase 3: a subtitle surrounded by spaced-out dots.
     question = u'. . .Hello . . . world. . .'
     self.verificator.clear_subtitle(complete=True)
     self.verificator.set_subtitle(
         SubRipItem(text=question))
     data = {
         u'hel': [(sym, True) for sym in u'hel'],
         u'hello': [(sym, True) for sym in u'.Hello '],
         u'Hello wor': [(sym, True) for sym in u'.Hello wor'],
         u'Hello world': [(sym, True) for sym in u'.Hello world. '],
         u"Hello world. it's": [(sym, False if sym == u'i' else True)
                                for sym in u'.Hello world. i'],
     }
     for src, result in data.iteritems():
         returned = self.verificator.verify_answer(src)
         self.assertEqual(returned, result,
                          msg=msg.format(src, result, returned, question))

     # Statistics: two hints are requested, then the third subtitle is
     # cleared with complete=True before the counters are read.
     # NOTE(review): the expected totals below presumably accumulate over
     # all three phases above -- confirm before editing any `data` entry.
     [self.verificator.hint(ans) for ans in (u'hel', u'hello w')]
     self.verificator.clear_subtitle(complete=True)
     stats = self.verificator.get_statistics()
     self.assertTrue(stats.get(u'learning_time', 0) > 0)
     for param, value in {
         u'completed_fragments': 3, u'hint_used': 2, u'mistakes': 8,
         u'total_chars': 108
     }.iteritems():
         self.assertEqual(stats.get(param, None), value,
                          msg=u'On verifying of {0}.'.format(param))