def _test_srt_to_scc_to_srt_conversion(self, srt_captions):
    """Round-trip SRT -> SCC -> SRT and compare the parsed caption sets.

    The SRT text is parsed, written as SCC, read back from SCC, written as
    SRT again and parsed once more.  The first and the last parsed caption
    sets must match within ``TOLERANCE_MICROSECONDS``.
    """
    original_set = SRTReader().read(srt_captions)

    # SRT caption set -> SCC text -> caption set
    scc_text = SCCWriter().write(original_set)
    set_from_scc = SCCReader().read(scc_text)

    # caption set -> SRT text -> caption set
    srt_text = SRTWriter().write(set_from_scc)
    round_tripped_set = SRTReader().read(srt_text)

    self.assertCaptionSetAlmostEquals(
        original_set, round_tripped_set, TOLERANCE_MICROSECONDS
    )
def test_multiple_lines_for_one_sentence(self, samples_srt_same_time):
    """Writing the parsed sample back to SRT must produce exactly three cues."""
    written_srt = SRTWriter().write(SRTReader().read(samples_srt_same_time))

    # Splitting on the "HH:MM:SS,mmm -->" start marker gives the text before
    # the first marker followed by one chunk per cue; drop the leading chunk.
    cue_chunks = re.split(r"\d{2}:\d{2}:\d{2},\d{3} -->", written_srt)[1:]

    assert len(cue_chunks) == 3
def test_srt_to_webvtt_conversion(self, sample_webvtt_from_srt, sample_srt):
    """SRT parsed and written as WebVTT must yield ``str`` equal to the sample."""
    webvtt_text = WebVTTWriter().write(SRTReader().read(sample_srt))

    assert isinstance(webvtt_text, str)
    self.assert_webvtt_equals(sample_webvtt_from_srt, webvtt_text)
def test_srt_to_dfxp_conversion(self):
    """SRT parsed and written as DFXP must be text and match ``SAMPLE_DFXP``.

    The comparison ignores styling and spans.
    """
    parsed_srt = SRTReader().read(SAMPLE_SRT)
    dfxp_text = DFXPWriter().write(parsed_srt)

    is_text = isinstance(dfxp_text, six.text_type)
    self.assertTrue(is_text)
    self.assertDFXPEquals(
        SAMPLE_DFXP,
        dfxp_text,
        ignore_styling=True,
        ignore_spans=True,
    )
def test_srt_to_dfxp_conversion(self, sample_dfxp, sample_srt):
    """SRT parsed and written as DFXP must yield ``str`` matching the sample.

    The comparison ignores styling and spans.
    """
    dfxp_text = DFXPWriter().write(SRTReader().read(sample_srt))

    assert isinstance(dfxp_text, str)
    self.assert_dfxp_equals(
        sample_dfxp,
        dfxp_text,
        ignore_styling=True,
        ignore_spans=True,
    )
def _srt_gen_from_url(base_url, end_time=3660, verbose=True):
    """Yield SRT text for consecutive request windows of ``base_url``.

    Each request passes the query parameter ``t='<t0>/<t1>'``.  The first
    window is ``0/60``; every following window starts one past the previous
    end and ends 60 later (``61/120``, ``121/180``, ...).  Iteration stops
    once the next window end would exceed ``end_time``.

    For every non-empty response the ``en-US`` captions are shifted by the
    end time of the last caption of the previously yielded chunk (0.0 for
    the first chunk), re-serialised as SRT, and yielded with every blank-line
    separator padded by one leading space.  An empty response yields ``''``.
    NOTE(review): the shifting presumably exists because each response is
    timed relative to its own window -- confirm against the service.

    Arguments:
        base_url: URL the captions are requested from.
        end_time: Upper bound for the window end (same unit as ``t``).
        verbose: When true, print the URL of each request before sending it.

    Raises:
        requests.HTTPError: via ``raise_for_status`` on a failed response.
    """
    dt = 60
    t0, t1 = 0, dt
    is_first_request = True
    last_end = 0.0
    has_next = True

    while has_next:
        window = '{}/{}'.format(t0, t1)
        if verbose:
            print('fetching captions from ' + base_url + '?t=' + window)

        res = requests.get(base_url, params={'t': window})
        res.raise_for_status()
        srt = res.text
        if is_first_request:
            # Only the first response gets its byte-order mark removed.
            srt = srt.replace(u'\ufeff', '')
            is_first_request = False

        t0 = t1 + 1
        t1 += dt
        has_next = t1 <= end_time

        if not srt:
            yield ''
            continue

        cc = CaptionConverter()
        cc.read(srt, SRTReader())
        captions = cc.captions.get_captions(lang='en-US')
        # The original special-cased the first chunk, but that branch could
        # never run (the flag was already cleared); shifting by the initial
        # 0.0 offset is what actually happened, so every chunk is shifted.
        for caption in captions:
            caption.start += last_end
            caption.end += last_end
        last_end = captions[-1].end

        yield cc.write(SRTWriter()).replace('\n\n', ' \n\n')
def route_subtitles(course_id, lecture_id):
    """Fetch a lecture's SRT subtitles and respond with them as WebVTT.

    The subtitles URL is built from ``course_id`` and ``lecture_id``.  When
    the downloaded text contains no captions the response body is empty.
    The response always has content type ``text/vtt``.
    """
    url_template = 'https://class.coursera.org/%s-001/lecture/subtitles?q=%d_en'
    upstream = requests.get(url_template % (course_id, lecture_id))

    try:
        caption_converter = CaptionConverter()
        caption_converter.read(upstream.text, SRTReader())
        vtt_body = caption_converter.write(WebVTTWriter())
    except CaptionReadNoCaptions:
        # No captions in the payload: serve an empty document instead.
        vtt_body = ''

    return Response(vtt_body, content_type='text/vtt')
def srt2ttml(srt_file_path, ttml_file_path=None):
    """Convert SubRip subtitles to TTML subtitles.

    The SubRip file is read as UTF-8 and the TTML output is written as
    UTF-8 encoded bytes.

    Arguments:
        srt_file_path {string} -- The path to the SubRip file.
        ttml_file_path {string} -- The path to the TTML file. When omitted,
            the SubRip path with its extension replaced by ".xml" is used.
    """
    import os

    converter = CaptionConverter()
    with open(srt_file_path, "r", encoding="utf8") as srt_file:
        converter.read(srt_file.read(), SRTReader())

    if ttml_file_path is None:
        # Swap only the final extension.  A plain str.replace(".srt", ...)
        # would also rewrite ".srt" inside directory names and would return
        # the input path itself (overwriting the source) for e.g. ".SRT".
        ttml_file_path = os.path.splitext(srt_file_path)[0] + ".xml"

    with open(ttml_file_path, "wb") as ttml_file:
        ttml_file.write(converter.write(DFXPWriter()).encode("utf-8"))
def convert_srt_to_dfxp(times, generate_output=False):
    """Time SRT -> DFXP conversion for every file in ``INPUT_DIRECTORY``.

    Each non-hidden file is read as UTF-8 and converted ``times`` times.
    Progress is written to stdout after each file, and a summary is printed
    at the end.  The reported average is the total conversion time over all
    files divided by ``times``.

    Arguments:
        times: Number of conversions to run per file.
        generate_output: When true, write ``<name>.xml`` into
            ``OUTPUT_DIRECTORY`` for each converted file unless a file with
            that name already exists there.

    Raises:
        UnicodeDecodeError: re-raised (after printing the path) when an
            input file is not valid UTF-8.
    """
    all_time = 0
    counter = 0
    skipped_files = 0
    input_files = os.listdir(INPUT_DIRECTORY)
    input_files_count = len(input_files)

    for input_file in input_files:
        if input_file.startswith('.'):
            # Skip hidden files and SRT master file
            skipped_files += 1
            continue

        filename = input_file.split('.')[0]
        file_path = '{}/{}'.format(INPUT_DIRECTORY, input_file)
        with open(file_path, 'r', encoding='utf-8') as fh:
            try:
                srt_data = fh.read()
            except UnicodeDecodeError:
                print("Problem with {}".format(file_path))
                raise

        total_file_time = 0
        dfxp_data = None
        for _ in range(times):
            t0 = timer()
            dfxp_data = DFXPWriter().write(SRTReader().read(srt_data))
            total_file_time += timer() - t0

        if generate_output and dfxp_data is not None:
            output_path = "{}/{}.xml".format(OUTPUT_DIRECTORY, filename)
            # Check the one path instead of listing the whole directory for
            # every input file.
            if not os.path.exists(output_path):
                # The input is read as UTF-8, so write UTF-8 explicitly
                # rather than relying on the platform default encoding.
                with open(output_path, "w", encoding="utf-8") as new_dfxp_file:
                    new_dfxp_file.write(dfxp_data)

        all_time += total_file_time
        counter += 1
        sys.stdout.write("\r{}/{} files completed.".format(
            counter, input_files_count))

    print(
        "\nConverting {} files took an average of {} seconds over {} iteration{}.\n{} files were skipped."
        .format(counter, all_time / times, times, "s" if times > 1 else "",
                skipped_files))
def _make_ts_from_srt(srt):
    """Turn SRT text into a list of transcript pieces split on ``>>``.

    The text is padded with spaces (at the end and before every blank-line
    separator), NFC-normalised and stripped of every character whose Unicode
    general category starts with ``C`` except newlines.  It is then converted
    to a transcript, in which ``'>>> '`` becomes ``'>>'`` and newlines become
    spaces, before being split on ``'>>'``.
    """
    padded = re.sub('$', ' ', srt).replace('\n\n', ' \n\n')
    normalized = unicodedata.normalize('NFC', padded)
    cleaned = ''.join(
        ch for ch in normalized
        if ch == '\n' or not unicodedata.category(ch).startswith('C')
    )

    converter = CaptionConverter()
    converter.read(cleaned, SRTReader())

    transcript = converter.write(TranscriptWriter())
    transcript = transcript.replace(u'>>> ', u'>>')
    transcript = transcript.replace('\n', ' ')
    return transcript.split('>>')
def srt2ttml(srt_file_path: str, ttml_file_path: Optional[str] = None) -> None:
    """Convert SubRip subtitles to TTML subtitles.

    The SubRip file is read with the encoding reported by
    ``Utils.detect_encoding`` and the TTML output is encoded with that same
    encoding.

    Arguments:
        srt_file_path {string} -- The path to the SubRip file.
        ttml_file_path {string} -- The path to the TTML file. When omitted,
            the SubRip path with its extension replaced by ".xml" is used.
    """
    import os

    file: Union[TextIO, BinaryIO]
    converter = CaptionConverter()
    encoding = Utils.detect_encoding(srt_file_path)

    with open(srt_file_path, "r", encoding=encoding) as file:
        converter.read(file.read(), SRTReader())

    if ttml_file_path is None:
        # Swap only the final extension.  A plain str.replace(".srt", ...)
        # would also rewrite ".srt" inside directory names and would return
        # the input path itself (overwriting the source) for e.g. ".SRT".
        ttml_file_path = os.path.splitext(srt_file_path)[0] + ".xml"

    with open(ttml_file_path, "wb") as file:
        file.write(converter.write(DFXPWriter()).encode(encoding))
def srt2sami(srt_file_path: str, sami_file_path: Optional[str] = None) -> None:
    """Convert SubRip subtitles to SAMI subtitles.

    The SubRip file is read with the encoding reported by
    ``Utils.detect_encoding`` and the SAMI output is encoded with that same
    encoding.

    Arguments:
        srt_file_path {string} -- The path to the SubRip file.
        sami_file_path {string} -- The path to the SAMI file. When omitted,
            the SubRip path with its extension replaced by ".smi" is used.
    """
    import os

    file: Union[TextIO, BinaryIO]
    converter = CaptionConverter()
    encoding = Utils.detect_encoding(srt_file_path)

    with open(srt_file_path, "r", encoding=encoding) as file:
        converter.read(file.read(), SRTReader())

    if sami_file_path is None:
        # Swap only the final extension.  A plain str.replace(".srt", ...)
        # would also rewrite ".srt" inside directory names and would return
        # the input path itself (overwriting the source) for e.g. ".SRT".
        sami_file_path = os.path.splitext(srt_file_path)[0] + ".smi"

    with open(sami_file_path, "wb") as file:
        file.write(converter.write(SAMIWriter()).encode(encoding))
def subtitle(request, title, no):
    """Serve subtitle number ``no`` of the film matching ``title`` as WebVTT.

    Parenthesised parts and the last character are removed from ``title``
    before searching subscene for English subtitles.  The chosen subtitle's
    zip is downloaded and its first member is decoded using the encoding
    BeautifulSoup reports, then converted from SRT to WebVTT.  The result is
    reduced to ASCII, HTML-unescaped and returned as ``text/vtt``.

    NOTE(review): this block uses Python 2 names (``unicode``,
    ``HTMLParser.HTMLParser``, ``StringIO`` over response content).
    """
    search_title = re.sub(r'\(.*?\)', '', title)[:-1]
    film = subscene.search(search_title, "English")

    chosen_subtitle = film.subtitles[int(no)]
    download = requests.get(subscene.zipped_url(chosen_subtitle))
    archive = zipfile.ZipFile(StringIO(download.content), 'r')
    first_member = archive.namelist()[0]
    srt = archive.read(first_member)

    # BeautifulSoup is used here only for the encoding it reports.
    detected_encoding = BeautifulSoup(srt).originalEncoding

    converter = CaptionConverter()
    unistring = unicode(srt.decode(detected_encoding))
    if "utf-8" in detected_encoding:
        # Drop the first character -- presumably a byte-order mark; confirm.
        unistring = unistring[1:]
    converter.read(unistring, SRTReader())

    ascii_vtt = converter.write(WebVTTWriter()).encode('ascii', 'ignore')
    unescaped_vtt = HTMLParser.HTMLParser().unescape(ascii_vtt)
    return HttpResponse(unescaped_vtt, content_type="text/vtt")
def convert_subtitles_to_vtt(input_file: str, output_file: str):
    """Convert .srt subtitles to .vtt for web playback.

    The input encoding is detected with chardet from the raw bytes, and the
    file is then re-read as text with that encoding.  The WebVTT output is
    written with the ``utf-8-sig`` encoding.

    Returns:
        True when the output was written; False when the input contained no
        readable captions (the failure is logged with its traceback).
    """
    logger.info(f'Converting {input_file} to {output_file}')

    with open(input_file, mode='rb') as raw_file:
        detected = chardet.detect(raw_file.read())
    source_encoding = detected['encoding']

    with open(input_file, mode='r', encoding=source_encoding) as srt_file:
        srt_text = srt_file.read()

    caption_converter = CaptionConverter()
    try:
        caption_converter.read(srt_text, SRTReader())
    except CaptionReadNoCaptions:
        logger.exception(f'Failed to convert {input_file} to {output_file}')
        return False  # Likely UTF-16 subtitles

    vtt_text = caption_converter.write(WebVTTWriter())
    with open(output_file, mode='w', encoding='utf-8-sig') as vtt_file:
        vtt_file.write(vtt_text)
    return True
def from_srt(input_f, output_f):
    """
    Takes an input SRT file or filename and writes out VTT contents to the
    given output file or filename

    The raw input is decoded with the encoding chardet detects when its
    confidence is at least 0.9, otherwise with cp1252.  If decoding fails,
    cp1252 and then utf8 are tried; the last ``UnicodeDecodeError`` is
    re-raised when every candidate fails.  The converted WebVTT text is
    written UTF-8 encoded with its final byte removed.
    """
    with vtt_open(input_f, 'r') as f:
        orig = f.read()

    detect = chardet.detect(orig)
    encoding = detect['encoding']
    default_subrip_encoding = 'cp1252'  # standard for SubRip files
    if detect['confidence'] < 0.9:
        encoding = default_subrip_encoding

    # caption converter seems to have a tough time with the BOM on
    # Python < 2.7.8, so ditch it if it exists.  Strip it from the raw bytes:
    # comparing the *decoded* text against the byte constant
    # codecs.BOM_UTF8 (as was done before) never matches.
    if orig.startswith(codecs.BOM_UTF8):
        orig = orig[len(codecs.BOM_UTF8):]

    candidates = [encoding, default_subrip_encoding, 'utf8']
    for attempt, candidate in enumerate(candidates, 1):
        # Parenthesised single-argument print is valid on Python 2 and 3.
        print("ENCODING: " + candidate)
        try:
            contents = orig.decode(candidate)
            break
        except UnicodeDecodeError:
            if attempt == len(candidates):
                raise

    converter = CaptionConverter()
    converter.read(contents, SRTReader())
    contents = converter.write(WebVTTWriter())

    with vtt_open(output_f, 'w') as o:
        o.write(contents.encode('utf-8')[:-1])
def dfxpconv(filename=str, ckeep=bool):
    """Convert an SRT file to a stripped-down DFXP file next to it.

    The SRT file is read as UTF-8 (undecodable bytes are ignored), converted
    to DFXP, post-processed with the textual replacements below, and written
    as UTF-8 bytes to ``filename`` with its extension replaced by ".dfxp".

    Arguments:
        filename: Path of the SRT file to convert.
        ckeep: When truthy the SRT file is kept; otherwise it is removed
            after the DFXP file has been written.

    NOTE(review): the parameter defaults are the ``str`` and ``bool`` types
    themselves (probably intended as annotations); they are left untouched
    to keep the call signature unchanged.
    """
    with open(filename, "r", encoding='utf-8', errors='ignore') as fsrt:
        srtcont = fsrt.read()

    # Super Netflix Compatibility
    # Converter that is used (DFXPWriter) uses a different set of rules
    # than what Super Netflix (and Netflix) wants.
    # Doing this will avoid the "M7034" error.
    # It will also remove any formatting as there is no such thing as
    # formatting in the Netflix Player.
    dfxpedit = DFXPWriter().write(SRTReader().read(srtcont))

    # (old, new) pairs applied in order.
    # NOTE(review): the first pattern contains `xmlns:=`; confirm it matches
    # what DFXPWriter actually emits.
    replacements = (
        ("<tt xml:lang=\"en\" xmlns:=\"http://www.w3.org/ns/ttml\" "
         "xmlns:tts=\"http://www.w3.org/ns/ttml#styling\">",
         "<tt xml:lang='en' xmlns='http://www.w3.org/2006/10/ttaf1' "
         "xmlns:tts='http://www.w3.org/2006/10/ttaf1#style'>"),
        ("<div region=\"bottom\" xml:lang=\"en-US\">",
         "<div xml:id=\"captions\">"),
        ("<font face=\"Open Sans Semibold\" size=\"36\">", ""),
        ("</font>", ""),
        (" region=\"bottom\" style=\"default\"", ""),
        ("<b>", ""),
        ("</b>", ""),
        ("<i>", ""),
        ("</i>", ""),
        ("{\\an8}", ""),
    )
    for old, new in replacements:
        dfxpedit = dfxpedit.replace(old, new)

    # Open the output only after the conversion succeeded so a failure does
    # not leave an empty .dfxp behind, and swap just the final extension so
    # ".srt" elsewhere in the path is not rewritten.
    dfxp_path = os.path.splitext(filename)[0] + ".dfxp"
    with open(dfxp_path, "wb") as fdfxp:
        fdfxp.write(dfxpedit.encode('utf-8', errors='replace'))

    if not ckeep:
        os.remove(filename)
def test_caption_length(self):
    """``SAMPLE_SRT`` must parse into exactly seven ``en-US`` captions."""
    caption_set = SRTReader().read(SAMPLE_SRT)
    english_captions = caption_set.get_captions(u"en-US")
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed in Python 3.12.
    self.assertEqual(7, len(english_captions))
def test_extra_empty_line(self):
    """``SAMPLE_SRT_BLANK_LINES`` must parse into exactly two ``en-US`` captions."""
    caption_set = SRTReader().read(SAMPLE_SRT_BLANK_LINES)
    english_captions = caption_set.get_captions("en-US")
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed in Python 3.12.
    self.assertEqual(2, len(english_captions))
def test_empty_file(self):
    """Reading ``SAMPLE_SRT_EMPTY`` must raise ``CaptionReadNoCaptions``."""
    reader = SRTReader()
    with self.assertRaises(CaptionReadNoCaptions):
        reader.read(SAMPLE_SRT_EMPTY)
def test_proper_timestamps(self):
    """The third ``en-US`` caption of ``SAMPLE_SRT`` has the expected times.

    Expected ``start`` is 17000000 and ``end`` is 18752000 (presumably
    microseconds -- confirm against the caption model).
    """
    caption_set = SRTReader().read(SAMPLE_SRT)
    third_caption = caption_set.get_captions(u"en-US")[2]
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed in Python 3.12.
    self.assertEqual(17000000, third_caption.start)
    self.assertEqual(18752000, third_caption.end)
def test_detection(self):
    """``SRTReader.detect`` must be truthy for the UTF-8 decoded ``SAMPLE_SRT``."""
    reader = SRTReader()
    decoded_sample = SAMPLE_SRT.decode(u'utf-8')
    detected = reader.detect(decoded_sample)
    self.assertTrue(detected)
def test_extra_trailing_empty_line(self):
    """``SAMPLE_SRT_TRAILING_BLANKS`` must parse into exactly two ``en-US`` captions."""
    caption_set = SRTReader().read(SAMPLE_SRT_TRAILING_BLANKS)
    english_captions = caption_set.get_captions(u"en-US")
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed in Python 3.12.
    self.assertEqual(2, len(english_captions))
def build_srt_reader():
    """Return a ``SubtitleReader`` wrapping a new ``SRTReader``.

    The wrapper is created with ``requires_language=True``.
    """
    srt_reader = SRTReader()
    return SubtitleReader(srt_reader, requires_language=True)
def test_extra_trailing_empty_line(self):
    """``SAMPLE_SRT_TRAILING_BLANKS`` must parse into exactly two ``en-US`` captions."""
    caption_set = SRTReader().read(SAMPLE_SRT_TRAILING_BLANKS)
    english_captions = caption_set.get_captions(u"en-US")
    caption_count = len(english_captions)
    self.assertEqual(2, caption_count)
def test_srt_to_microdvd_conversion(self, sample_microdvd, sample_srt):
    """SRT parsed and written as MicroDVD must yield ``str`` equal to the sample."""
    microdvd_text = MicroDVDWriter().write(SRTReader().read(sample_srt))

    assert isinstance(microdvd_text, str)
    self.assert_microdvd_equals(sample_microdvd, microdvd_text)
def test_proper_pcc_format(self):
    """The parsed ``SAMPLE_SRT`` exposes the expected top-level structure.

    Its keys must be exactly ``"captions"`` and ``"styles"``, and
    ``["captions"]["en-US"]`` must hold seven entries.
    """
    parsed = SRTReader().read(SAMPLE_SRT)
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed in Python 3.12; a set literal replaces set([...]).
    self.assertEqual({"captions", "styles"}, set(parsed.keys()))
    self.assertEqual(7, len(parsed["captions"]["en-US"]))
def test_numeric_captions(self):
    """``SAMPLE_SRT_NUMERIC`` must parse into exactly seven ``en-US`` captions."""
    caption_set = SRTReader().read(SAMPLE_SRT_NUMERIC)
    english_captions = caption_set.get_captions(u"en-US")
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed in Python 3.12.
    self.assertEqual(7, len(english_captions))
def test_srt_to_sami_conversion(self):
    """SRT parsed and written as SAMI must be text and match ``SAMPLE_SAMI``."""
    parsed_srt = SRTReader().read(SAMPLE_SRT)
    sami_text = SAMIWriter().write(parsed_srt)

    is_text = isinstance(sami_text, six.text_type)
    self.assertTrue(is_text)
    self.assertSAMIEquals(SAMPLE_SAMI, sami_text)
def test_caption_length(self):
    """The UTF-8 decoded ``SAMPLE_SRT`` must parse into eight ``en-US`` captions."""
    caption_set = SRTReader().read(SAMPLE_SRT.decode(u'utf-8'))
    english_captions = caption_set.get_captions(u"en-US")
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed in Python 3.12.
    self.assertEqual(8, len(english_captions))
def test_srt_to_webvtt_conversion(self):
    """SRT parsed and written as WebVTT must be text and match the sample."""
    parsed_srt = SRTReader().read(SAMPLE_SRT)
    webvtt_text = WebVTTWriter().write(parsed_srt)

    is_text = isinstance(webvtt_text, six.text_type)
    self.assertTrue(is_text)
    self.assertWebVTTEquals(SAMPLE_WEBVTT_FROM_SRT, webvtt_text)
def setUpClass(cls):
    """Parse the three SRT samples once and store the results on the class.

    ``SAMPLE_SRT`` and ``SAMPLE_SRT_UTF8`` are decoded from UTF-8 before
    being read; ``SAMPLE_SRT_UNICODE`` is passed to the reader as is.  Each
    sample is read with its own ``SRTReader`` instance.
    """
    plain_text = SAMPLE_SRT.decode(u'utf-8')
    cls.captions = SRTReader().read(plain_text)

    utf8_text = SAMPLE_SRT_UTF8.decode(u'utf-8')
    cls.captions_utf8 = SRTReader().read(utf8_text)

    cls.captions_unicode = SRTReader().read(SAMPLE_SRT_UNICODE)
def test_srt_reader_only_supports_unicode_input(self):
    """Reading the plain ``''`` literal must raise ``InvalidInputError``."""
    with self.assertRaises(InvalidInputError):
        reader = SRTReader()
        reader.read('')
def test_detection(self):
    """``SRTReader.detect`` must be truthy for ``SAMPLE_SRT``."""
    reader = SRTReader()
    detected = reader.detect(SAMPLE_SRT)
    self.assertTrue(detected)