def _srt_gen_from_url(base_url, end_time=3660, verbose=True):
    """Yield SRT text for consecutive time windows fetched from ``base_url``.

    The endpoint is queried with a ``t=<start>/<end>`` parameter, starting
    at ``0/60`` and advancing until the window end would exceed
    ``end_time``.  The timestamps of each window are shifted by the end
    time of the last caption of the previous window, so that the yielded
    chunks form one continuous timeline.

    Arguments:
        base_url {string} -- URL of the caption endpoint.
        end_time {int} -- Largest window end that is still requested.
        verbose {bool} -- Print the window being fetched.

    Yields:
        string -- The re-timed SRT text of one window ('' when the
        response body was empty).

    Raises:
        requests.HTTPError -- When a response has an error status.
    """
    dt = 60
    t0 = 0
    t1 = t0 + dt
    last_end = 0.0
    has_next = True
    while has_next:
        window = '{}/{}'.format(t0, t1)
        if verbose:
            print('fetching captions from ' + base_url + '?t=' + window)
        res = requests.get(base_url, params={'t': window})
        res.raise_for_status()
        # Drop byte-order marks from every response, not only the first one.
        srt = res.text.replace(u'\ufeff', '')

        t0 = t1 + 1
        t1 = t1 + dt
        has_next = t1 <= end_time

        if not srt:
            yield ''
            continue

        cc = CaptionConverter()
        cc.read(srt, SRTReader())
        captions = cc.captions.get_captions(lang='en-US')
        # last_end starts at 0.0, so the first window is left unshifted.
        for caption in captions:
            caption.start += last_end
            caption.end += last_end
        if captions:
            last_end = captions[-1].end
        yield cc.write(SRTWriter()).replace('\n\n', ' \n\n')
def toSrt(file_name):
    """Convert the DFXP file ``<file_name>.xml`` to ``<file_name>.srt``.

    Every '♪' character is removed from the transcript before it is parsed.

    Arguments:
        file_name {string} -- Path of the transcript without its extension.
    """
    with open(file_name + '.xml') as transcript:
        new_transcript = transcript.read().replace('♪', '')

    converter = CaptionConverter()
    converter.read(unicode(new_transcript), DFXPReader())
    # Convert before opening the target so that a failed conversion does
    # not leave an empty .srt file behind.
    string = converter.write(SRTWriter())

    with open(file_name + '.srt', 'w') as f:
        f.write(string)
def route_subtitles(course_id, lecture_id):
    """Serve the subtitles of a lecture as WebVTT.

    The SRT document is downloaded from the subtitles URL built from
    ``course_id`` and ``lecture_id`` and converted to WebVTT.  When the
    reader reports that the document holds no captions, an empty body is
    returned instead.

    Returns:
        Response -- The WebVTT text with content type ``text/vtt``.
    """
    url_template = 'https://class.coursera.org/%s-001/lecture/subtitles?q=%d_en'
    reply = requests.get(url_template % (course_id, lecture_id))

    try:
        caption_converter = CaptionConverter()
        caption_converter.read(reply.text, SRTReader())
        body = caption_converter.write(WebVTTWriter())
    except CaptionReadNoCaptions:
        body = ''

    return Response(body, content_type='text/vtt')
def srt2ttml(srt_file_path, ttml_file_path=None):
    """Convert SubRip subtitles to TTML subtitles.

    Arguments:
        srt_file_path {string} -- The path to the SubRip file.
        ttml_file_path {string} -- The path to the TTML file.  When omitted,
            it is derived from ``srt_file_path`` by swapping a trailing
            ``.srt`` extension for ``.xml`` (or appending ``.xml`` when the
            input has another extension).
    """
    import os

    converter = CaptionConverter()

    with open(srt_file_path, "r", encoding="utf8") as srt_file:
        converter.read(srt_file.read(), SRTReader())

    if ttml_file_path is None:
        # Only touch the real extension: str.replace() would rewrite every
        # ".srt" in the path and, when there is none, return the input
        # path itself, which would then be overwritten.
        root, ext = os.path.splitext(srt_file_path)
        if ext.lower() != ".srt":
            root = srt_file_path
        ttml_file_path = root + ".xml"

    with open(ttml_file_path, "wb") as ttml_file:
        ttml_file.write(converter.write(DFXPWriter()).encode("utf-8"))
def ttml2srt(ttml_file_path, srt_file_path=None):
    """Convert TTML subtitles to SubRip subtitles.

    Arguments:
        ttml_file_path {string} -- The path to the TTML file.
        srt_file_path {string} -- The path to the SubRip file.  When omitted,
            it is derived from ``ttml_file_path`` by swapping a trailing
            ``.xml`` extension for ``.srt`` (or appending ``.srt`` when the
            input has another extension).
    """
    import os

    converter = CaptionConverter()

    with open(ttml_file_path, "r", encoding="utf8") as ttml_file:
        converter.read(ttml_file.read(), DFXPReader())

    if srt_file_path is None:
        # Only touch the real extension: str.replace() would rewrite every
        # ".xml" in the path and, when there is none, return the input
        # path itself, which would then be overwritten.
        root, ext = os.path.splitext(ttml_file_path)
        if ext.lower() != ".xml":
            root = ttml_file_path
        srt_file_path = root + ".srt"

    with open(srt_file_path, "wb") as srt_file:
        srt_file.write(converter.write(SRTWriter()).encode("utf-8"))
def _make_ts_from_srt(srt):
    """Turn SRT text into a list of transcript pieces split on '>>'.

    The text is padded with spaces (at the '$' match positions and before
    every blank line), NFC-normalised and stripped of all characters whose
    Unicode category starts with 'C', except newlines.  The result is read
    as SRT and written as a transcript, in which '>>> ' is collapsed to
    '>>' and newlines become spaces before the final split.
    """
    padded = re.sub('$', ' ', srt)
    padded = padded.replace('\n\n', ' \n\n')
    normalized = unicodedata.normalize('NFC', padded)

    kept_chars = [
        ch for ch in normalized
        if ch == '\n' or not unicodedata.category(ch).startswith('C')
    ]

    converter = CaptionConverter()
    converter.read(''.join(kept_chars), SRTReader())

    transcript = converter.write(TranscriptWriter())
    transcript = transcript.replace(u'>>> ', u'>>')
    transcript = transcript.replace('\n', ' ')
    return transcript.split('>>')
def dfxpToSrt(myFile, keep, verbose):
    """Convert the DFXP file ``myFile`` to SRT in place.

    The original is renamed to ``<myFile>.dfxp`` and the SRT output is
    written under the original name.

    Arguments:
        myFile {string} -- Path of the DFXP file to convert.
        keep {bool} -- Keep the renamed ``.dfxp`` copy when true.
        verbose {bool} -- Print the rename and delete steps.
    """
    dfxp_path = "%s.dfxp" % myFile

    if verbose:
        print("--- Renaming to %s.dfxp" % myFile)
    os.rename(myFile, dfxp_path)
    # NOTE(review): this message is printed regardless of `verbose`, as in
    # the original code -- confirm whether it should be gated as well.
    print("--- Converting to srt")

    # Read the renamed copy; the handle is closed even if reading fails.
    with open(dfxp_path) as sourceFile:
        caps = sourceFile.read()

    converter = CaptionConverter()
    converter.read(caps, DFXPReader())
    # Convert before opening the target so that a failed conversion does
    # not leave an empty file under the original name.
    srt_text = converter.write(SRTWriter())

    with open(myFile, "w") as targetFile:
        targetFile.write(srt_text)

    if not keep:
        if verbose:
            print("--- Deleting temporary file %s.dfxp" % myFile)
        os.remove(dfxp_path)
def webvttToSrt(myFile, keep, verbose):
    """Convert the WebVTT file ``myFile`` to SRT in place.

    The original is renamed to ``<myFile>.webvtt`` and the SRT output is
    written under the original name.  Both files are handled as UTF-8.

    Arguments:
        myFile {string} -- Path of the WebVTT file to convert.
        keep {bool} -- Keep the renamed ``.webvtt`` copy when true.
        verbose {bool} -- Print the rename and delete steps.
    """
    webvtt_path = "%s.webvtt" % myFile

    if verbose:
        print("--- Renaming to %s.webvtt" % myFile)
    os.rename(myFile, webvtt_path)
    # NOTE(review): this message is printed regardless of `verbose`, as in
    # the original code -- confirm whether it should be gated as well.
    print("--- Converting to srt")

    # Read the renamed copy; the handle is closed even if reading fails.
    with codecs.open(webvtt_path, "r", encoding="utf8") as sourceFile:
        caps = sourceFile.read()

    converter = CaptionConverter()
    converter.read(caps, WebVTTReader())
    # Convert before opening the target so that a failed conversion does
    # not leave an empty file under the original name.
    srt_text = converter.write(SRTWriter())

    with codecs.open(myFile, "w", encoding="utf8") as targetFile:
        targetFile.write(srt_text)

    if not keep:
        if verbose:
            print("--- Deleting temporary file %s.webvtt" % myFile)
        os.remove(webvtt_path)
def ttml2srt(ttml_file_path: str, srt_file_path: Optional[str] = None) -> None:
    """Convert TTML subtitles to SubRip subtitles.

    The input encoding is detected with ``Utils.detect_encoding`` and the
    output is written with that same encoding.

    Arguments:
        ttml_file_path {string} -- The path to the TTML file.
        srt_file_path {string} -- The path to the SubRip file.  When omitted,
            it is derived from ``ttml_file_path`` by swapping a trailing
            ``.xml`` extension for ``.srt`` (or appending ``.srt`` when the
            input has another extension).
    """
    import os

    converter = CaptionConverter()
    encoding = Utils.detect_encoding(ttml_file_path)

    with open(ttml_file_path, "r", encoding=encoding) as ttml_file:
        converter.read(ttml_file.read(), DFXPReader())

    if srt_file_path is None:
        # Only touch the real extension: str.replace() would rewrite every
        # ".xml" in the path and, when there is none, return the input
        # path itself, which would then be overwritten.
        root, ext = os.path.splitext(ttml_file_path)
        if ext.lower() != ".xml":
            root = ttml_file_path
        srt_file_path = root + ".srt"

    with open(srt_file_path, "wb") as srt_file:
        srt_file.write(converter.write(SRTWriter()).encode(encoding))
def srt2sami(srt_file_path: str, sami_file_path: Optional[str] = None) -> None:
    """Convert SubRip subtitles to SAMI subtitles.

    The input encoding is detected with ``Utils.detect_encoding`` and the
    output is written with that same encoding.

    Arguments:
        srt_file_path {string} -- The path to the SubRip file.
        sami_file_path {string} -- The path to the SAMI file.  When omitted,
            it is derived from ``srt_file_path`` by swapping a trailing
            ``.srt`` extension for ``.smi`` (or appending ``.smi`` when the
            input has another extension).
    """
    import os

    converter = CaptionConverter()
    encoding = Utils.detect_encoding(srt_file_path)

    with open(srt_file_path, "r", encoding=encoding) as srt_file:
        converter.read(srt_file.read(), SRTReader())

    if sami_file_path is None:
        # Only touch the real extension: str.replace() would rewrite every
        # ".srt" in the path and, when there is none, return the input
        # path itself, which would then be overwritten.
        root, ext = os.path.splitext(srt_file_path)
        if ext.lower() != ".srt":
            root = srt_file_path
        sami_file_path = root + ".smi"

    with open(sami_file_path, "wb") as sami_file:
        sami_file.write(converter.write(SAMIWriter()).encode(encoding))
def main(argv):
    """Convert a Technoleads transcript to SCC and print it.

    Options:
        -h                  Print the usage line and exit.
        -i, --ifile <path>  Transcript to convert (read as iso-8859-1).
        -m                  Accepted by the parser; not used here.

    Arguments:
        argv -- Command-line arguments without the program name.

    Exits with status 2 on an unknown option or when no input file is
    given.
    """
    usage = 'script2scc.py -i <inputfile>'
    inputfile = ''

    try:
        opts, _args = getopt.getopt(argv, "hi:m", ["ifile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg

    if not inputfile:
        # Without this guard open('') fails with an unhelpful traceback.
        print(usage)
        sys.exit(2)

    with open(inputfile, 'r', encoding='iso-8859-1') as source:
        technoleads_content = source.read()

    converter = CaptionConverter()
    converter.read(technoleads_content, TechnoleadsReader(lang='fr'))
    print(converter.write(SCCWriter()))
def subtitle(request, title, no):
    """Return one English subtitle of a film as a WebVTT response.

    Parenthesised parts are removed from ``title`` before it is used as
    the search term.  The first file of the downloaded zip archive is
    decoded, converted from SRT to WebVTT, reduced to ASCII and
    HTML-unescaped.

    Arguments:
        request -- The incoming request (not used).
        title {string} -- Film title; may contain parenthesised parts.
        no -- Index into the subtitles of the search result; passed
            through ``int()``.

    Returns:
        HttpResponse -- The WebVTT text with content type ``text/vtt``.
    """
    # Strip trailing whitespace rather than blindly cutting the last
    # character, which truncated titles that have no parenthesised part.
    query = re.sub(r'\(.*?\)', '', title).rstrip()
    film = subscene.search(query, "English")

    archive_response = requests.get(
        subscene.zipped_url(film.subtitles[int(no)]))
    archive = zipfile.ZipFile(StringIO(archive_response.content), 'r')
    srt = archive.read(archive.namelist()[0])

    # BeautifulSoup is used here only for its encoding detection.
    soup = BeautifulSoup(srt)
    # NOTE(review): falling back to UTF-8 when no encoding is reported is an
    # assumption -- confirm it suits the subtitle sources in use.
    encoding = soup.originalEncoding or 'utf-8'
    unistring = srt.decode(encoding)
    # Drop a byte-order mark only when one is really there; removing the
    # first character unconditionally ate the first cue number of UTF-8
    # files without a BOM.
    if unistring.startswith(u'\ufeff'):
        unistring = unistring[1:]

    converter = CaptionConverter()
    converter.read(unistring, SRTReader())
    vtt = converter.write(WebVTTWriter()).encode('ascii', 'ignore')

    html_parser = HTMLParser.HTMLParser()
    return HttpResponse(html_parser.unescape(vtt), content_type="text/vtt")
def convert_subtitles_to_vtt(input_file: str, output_file: str):
    """Convert .srt subtitles to .vtt for web playback.

    The encoding of ``input_file`` is detected with chardet, the file is
    read with that encoding and the WebVTT result is written to
    ``output_file`` as ``utf-8-sig``.

    Returns True on success and False when the reader finds no captions.
    """
    logger.info(f'Converting {input_file} to {output_file}')

    with open(input_file, mode='rb') as raw_file:
        detected_encoding = chardet.detect(raw_file.read())['encoding']

    with open(input_file, mode='r', encoding=detected_encoding) as text_file:
        srt_text = str(text_file.read())

    caption_converter = CaptionConverter()
    try:
        caption_converter.read(srt_text, SRTReader())
    except CaptionReadNoCaptions:
        # Likely UTF-16 subtitles
        logger.exception(f'Failed to convert {input_file} to {output_file}')
        return False

    vtt_text = caption_converter.write(WebVTTWriter())
    with open(output_file, mode='w', encoding='utf-8-sig') as vtt_file:
        vtt_file.write(vtt_text)
    return True
def from_srt(input_f, output_f):
    """
    Takes an input SRT file or filename and writes out VTT contents to the
    given output file or filename.

    The encoding is guessed with chardet; a guess with a confidence below
    0.9 is replaced by cp1252.  When decoding fails, cp1252 and then utf8
    are tried before the last UnicodeDecodeError is re-raised.
    """
    with vtt_open(input_f, 'r') as f:
        orig = f.read()

    detect = chardet.detect(orig)
    encoding = detect['encoding']
    default_subrip_encoding = 'cp1252'  # standard for SubRip files
    if detect['confidence'] < 0.9:
        encoding = default_subrip_encoding

    # caption converter seems to have a tough time with the BOM on
    # Python < 2.7.8, so ditch it if it exists.  This must happen on the raw
    # bytes: once decoded, the text never compares equal to the byte BOM.
    if orig.startswith(codecs.BOM_UTF8):
        orig = orig[len(codecs.BOM_UTF8):]

    backups = [default_subrip_encoding, 'utf8']
    while True:
        try:
            print("ENCODING: " + encoding)
            contents = orig.decode(encoding)
            break
        except UnicodeDecodeError:
            if not backups:
                raise
            encoding = backups.pop(0)

    converter = CaptionConverter()
    converter.read(contents, SRTReader())
    contents = converter.write(WebVTTWriter())

    with vtt_open(output_f, 'w') as o:
        # The final byte of the encoded output is dropped, as before
        # (presumably a trailing newline -- confirm).
        o.write(contents.encode('utf-8')[:-1])
def convertText(inputfile, output_dir="/Users/lavanyasunder1/Subtitles/TXT"):
    """Extract the distinct text lines of every WebVTT file in a directory.

    Each file in ``inputfile`` is read as UTF-8, converted to SRT and split
    into lines.  Lines that contain a digit (which includes cue numbers
    and timestamps) or that equal '-->' are discarded.  The remaining
    lines are de-duplicated through a set, so their order is not
    preserved, then joined with spaces and written to
    ``<output_dir>/<file name>.txt``.

    Arguments:
        inputfile {string} -- Directory holding the WebVTT files.
        output_dir {string} -- Directory receiving the .txt files; the
            default is the previously hard-coded location.
    """
    timestamp_re = re.compile('([0-9]){2}:([0-9]){2}:([0-9]){2},([0-9]){3}')
    number_re = re.compile('([0-9])+')

    for sample_name in os.listdir(inputfile):
        with open(os.path.join(inputfile, sample_name), 'r') as f:
            read_data = f.read().decode("utf8")

        converter = CaptionConverter()
        converter.read(read_data, WebVTTReader())
        srt_text = converter.write(SRTWriter())

        # Only newline-terminated lines count; anything after the last
        # newline is ignored, exactly as the character loop did.
        lines = srt_text.split("\n")[:-1]
        words = set(
            line for line in lines
            if not (timestamp_re.search(line)
                    or number_re.search(line)
                    or line == '-->')
        )
        text = " ".join(words)

        out_path = os.path.join(output_dir, "%s.txt" % sample_name)
        with open(out_path, 'w+') as output_file:
            output_file.write(text.encode("utf-8"))
            output_file.write("\n")
from pprint import pprint

from pycaption import SCCReader
from pycaption import WebVTTWriter
from pycaption import SCCWriter
from pycaption import CaptionConverter
from pycaption import TechnoleadsReader

# Alternative input kept for reference: read an SCC file instead.
# file=open('journal.scc','r')
# scc_content = data=file.read()
# pycaps = SCCReader().read(scc_content, lang='fr')
# converter = CaptionConverter()
# converter.read(scc_content, SCCReader())
# print (converter.write(SCCWriter()))
# print (converter.write(WebVTTWriter()))
# pprint(pycaps)

# Read the Technoleads transcript (iso-8859-1) and print it as SCC.
# The context manager closes the file as soon as it has been read.
with open('conseiller_Le_GA00120742_MF0HP.txt', 'r', encoding='iso-8859-1') as file:
    technoleads_content = data = file.read()

converter = CaptionConverter()
converter.read(technoleads_content, TechnoleadsReader(lang='fr'))
# print (converter.write(WebVTTWriter()))
print(converter.write(SCCWriter()))