def merge_srt(chn_file, eng_file, output_file):
    """Merge two SRT files and write the result to ``output_file`` as UTF-8.

    ``chn_file`` and ``eng_file`` are parsed with ``SubRipFile.open`` and
    combined by ``merge_subtitle`` with a 500 ms delta.  If ``output_file``
    already exists as a regular file it is removed before saving.
    """
    tolerance = SubRipTime(milliseconds=500)
    first = SubRipFile.open(chn_file)
    second = SubRipFile.open(eng_file)
    merged = merge_subtitle(first, second, tolerance)

    already_there = os.path.isfile(output_file)
    if already_there:
        os.remove(output_file)
    merged.save(output_file, encoding='utf8')
def add_videos_to_index(subtitle_index, output_file, index):
    """Write a video-info CSV and add every subtitle word to ``index``.

    ``subtitle_index`` is a CSV whose rows are read positionally as
    title, filename, id, views, type.  For each row the subtitle file
    ``<filename>.en.srt`` is read; a row with the extra ``url`` and ``text``
    columns is written to ``output_file`` and every non-stopword of the
    subtitles is passed to ``add_to_index(index, word, url)``.

    Rows that fail for any reason are skipped (best effort), but the failure
    is now reported instead of being swallowed silently.

    :param subtitle_index: path of the input CSV
    :param output_file: path of the CSV to write
    :param index: index object handed to ``add_to_index``
    :return: ``index``
    """
    punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    stopwords_path = '/Users/connormendenhall/Python/DaveDaveFind/DaveDaveFind/data/stopwords.csv'
    # Loaded once, on the first row that gets that far.  It stays inside the
    # per-row try so that a missing stopword file still only skips rows.
    stopwords = None

    with open(subtitle_index, 'rb') as index_fh, open(output_file, 'wt') as info_fh:
        vindexReader = csv.reader(index_fh)
        vinfoWriter = csv.writer(info_fh)
        vinfoWriter.writerow(['title', 'filename', 'id', 'views', 'type', 'url', 'text'])
        for row in vindexReader:
            try:
                filename = row[1] + '.en.srt'
                url = 'http://www.youtube.com/watch?v=' + row[2]
                with open(filename) as srt_fh:
                    text_ascii = removeNonAscii(srt_fh.read())
                subtitles = SubRipFile.open(filename)
                vinfoWriter.writerow([row[0], row[1], row[2], row[3], row[4], url, text_ascii])
                if stopwords is None:
                    with open(stopwords_path, 'rb') as f:
                        # A set gives O(1) membership tests; '' filters out
                        # words that consist only of punctuation.
                        stopwords = set([''] + [entry[0] for entry in csv.reader(f)])
                for sentence in subtitles:
                    for word in sentence.text.split():
                        word = word.strip(punctuation).lower()
                        if word not in stopwords:
                            add_to_index(index, word, url)
            except Exception as exc:
                # Still best effort, but no longer hides KeyboardInterrupt /
                # SystemExit, and says which row was dropped.
                print("[add_videos_to_index()] Skipped row %r: %s" % (row, exc))
    print("[add_videos_to_index()] Videos added.")
    return index
# mostrarSubtitulos(self, escena, ruta): show subtitles from the SRT file at
# `ruta` (opened as UTF-8 so accented characters display) on top of `escena`.
# Reads self.ok, self.tmp (index of the current subtitle), self.offset,
# self.imprimir and self.entrar; text is drawn through self.printTexto() and
# the scene is redrawn through self.escena.draw().  The current subtitle's
# start/end are compared against pygame.time.get_ticks() - self.offset.
# When self.tmp equals the number of subtitles, self.tmp, self.ok and
# self.tiempoActual are reset to 0.
# Inline notes, translated: "with this encoding we get to see the accents",
# "when the end of the subtitles is reached", "redraws the scene",
# "prints the message".
# NOTE(review): tics_ini / tics_fin are built from minutes, seconds and
# milliseconds only -- the hours component is not included; confirm that
# subtitles never start after the first hour.
# NOTE(review): the file appears to be re-opened on every call while
# self.ok == 1 -- confirm against the original indentation.
def mostrarSubtitulos(self, escena, ruta): if (self.ok==1): self.escena= escena #subs = SubRipFile.open(ruta, encoding='iso-8859-1') subs = SubRipFile.open(ruta, encoding='UTF-8') # Con esta codificacion logramos ver los tildes #print("Hay" ,subs.__len__()," subtitulos") #print "SEGUNDOS=", cant_segs if (self.tmp== subs.__len__()): # cuando llega al final de los subtitulos #self.tmp= subs.__len__()-1 self.tmp= 0 self.ok= 0 #print("entro en tiempo " ,self.tiempoActual) self.tiempoActual= 0 linea= subs[self.tmp] tics_ini = (linea.start.minutes*60*1000)+(linea.start.seconds*1000)+linea.start.milliseconds tics_fin = (linea.end.minutes*60*1000)+(linea.end.seconds*1000)+linea.end.milliseconds if ((tics_ini<=(pygame.time.get_ticks()-self.offset)) and ((pygame.time.get_ticks()-self.offset)<=tics_fin)): if (self.imprimir==1): self.escena.draw() # reimprime la escena self.printTexto(linea.text) # imprime mensaje self.imprimir= 0 self.tmp= self.tmp+1 self.entrar= 1 else: if (self.entrar==1): self.printTexto("") self.imprimir= 1 self.entrar=0
def test_save(self):
    """Saving the windows-1252 fixture as UTF-8 with LF line endings must
    reproduce the UTF-8 fixture byte for byte."""
    srt_file = SubRipFile.open(self.windows_path, encoding='windows-1252')
    try:
        srt_file.save(self.temp_path, eol='\n', encoding='utf-8')
        with open(self.temp_path, 'rb') as saved, open(self.utf8_path, 'rb') as expected:
            self.assertEqual(saved.read(), expected.read())
    finally:
        # Clean up even when the save or the comparison fails.
        if os.path.exists(self.temp_path):
            os.remove(self.temp_path)
def __init__(self, filename):
    """Load the subtitles of ``filename`` into a ``Gtk.ListStore``.

    The file is first opened with pysrt's default encoding; on a
    ``UnicodeDecodeError`` it is retried as ISO-8859-1.  If the retry fails
    too, ``self.model`` is set to ``None``.  An ``IOError`` is reported
    through ``info`` and leaves the model empty.

    :param filename: path of the SRT file
    :raises FileNameError: if ``filename`` does not exist
    """
    self.filename = filename
    self.model = Gtk.ListStore(object, str)
    self.srt_model = []
    if not os.path.exists(filename):
        raise FileNameError(filename)
    try:
        self.srt_model = SubRipFile.open(path=filename)
    except UnicodeDecodeError as unic:
        debug(unic)
        try:
            info("trying ...", "ISO-8859-1")
            # Must be SubRipFile.open(): the bare constructor only records
            # the path and encoding and never reads the file, which left the
            # fallback with an empty subtitle list.
            self.srt_model = SubRipFile.open(path=filename, encoding="iso-8859-1")
        except Exception as excep:
            debug(excep)
            self.model = None
    except IOError as error:
        info("Impossible de lire le fichier de sous titre: error {}".format(error))
    # srt_model is still [] whenever loading failed, so this loop is then a
    # no-op and never touches a model that was set to None.
    for line in self.srt_model:
        self.model.append([line, line.text])
def test_windows1252(self):
    """The windows-1252 fixture has 1332 items and CRLF line endings, and
    the UTF-8 fixture cannot be decoded as ASCII."""
    srt_file = SubRipFile.open(self.windows_path, encoding='windows-1252')
    # assertEqual: the assertEquals alias is deprecated.
    self.assertEqual(len(srt_file), 1332)
    self.assertEqual(srt_file.eol, '\r\n')
    self.assertRaises(UnicodeDecodeError, SubRipFile.open,
                      self.utf8_path, encoding='ascii')
def input_file(self):
    """Return the parsed source subtitle file, loading it on first access.

    The encoding is guessed by ``detect`` from the raw bytes of
    ``self.arguments.file``; parse errors are logged
    (``SubRipFile.ERROR_LOG``).  The result is cached in
    ``self._source_file``.
    """
    if not hasattr(self, '_source_file'):
        # Read bytes, not text: charset detection needs the undecoded
        # content, and the handle is closed deterministically.
        with open(self.arguments.file, 'rb') as raw:
            encoding = detect(raw.read()).get('encoding')
        self._source_file = SubRipFile.open(
            self.arguments.file, encoding=encoding,
            error_handling=SubRipFile.ERROR_LOG)
    return self._source_file
def input_file(self):
    """Return the parsed source subtitle file, loading it on first access.

    On the first call the raw bytes of ``self.arguments.file`` are passed to
    ``detect``, the guessed encoding is passed through
    ``self.normalize_encoding`` and the file is opened with parse errors
    logged.  Later calls return the cached ``self._source_file``.
    """
    if hasattr(self, '_source_file'):
        return self._source_file

    source_path = self.arguments.file
    with open(source_path, 'rb') as raw:
        payload = raw.read()
    guessed = detect(payload).get('encoding')
    guessed = self.normalize_encoding(guessed)
    self._source_file = SubRipFile.open(
        source_path,
        encoding=guessed,
        error_handling=SubRipFile.ERROR_LOG,
    )
    return self._source_file
def test_eol_conversion(self):
    """Saving the CRLF fixture with ``eol='\\n'`` must produce a file whose
    only line ending is LF."""
    # Universal-newlines mode is kept from the original so that
    # file.newlines reports the line endings that were read.
    with open(self.windows_path, 'rU') as input_file:
        input_file.read()
        self.assertEqual(input_file.newlines, '\r\n')

    srt_file = SubRipFile.open(self.windows_path, encoding='windows-1252')
    srt_file.save(self.temp_path, eol='\n')

    with open(self.temp_path, 'rU') as output_file:
        output_file.read()
        self.assertEqual(output_file.newlines, '\n')
def test_eol_conversion(self):
    """Saving the CRLF fixture with ``eol='\\n'`` must produce a file whose
    only line ending is LF."""
    # Universal-newlines mode is kept from the original so that
    # file.newlines reports the line endings that were read.
    with open(self.windows_path, 'rU') as input_file:
        input_file.read()
        self.assertEqual(input_file.newlines, '\r\n')

    srt_file = SubRipFile.open(self.windows_path, encoding='windows-1252')
    srt_file.save(self.temp_path, eol='\n')

    with open(self.temp_path, 'rU') as output_file:
        output_file.read()
        self.assertEqual(output_file.newlines, '\n')
def test_eol_conversion(self):
    """Saving the CRLF fixture with ``eol="\\n"`` must produce a file whose
    only line ending is LF."""
    # Universal-newlines mode is kept from the original so that
    # file.newlines reports the line endings that were read.
    with open(self.windows_path, "rU") as input_file:
        input_file.read()
        self.assertEqual(input_file.newlines, "\r\n")

    srt_file = SubRipFile.open(self.windows_path, encoding="windows-1252")
    srt_file.save(self.temp_path, eol="\n")

    with open(self.temp_path, "rU") as output_file:
        output_file.read()
        self.assertEqual(output_file.newlines, "\n")
def input_file(self):
    """Return the parsed source subtitle file, loading it on first access.

    On the first call the raw bytes of ``self.arguments.file`` are passed to
    ``detect``, the guessed encoding is passed through
    ``self.normalize_encoding`` and the file is opened with parse errors
    logged.  Later calls return the cached ``self._source_file``.
    """
    if hasattr(self, '_source_file'):
        return self._source_file

    source_path = self.arguments.file
    with open(source_path, 'rb') as raw:
        payload = raw.read()
    guessed = detect(payload).get('encoding')
    guessed = self.normalize_encoding(guessed)
    self._source_file = SubRipFile.open(
        source_path,
        encoding=guessed,
        error_handling=SubRipFile.ERROR_LOG,
    )
    return self._source_file
def save(self, *args, **kwargs):
    """Save the episode, then rebuild its subtitle rows from the SRT file.

    After the regular model save, the file at ``self.subtitles.path`` is
    parsed and every item becomes one ``subtitle_set`` row holding
    ``start``/``end`` (the ``ordinal`` of the item's times) and ``text``.
    """
    super(Episode, self).save(*args, **kwargs)

    # Parse before touching the database, so a missing or unreadable file
    # no longer wipes the existing subtitles.
    subs = SubRipFile.open(self.subtitles.path)

    # Delete and re-create inside one transaction, so a failure part-way
    # through cannot leave the episode with no or partial subtitles.
    with transaction.commit_on_success():
        self.subtitle_set.all().delete()
        for sub in subs:
            self.subtitle_set.create(
                start=sub.start.ordinal,
                end=sub.end.ordinal,
                text=sub.text)
def generate_vocap_file(self):
    """Write the subtitles of ``self.srt_file`` to ``self.vocap_file``.

    Both names are relative to ``self.path``.  Each subtitle becomes one
    UTF-8 line ``###<start in whole seconds> <text>``.  Newlines inside the
    text are replaced by spaces, and any literal ``###`` in the text is
    rewritten to ``#.#.#`` so it cannot be mistaken for a line marker.
    """
    subs = SubRipFile.open(self.path + "/" + self.srt_file, encoding="utf-8")
    # The context manager closes the output even if a write fails.
    with codecs.open(self.path + "/" + self.vocap_file, "w", "utf-8") as fileobj:
        for sub in subs:
            text = sub.text.replace(u"###", u"#.#.#").replace(u"\n", u" ")
            start = (sub.start.seconds
                     + 60 * sub.start.minutes
                     + 3600 * sub.start.hours)
            fileobj.write(u"###%d %s\n" % (start, text))
def readSrt(input_file, output_file, input_language, output_language):
    """Translate an SRT file and replace it with a WebVTT version.

    Every subtitle text is run through ``translate``.  When the module-level
    flag ``both_language`` is set, the translation is appended in
    parentheses after the original text; otherwise it replaces the text.
    The result is saved to ``output_file`` as UTF-8 and converted to WebVTT.
    Finally ``input_file`` is renamed to ``input_file + ".old"``,
    ``output_file`` is deleted and the ``.vtt`` file takes the place of
    ``input_file``.
    """
    print('processing file', input_file)
    subtitles = SubRipFile.open(input_file)
    print(">", "read file", input_file)

    for entry in subtitles:
        source_text = entry.text
        print(source_text)
        translated = translate(source_text, input_language, output_language)
        entry.text = (source_text + " (" + translated + ")") if both_language else translated
        print(entry.text)

    subtitles.save(output_file, 'utf-8')
    WebVTT().from_srt(output_file).save()

    vtt_path = output_file.replace(".srt", ".vtt")
    os.rename(input_file, input_file + ".old")
    os.remove(output_file)
    os.rename(vtt_path, input_file)
    print(">", output_file, "saved!")
def add_videos_to_index(subtitle_index, output_file, index):
    """Write a video-info CSV and add every subtitle word to ``index``.

    ``subtitle_index`` is a CSV whose rows are read positionally as
    title, filename, id, views, type.  For each row the subtitle file
    ``<filename>.en.srt`` is read; a row with the extra ``url`` and ``text``
    columns is written to ``output_file`` and every non-stopword of the
    subtitles is passed to ``add_to_index(index, word, url)``.

    Rows that fail for any reason are skipped (best effort), but the failure
    is now reported instead of being swallowed silently.

    :param subtitle_index: path of the input CSV
    :param output_file: path of the CSV to write
    :param index: index object handed to ``add_to_index``
    :return: ``index``
    """
    punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    stopwords_path = '/Users/connormendenhall/Python/DaveDaveFind/DaveDaveFind/data/stopwords.csv'
    # Loaded once, on the first row that gets that far.  It stays inside the
    # per-row try so that a missing stopword file still only skips rows.
    stopwords = None

    with open(subtitle_index, 'rb') as index_fh, open(output_file, 'wt') as info_fh:
        vindexReader = csv.reader(index_fh)
        vinfoWriter = csv.writer(info_fh)
        vinfoWriter.writerow(
            ['title', 'filename', 'id', 'views', 'type', 'url', 'text'])
        for row in vindexReader:
            try:
                filename = row[1] + '.en.srt'
                url = 'http://www.youtube.com/watch?v=' + row[2]
                with open(filename) as srt_fh:
                    text_ascii = removeNonAscii(srt_fh.read())
                subtitles = SubRipFile.open(filename)
                vinfoWriter.writerow(
                    [row[0], row[1], row[2], row[3], row[4], url, text_ascii])
                if stopwords is None:
                    with open(stopwords_path, 'rb') as f:
                        # A set gives O(1) membership tests; '' filters out
                        # words that consist only of punctuation.
                        stopwords = set([''] + [entry[0] for entry in csv.reader(f)])
                for sentence in subtitles:
                    for word in sentence.text.split():
                        word = word.strip(punctuation).lower()
                        if word not in stopwords:
                            add_to_index(index, word, url)
            except Exception as exc:
                # Still best effort, but no longer hides KeyboardInterrupt /
                # SystemExit, and says which row was dropped.
                print("[add_videos_to_index()] Skipped row %r: %s" % (row, exc))
    print("[add_videos_to_index()] Videos added.")
    return index
def __init__(self, filename):
    """Load the subtitles of ``filename`` into a ``Gtk.ListStore``.

    The file is first opened with pysrt's default encoding; on a
    ``UnicodeDecodeError`` it is retried as ISO-8859-1.  If the retry fails
    too, ``self.model`` is set to ``None``.  An ``IOError`` is reported
    through ``info`` and leaves the model empty.

    :param filename: path of the SRT file
    :raises FileNameError: if ``filename`` does not exist
    """
    self.filename = filename
    self.model = Gtk.ListStore(object, str)
    self.srt_model = []
    if not os.path.exists(filename):
        raise FileNameError(filename)
    try:
        self.srt_model = SubRipFile.open(path=filename)
    except UnicodeDecodeError as unic:
        debug(unic)
        try:
            info("trying ...", "ISO-8859-1")
            # Must be SubRipFile.open(): the bare constructor only records
            # the path and encoding and never reads the file, which left the
            # fallback with an empty subtitle list.
            self.srt_model = SubRipFile.open(path=filename, encoding="iso-8859-1")
        except Exception as excep:
            debug(excep)
            self.model = None
    except IOError as error:
        info("Impossible de lire le fichier de sous titre: error {}".format(error))
    # srt_model is still [] whenever loading failed, so this loop is then a
    # no-op and never touches a model that was set to None.
    for line in self.srt_model:
        self.model.append([line, line.text])
def test_utf8(self):
    """The UTF-8 fixture has 1332 items; opening the windows-1252 fixture
    with the default encoding raises ``UnicodeDecodeError``."""
    # assertEqual: the assertEquals alias is deprecated.
    self.assertEqual(len(SubRipFile.open(self.utf8_path)), 1332)
    self.assertRaises(UnicodeDecodeError, SubRipFile.open, self.windows_path)
def test_save(self):
    """Saving the windows-1252 fixture as UTF-8 with LF line endings must
    reproduce the UTF-8 fixture byte for byte."""
    srt_file = SubRipFile.open(self.windows_path, encoding='windows-1252')
    try:
        srt_file.save(self.temp_path, eol='\n', encoding='utf-8')
        with open(self.temp_path, 'rb') as saved, open(self.utf8_path, 'rb') as expected:
            self.assertEqual(saved.read(), expected.read())
    finally:
        # Clean up even when the save or the comparison fails.
        if os.path.exists(self.temp_path):
            os.remove(self.temp_path)
"""Flatten subtitle files into one normalized text line per subtitle.

Reads SRT file paths from stdin (one per line) and writes every non-empty
normalized subtitle text to the file named by ``sys.argv[1]``.
"""
from pysrt import SubRipFile
import io
import sys
import itertools
from collections import defaultdict

# The output is written as UTF-8 explicitly.  The original wrapped each write
# in a bare ``except: pass``, which silently dropped every line that could
# not be encoded with the platform default encoding.
with io.open(sys.argv[1], "w+", encoding="utf-8") as target_file:
    for srt_path in sys.stdin:
        srt_path = srt_path.strip()
        subs = SubRipFile.open(srt_path, encoding='iso-8859-1')
        for sub in subs:
            text = sub.text.lower().strip()
            # Join multi-line subtitles and drop commas, dashes and periods.
            for old, new in (("\n", " "), (",", ""), ("-", ""), (".", "")):
                text = text.replace(old, new)
            if not text:
                continue
            target_file.write(text.strip() + u"\n")
def test_empty_file(self):
    """Opening an empty file in ERROR_RAISE mode yields zero items."""
    # Renamed from ``file`` so the builtin is not shadowed.
    srt_file = SubRipFile.open('/dev/null',
                               error_handling=SubRipFile.ERROR_RAISE)
    self.assertEqual(len(srt_file), 0)
def test_windows1252(self):
    """The windows-1252 fixture has 1332 items and CRLF line endings, and
    the UTF-8 fixture cannot be decoded as ASCII."""
    srt_file = SubRipFile.open(self.windows_path, encoding="windows-1252")
    # assertEqual: the assertEquals alias is deprecated.
    self.assertEqual(len(srt_file), 1332)
    self.assertEqual(srt_file.eol, "\r\n")
    self.assertRaises(UnicodeDecodeError, SubRipFile.open,
                      self.utf8_path, encoding="ascii")
def test_save(self):
    """Saving the windows-1252 fixture as UTF-8 with LF line endings must
    reproduce the UTF-8 fixture byte for byte."""
    srt_file = SubRipFile.open(self.windows_path, encoding="windows-1252")
    try:
        srt_file.save(self.temp_path, eol="\n", encoding="utf-8")
        with open(self.temp_path, "rb") as saved, open(self.utf8_path, "rb") as expected:
            self.assertEqual(saved.read(), expected.read())
    finally:
        # Clean up even when the save or the comparison fails.
        if os.path.exists(self.temp_path):
            os.remove(self.temp_path)
def test_length(self):
    """``capability_tester.srt`` parses into 37 items."""
    path = os.path.join(self.base_path, "capability_tester.srt")
    # Renamed from ``file`` so the builtin is not shadowed.
    srt_file = SubRipFile.open(path)
    self.assertEqual(len(srt_file), 37)
def test_file_with_empty_items(self):
    """``empty.srt`` parses into 7 items."""
    path = os.path.join(self.base_path, "empty.srt")
    # Renamed from ``file`` so the builtin is not shadowed.
    srt_file = SubRipFile.open(path)
    self.assertEqual(len(srt_file), 7)
def test_compare_from_string_and_from_path(self):
    """Parsing the UTF-8 fixture from its path and from its decoded content
    must give items with identical text representations."""
    # Close the fixture handle instead of leaving it to the garbage collector.
    with codecs.open(self.utf8_path, encoding='utf_8') as utf8_file:
        unicode_content = utf8_file.read()
    iterator = izip(SubRipFile.open(self.utf8_path),
                    SubRipFile.from_string(unicode_content))
    for file_item, string_item in iterator:
        self.assertEqual(unicode(file_item), unicode(string_item))
import glob # Change paths where applicable path = "/home/elastictest/srt/" x = glob.glob(path + '*.srt') EsPath = "http://192.168.1.48:2600/subtitles/subtitle/" for i in x: # This needs to change for the srt paths sonwell has as they are numbers (just get a key/val of the numbers) subsName = i subsName = subsName[:-4] subsName = subsName.replace(path, '') # // end needs to change subs = SubRipFile.open(i) for i, val in enumerate(subs): d = {} d['title'] = subsName h = str(subs[i].start.hours).zfill(2) m = str(subs[i].start.minutes).zfill(2) s = str(subs[i].start.seconds).zfill(2) ms = str(subs[i].start.milliseconds).zfill(3) hms = '%s:%s:%s,%s' % (h, m, s, ms) d['startTime'] = hms h = str(subs[i].end.hours).zfill(2) m = str(subs[i].end.minutes).zfill(2) s = str(subs[i].end.seconds).zfill(2) ms = str(subs[i].end.milliseconds).zfill(3) hms = '%s:%s:%s,%s' % (h, m, s, ms) d['endTime'] = hms
def input_file(self):
    """Return the parsed source subtitle file, loading it on first access.

    The file named by ``self.arguments.file`` is opened with parse errors
    logged (``SubRipFile.ERROR_LOG``) and cached in ``self._source_file``.
    """
    if hasattr(self, '_source_file'):
        return self._source_file

    self._source_file = SubRipFile.open(
        self.arguments.file,
        error_handling=SubRipFile.ERROR_LOG,
    )
    return self._source_file
opts, args = getopt.getopt(sys.argv[1:], 'hd:e:', ["help", "encoding=", "delta="]) except getopt.GetoptError, err: print str(err) usage() sys.exit(2) #Settings default values delta = SubRipTime(milliseconds=500) encoding="utf_8" #- if len(args) <> 3: usage() sys.exit(2) for o, a in opts: if o in ("-d", "--delta"): delta = SubRipTime(milliseconds=int(a)) elif o in ("-e", "--encoding"): encoding = a elif o in ("-h", "--help"): usage() sys.exit() subs_a = SubRipFile.open(args[0], encoding=encoding) subs_b = SubRipFile.open(args[1], encoding=encoding) out = merge_subtitle(subs_a, subs_b, delta) out.save(args[2], encoding=encoding) if __name__ == "__main__": main()
#!/usr/bin/python
"""Print the text and the start milliseconds of every subtitle in the SRT
file given as the first command-line argument."""
from pysrt import SubRipFile
import sys

for entry in SubRipFile.open(sys.argv[1]):
    print(entry.text)
    print(entry.start.milliseconds)
def main():
    """Match IMDB quotes against subtitle timings and write ``quotes.txt``.

    Works inside the directory given as ``sys.argv[1]``, which must contain
    ``quotes.htm`` (an IMDB quotes page), ``subtitles.srt`` and
    ``project.xml`` (whose root element carries ``fps`` and ``frames``).

    A quote is matched when the first line of a subtitle (lower-cased,
    punctuation removed, at least three space-separated words) occurs inside
    the cleaned quote.  Each match is printed and written to ``quotes.txt``
    as ``<relative position>#<quote with newlines replaced by '#'>``.
    """
    os.chdir(sys.argv[1])

    raw_input("are the subtitle timings correct?".upper())

    # ##### extract quotes from IMDB html-file #####
    with open(r"quotes.htm", "r") as f:
        parser = etree.HTMLParser()
        tree = etree.parse(f, parser)
    root = tree.getroot()

    # (pattern, replacement) pairs applied in order to each quote's markup.
    # Raw strings keep the regexes free of invalid string escapes.
    cleanups = (
        (r"\<div.*\>\n", ""),
        (r"\</div.*\>", ""),
        # names
        (r'\<b\>\<a.*"\>', ""),
        (r"\</a\>\</b\>:\n", ": "),
        # share this quote
        (r"\<p.*\>.*\</p\>", ""),
        (r"\<span.*\>.*\</span\>", ""),
        (r"\[.*\]", ""),  # stage directions
        (r"\<br/\>", ""),
        # NOTE(review): as written this replaces a space with a space; the
        # pattern was presumably a non-breaking space or "&nbsp;" -- confirm.
        (" ", " "),
    )

    quotes = []
    for div in root.xpath("//div"):
        try:
            if div.attrib["class"] != "sodatext":
                continue
            s = etree.tostring(div)
            for pattern, replacement in cleanups:
                s = re.sub(pattern, replacement, s)

            lines = [line.strip() for line in s.split("\n")]
            lines = [line for line in lines if len(line) > 0]
            if len(lines) == 1:
                # Single-line quote: drop its first character.
                lines[0] = lines[0][1:].strip()
            quotes.append("\n".join(lines))
        except Exception:
            # Best effort: skip divs without a class attribute or whose
            # markup cannot be cleaned, without hiding KeyboardInterrupt.
            continue

    quotes = list(set(quotes))

    punctuation_run = re.compile("[%s]+" % re.escape(string.punctuation))
    quotes_clean = [punctuation_run.sub("", x).lower().strip() for x in quotes]

    # ##### read subtitles from srt-file #####
    subs = SubRipFile.open('subtitles.srt')

    timecode_quote = {}
    for item in subs:
        item.text = punctuation_run.sub("", item.text).lower().strip()
        text = item.text.split("\n")[0]  # first line only
        # we'll get a lot of false hits with only one word :/
        if len(text.split(" ")) < 3:
            continue
        for i, quote in enumerate(quotes_clean):
            if text in quote and quotes[i] not in timecode_quote.values():
                timecode_quote[str(item.start)] = quotes[i]

    # ##### relate timecodes to the movie length #####
    tree = et.parse("project.xml")
    movie = tree.getroot()
    fps = float(movie.attrib["fps"])
    frames = float(movie.attrib["frames"])
    seconds = frames / fps

    # sort by timecode
    timecodes = sorted(timecode_quote)

    with open("quotes.txt", "w") as f:
        for tc in timecodes:
            print("%.1f%% %s" % (100 * timecode_to_seconds(tc) / seconds, tc))
            print(timecode_quote[tc])
            print("")
            f.write("%f#%s\n" % (timecode_to_seconds(tc) / seconds,
                                 timecode_quote[tc].replace("\n", "#")))

    print("<< %d QUOTES >>" % len(timecodes))
except getopt.GetoptError, err: print str(err) usage() sys.exit(2) #Settings default values delta = SubRipTime(milliseconds=500) encoding = "utf_8" #- if len(args) <> 3: usage() sys.exit(2) for o, a in opts: if o in ("-d", "--delta"): delta = SubRipTime(milliseconds=int(a)) elif o in ("-e", "--encoding"): encoding = a elif o in ("-h", "--help"): usage() sys.exit() subs_a = SubRipFile.open(args[0], encoding=encoding) subs_b = SubRipFile.open(args[1], encoding=encoding) out = merge_subtitle(subs_a, subs_b, delta) out.save(args[2], encoding=encoding) if __name__ == "__main__": main()
def input_file(self):
    """Return the parsed source subtitle file, loading it on first access.

    The file named by ``self.arguments.file`` is opened with parse errors
    logged (``SubRipFile.ERROR_LOG``) and cached in ``self._source_file``.
    """
    if hasattr(self, '_source_file'):
        return self._source_file

    self._source_file = SubRipFile.open(
        self.arguments.file,
        error_handling=SubRipFile.ERROR_LOG,
    )
    return self._source_file
"""Print the text and timings of a few subtitles from ``14Blades.srt``."""
from pysrt import SubRipFile, SubRipTime

# We still have to work out how to use SubRipTime, which will solve the
# reading for us.
subs = SubRipFile.open("14Blades.srt", encoding="iso-8859-1")

print("Hay", len(subs), " subtitulos")

# The first two subtitles are reported in seconds only.
for position in (0, 1):
    linea = subs[position]
    print(linea.text)
    print("inicio", linea.start.seconds, " segundos.")
    print("fin", linea.end.seconds, " segundos.")

linea = subs[14]
print(linea.text)
print("inicio", linea.start.minutes, "minutos con", linea.start.seconds, "segundos.")
# Fixed: the end line used to print start.minutes next to end.seconds.
print("fin", linea.end.minutes, "minutos con", linea.end.seconds, "segundos.")

# equivalent
# part = subs.slice(ends_after=SubRipTime(0, 0, 40))
# part = subs.slice(ends_after=(0, 0, 40))
# part = subs.slice(ends_after={'seconds': 40})

# part.shift(seconds=-2)
# subs.save('other/path.srt', 'utf-8');
def __test_encoding(self, encoding):
    """Open the fixture named ``encoding`` under ``self.base_path`` and
    check that it has 7 items, the first of which has index 1."""
    srt_file = SubRipFile.open(os.path.join(self.base_path, encoding))
    # assertEqual: the assertEquals alias is deprecated.
    self.assertEqual(len(srt_file), 7)
    self.assertEqual(srt_file[0].index, 1)
def merge_video_subtitle(video_id):
    """
    Convert the Chinese and English VTT subtitles of ``video_id`` to SRT,
    then merge them into a single SRT subtitle file.

    :param video_id: primary key of the ``Video`` to process
    :return: path of the merged subtitle file, or ``False`` when the video
        lacks either the Chinese or the English subtitle
    """
    video = Video.objects.get(pk=video_id)

    # Settings default values
    delta = SubRipTime(milliseconds=500)
    encoding = "utf_8"

    has_both = (video.subtitle_cn != '') & (video.subtitle_en != '')
    if not has_both:
        return False

    # Some YouTube titles contain non-ASCII characters, or characters such as
    # "/" that cannot appear in a file name, so they are passed through
    # Django's get_valid_filename().
    # Note: the result differs from what youtube-dl's restrictfilenames
    # produces, so merge_subs_filename may not match the names stored in
    # subtitle_cn / subtitle_en.  A "." in the title is still preserved.
    safe_title = get_valid_filename(video.title)

    # Convert the VTT subtitles to SRT.
    subs_cn_srt_path = os.path.join(
        YOUTUBE_DOWNLOAD_DIR,
        '%s-%s.cn.srt' % (safe_title, video.video_id))
    # (The earlier convert_file()-based conversion no longer works.)
    convert_subtilte_format(srt_file=video.subtitle_cn.path,
                            ass_file=subs_cn_srt_path)

    subs_en_srt_path = os.path.join(
        YOUTUBE_DOWNLOAD_DIR,
        '%s-%s.en.srt' % (safe_title, video.video_id))
    # NOTE(review): unlike the Chinese branch, the English path is replaced
    # by the converter's return value -- confirm it returns the output path.
    subs_en_srt_path = convert_subtilte_format(
        srt_file=video.subtitle_en.path, ass_file=subs_en_srt_path)

    merge_subs = merge_subtitle(
        SubRipFile.open(subs_cn_srt_path, encoding=encoding),
        SubRipFile.open(subs_en_srt_path, encoding=encoding),
        delta)

    merge_subs_path = os.path.join(
        YOUTUBE_DOWNLOAD_DIR,
        '%s-%s.zh-Hans.en.srt' % (safe_title, video.video_id))
    merge_subs.save(merge_subs_path, encoding=encoding)

    video.subtitle_merge = merge_subs_path
    video.save(update_fields=['subtitle_merge'])
    return merge_subs_path
def test_empty_file(self):
    """Opening an empty file in ERROR_RAISE mode yields zero items."""
    # Renamed from ``file`` so the builtin is not shadowed.
    srt_file = SubRipFile.open("/dev/null",
                               error_handling=SubRipFile.ERROR_RAISE)
    self.assertEqual(len(srt_file), 0)
def setUp(self):
    """Load the fixture ``tests/static/utf-8.srt`` (relative to
    ``file_path``) into ``self.file``."""
    fixture = os.path.join(file_path, "tests", "static", "utf-8.srt")
    self.file = SubRipFile.open(fixture)
def test_utf8(self):
    """The UTF-8 fixture has 1332 items; opening the windows-1252 fixture
    as UTF-8 raises ``UnicodeDecodeError``."""
    # assertEqual: the assertEquals alias is deprecated.
    self.assertEqual(len(SubRipFile.open(self.utf8_path)), 1332)
    self.assertRaises(UnicodeDecodeError, SubRipFile.open,
                      self.windows_path, encoding="utf_8")
def input_file(self):
    """Return the parsed source subtitle file, loading it on first access.

    The encoding is guessed by ``detect`` from the raw bytes of
    ``self.arguments.file``; parse errors are logged
    (``SubRipFile.ERROR_LOG``).  The result is cached in
    ``self._source_file``.
    """
    if not hasattr(self, '_source_file'):
        # Read bytes, not text: charset detection needs the undecoded
        # content, and the handle is closed deterministically.
        with open(self.arguments.file, 'rb') as raw:
            encoding = detect(raw.read()).get('encoding')
        self._source_file = SubRipFile.open(
            self.arguments.file, encoding=encoding,
            error_handling=SubRipFile.ERROR_LOG)
    return self._source_file
def test_compare_from_string_and_from_path(self):
    """Parsing the UTF-8 fixture from its path and from its decoded content
    must give items with identical text representations."""
    # Close the fixture handle instead of leaving it to the garbage collector.
    with codecs.open(self.utf8_path, encoding="utf_8") as utf8_file:
        unicode_content = utf8_file.read()
    iterator = izip(SubRipFile.open(self.utf8_path),
                    SubRipFile.from_string(unicode_content))
    for file_item, string_item in iterator:
        self.assertEqual(unicode(file_item), unicode(string_item))
def test_length(self):
    """``capability_tester.srt`` parses into 37 items."""
    path = os.path.join(self.base_path, 'capability_tester.srt')
    # Renamed from ``file`` so the builtin is not shadowed.
    srt_file = SubRipFile.open(path)
    self.assertEqual(len(srt_file), 37)
def setUp(self):
    """Load the fixture ``tests/static/utf-8.srt`` (relative to
    ``file_path``) into ``self.file``."""
    fixture = os.path.join(file_path, 'tests', 'static', 'utf-8.srt')
    self.file = SubRipFile.open(fixture)
length1Time = get_video_length(args.inputVideo1[0]) offset2Time = SubRipTime.from_string(zeroTime) inVid1 = args.inputVideo1[0] if args.offset2: offset2Time = SubRipTime.from_string(args.offset2[0]) length1Time = SubRipTime.from_string(zeroTime) offset2 = args.offset2[0] inSubName1 = args.input1 inSubName2 = args.input2 outSubName = args.output if args.encoding: encoding = args.encoding[0] else: encoding = args.encoding try: inSub1 = SubRipFile.open(inSubName1,encoding) except AttributeError: print "No such file: ",inSubName1 sys.exit(1) except LookupError: print "No such encoding: ",encoding sys.exit(1) except UnicodeDecodeError: print "Not encoded as utf-8" sys.exit(1) try: inSub2 = SubRipFile.open(inSubName2,encoding) except AttributeError: print "No such file: ",inSubName1 sys.exit(1) except LookupError:
#! /usr/bin/python
"""Fill empty subtitles of an SRT file with consecutive capital letters.

Usage: subfiller <file.srt>.  The file is read as cp1250 and saved in place
with CRLF line endings.
"""
import sys
from pysrt import SubRipFile


def next_letter(letter):
    """Return the letter after ``letter``, wrapping from 'Z' back to 'A'."""
    if letter == 'Z':
        return 'A'
    return chr(ord(letter) + 1)


if len(sys.argv) != 2:
    print("Usage: subfiller <file.srt>")
    sys.exit(1)

srt = SubRipFile.open(sys.argv[1], 'cp1250')
letter = 'A'
for s in srt:
    title = s.text.strip()
    if not title:
        # Empty subtitle: fill it with the current letter and advance.
        s.text = letter + '\n'
        letter = next_letter(letter)
    elif len(title) == 1 and title.isupper():
        # An existing one-letter title resets the sequence.  The stripped
        # text is tested (not the raw first character), and 'Z' now wraps to
        # 'A' instead of advancing to '['.
        letter = next_letter(title)
srt.save(eol='\r\n')
def __test_encoding(self, encoding):
    """Open the fixture named ``encoding`` under ``self.base_path`` and
    check that it has 7 items, the first of which has index 1."""
    srt_file = SubRipFile.open(os.path.join(self.base_path, encoding))
    # assertEqual: the assertEquals alias is deprecated.
    self.assertEqual(len(srt_file), 7)
    self.assertEqual(srt_file[0].index, 1)
def onInit( self ):
    """Locate the playing video's subtitle file and prepare its search index.

    The subtitle file is looked up next to the playing file, then under
    ``special://temp``.  Its MD5 hash is compared with the one stored in
    ``subseek-indexdir/hash.txt``: on a match the existing Nucular archive is
    opened read-only, otherwise the index directory is recreated and every
    subtitle is indexed with its text ("content") and start time ("start").

    NOTE(review): ``special://`` paths are handed straight to ``os`` and
    ``shutil`` here -- confirm they resolve without translation.
    NOTE(review): execution continues after ``self.exit_script()`` -- confirm
    that call does not return.
    """
    index_dir = os.path.join("special://temp", "subseek-indexdir")
    hash_path = os.path.join('special://temp', 'subseek-indexdir', "hash.txt")

    filename = os.path.join(os.path.split(xbmc.Player().getPlayingFile())[0], xbmc.Player().getSubtitles())
    if not os.path.exists(filename):
        filename = os.path.join("special://temp", xbmc.Player().getSubtitles())
        if not os.path.exists(filename):
            xbmc.log(__scriptname__ + ": cannot find subtitle file!", xbmc.LOGERROR)
            dialog = xbmcgui.Dialog()
            dialog.ok('SubSeek', 'Sorry, the subtitle file could not be found...')
            xbmc.executebuiltin('XBMC.RunPlugin(plugin://script.xbmc.subtitles/)')
            self.exit_script()
    if not xbmc.Player().getSubtitles().split('.')[-1] == "srt":
        xbmc.log(__scriptname__ + ": incompatible subtitles", xbmc.LOGERROR)
        dialog = xbmcgui.Dialog()
        dialog.ok('SubSeek', 'Sorry, the subtitle file is not compatible. Please load a .srt')
        xbmc.executebuiltin('XBMC.RunPlugin(plugin://script.xbmc.subtitles/)')
        self.exit_script()
    xbmc.log(__scriptname__ + ": Subtitle file: " + filename, xbmc.LOGDEBUG)

    hashmatch = False
    pDialog = xbmcgui.DialogProgress()
    pDialog.create('SubSeek', 'Hashing subtitle file...')
    pDialog.update(0)
    m = md5py.md5()
    with open(filename, 'r') as f:
        for line in f:
            m.update(line)
    # Named subtitle_hash so the builtin hash() is not shadowed.
    subtitle_hash = m.hexdigest()
    xbmc.log(__scriptname__ + ": Subtitle hash is " + subtitle_hash, xbmc.LOGDEBUG)

    if os.path.exists(hash_path):
        with open(hash_path, 'r') as f:
            hashmatch = f.readline() == subtitle_hash
        if hashmatch:
            xbmc.log(__scriptname__ + ": Subtitle hash matches stored database, reusing archive", xbmc.LOGDEBUG)
        else:
            xbmc.log(__scriptname__ + ": Subtitle hash does not match stored database, building new database", xbmc.LOGDEBUG)

    if not hashmatch:
        # On a first run the index directory does not exist yet; an
        # unconditional rmtree raised OSError in that case.
        if os.path.isdir(index_dir):
            shutil.rmtree(index_dir)
        os.mkdir(index_dir)
        self.archive = Nucular.Nucular(index_dir)
        (self.archive).create()
        pDialog = xbmcgui.DialogProgress()
        pDialog.create('SubSeek', 'Opening Subtitle File...')
        pDialog.update(0)
        subs = SubRipFile.open(filename, encoding='iso-8859-1')
        pDialog.create('SubSeek', 'Populating Database...')
        pDialog.update(0)
        total = len(subs)
        for i, sub in enumerate(subs):
            D = {
                "content": sub.text.replace("\n", " ").replace("<i>", "[I]").replace("</i>", "[/I]"),
                # Time-of-day part of a datetime built from the start time.
                "start": str(datetime.datetime(1, 1, 1,
                                               sub.start.hours,
                                               sub.start.minutes,
                                               sub.start.seconds,
                                               sub.start.milliseconds*1000)).split()[1]}
            (self.archive).indexDictionary(str(uuid.uuid4()), D)
            pDialog.update(int(math.floor(100*i/total)))
        pDialog.update(100, 'Storing Database...')
        (self.archive).store(lazy=False)
        with open(hash_path, "w") as f:
            f.write(subtitle_hash)
    else:
        (self.archive) = Nucular.Nucular(index_dir, readOnly=True)
    pDialog.close()
def main():
    """Match IMDB quotes against subtitle timings and write ``quotes.txt``.

    Works inside the directory given as ``sys.argv[1]``, which must contain
    ``quotes.htm`` (an IMDB quotes page), ``subtitles.srt`` and
    ``project.xml`` (whose root element carries ``fps`` and ``frames``).

    A quote is matched when the first line of a subtitle (lower-cased,
    punctuation removed, at least three space-separated words) occurs inside
    the cleaned quote.  Each match is printed and written to ``quotes.txt``
    as ``<relative position>#<quote with newlines replaced by '#'>``.
    """
    os.chdir(sys.argv[1])

    raw_input("are the subtitle timings correct?".upper())

    # ##### extract quotes from IMDB html-file #####
    with open(r"quotes.htm", "r") as f:
        parser = etree.HTMLParser()
        tree = etree.parse(f, parser)
    root = tree.getroot()

    # (pattern, replacement) pairs applied in order to each quote's markup.
    # Raw strings keep the regexes free of invalid string escapes.
    cleanups = (
        (r"\<div.*\>\n", ""),
        (r"\</div.*\>", ""),
        # names
        (r'\<b\>\<a.*"\>', ""),
        (r"\</a\>\</b\>:\n", ": "),
        # share this quote
        (r"\<p.*\>.*\</p\>", ""),
        (r"\<span.*\>.*\</span\>", ""),
        (r"\[.*\]", ""),  # stage directions
        (r"\<br/\>", ""),
        # NOTE(review): as written this replaces a space with a space; the
        # pattern was presumably a non-breaking space or "&nbsp;" -- confirm.
        (" ", " "),
    )

    quotes = []
    for div in root.xpath("//div"):
        try:
            if div.attrib["class"] != "sodatext":
                continue
            s = etree.tostring(div)
            for pattern, replacement in cleanups:
                s = re.sub(pattern, replacement, s)

            lines = [line.strip() for line in s.split("\n")]
            lines = [line for line in lines if len(line) > 0]
            if len(lines) == 1:
                # Single-line quote: drop its first character.
                lines[0] = lines[0][1:].strip()
            quotes.append("\n".join(lines))
        except Exception:
            # Best effort: skip divs without a class attribute or whose
            # markup cannot be cleaned, without hiding KeyboardInterrupt.
            continue

    quotes = list(set(quotes))

    punctuation_run = re.compile("[%s]+" % re.escape(string.punctuation))
    quotes_clean = [punctuation_run.sub("", x).lower().strip() for x in quotes]

    # ##### read subtitles from srt-file #####
    subs = SubRipFile.open('subtitles.srt')

    timecode_quote = {}
    for item in subs:
        item.text = punctuation_run.sub("", item.text).lower().strip()
        text = item.text.split("\n")[0]  # first line only
        # we'll get a lot of false hits with only one word :/
        if len(text.split(" ")) < 3:
            continue
        for i, quote in enumerate(quotes_clean):
            if text in quote and quotes[i] not in timecode_quote.values():
                timecode_quote[str(item.start)] = quotes[i]

    # ##### relate timecodes to the movie length #####
    tree = et.parse("project.xml")
    movie = tree.getroot()
    fps = float(movie.attrib["fps"])
    frames = float(movie.attrib["frames"])
    seconds = frames / fps

    # sort by timecode
    timecodes = sorted(timecode_quote)

    with open("quotes.txt", "w") as f:
        for tc in timecodes:
            print("%.1f%% %s" % (100 * timecode_to_seconds(tc) / seconds, tc))
            print(timecode_quote[tc])
            print("")
            f.write("%f#%s\n" % (timecode_to_seconds(tc) / seconds,
                                 timecode_quote[tc].replace("\n", "#")))

    print("<< %d QUOTES >>" % len(timecodes))
def merge_video_subtitle(video_id):
    """
    Convert the Chinese and English VTT subtitles of ``video_id`` to SRT,
    then merge them into a single SRT subtitle file.

    :param video_id: primary key of the ``Video`` to process
    :return: path of the merged subtitle file, or ``False`` when the video
        lacks either the Chinese or the English subtitle
    """
    video = Video.objects.get(pk=video_id)

    # Settings default values
    delta = SubRipTime(milliseconds=500)
    encoding = "utf_8"

    has_both = (video.subtitle_cn != '') & (video.subtitle_en != '')
    if not has_both:
        return False

    # Some YouTube titles contain non-ASCII characters, or characters such as
    # "/" that cannot appear in a file name, so they are passed through
    # Django's get_valid_filename().
    # Note: the result differs from what youtube-dl's restrictfilenames
    # produces, so merge_subs_filename may not match the names stored in
    # subtitle_cn / subtitle_en.  A "." in the title is still preserved.
    safe_title = get_valid_filename(video.title)

    # Convert the VTT subtitles to SRT.
    subs_cn_srt_path = os.path.join(
        YOUTUBE_DOWNLOAD_DIR,
        '%s-%s.cn.srt' % (safe_title, video.video_id))
    # (The earlier convert_file()-based conversion no longer works.)
    convert_subtilte_format(srt_file=video.subtitle_cn.path,
                            ass_file=subs_cn_srt_path)

    subs_en_srt_path = os.path.join(
        YOUTUBE_DOWNLOAD_DIR,
        '%s-%s.en.srt' % (safe_title, video.video_id))
    # NOTE(review): unlike the Chinese branch, the English path is replaced
    # by the converter's return value -- confirm it returns the output path.
    subs_en_srt_path = convert_subtilte_format(
        srt_file=video.subtitle_en.path, ass_file=subs_en_srt_path)

    merge_subs = merge_subtitle(
        SubRipFile.open(subs_cn_srt_path, encoding=encoding),
        SubRipFile.open(subs_en_srt_path, encoding=encoding),
        delta)

    merge_subs_path = os.path.join(
        YOUTUBE_DOWNLOAD_DIR,
        '%s-%s.zh-Hans.en.srt' % (safe_title, video.video_id))
    merge_subs.save(merge_subs_path, encoding=encoding)

    video.subtitle_merge = merge_subs_path
    video.save(update_fields=['subtitle_merge'])
    return merge_subs_path
def makeL1L2(L1_srt, L2_srt, out_srt, levels, save_sync, out_L1_utf8bom_srt,
             out_L2_utf8bom_srt, show_L2, encoding, L1_color, L1_size,
             L2_color, L2_size):
    """
    Combine the L1_srt and L2_srt subtitles, writing one output per level.

    out_srt is a path template: "{{LEVEL}}" is replaced by each entry of
    levels to get that level's output path.
    When save_sync is True the synced srt files are saved as well
    (".srt" replaced by ".synced.srt" in the input paths).
    A non-empty out_L1_utf8bom_srt / out_L2_utf8bom_srt receives the
    utf8-BOM converted copy of the L1 / L2 file; otherwise that temporary
    copy is deleted.
    L1_color, L1_size, L2_color and L2_size are passed to setSrtTemplates
    to format the subs.
    """
    def keep_or_discard(temp_path, wanted_path):
        # Move the temporary utf8-BOM copy to wanted_path, replacing an
        # existing file, or delete it when no path was requested.
        if not wanted_path:
            os.remove(temp_path)
            return
        if os.path.isfile(wanted_path):
            os.remove(wanted_path)
        os.rename(temp_path, wanted_path)

    log("L1_srt: " + L1_srt)
    log("L2_srt: " + L2_srt)
    log("show_L2: " + show_L2)
    log("encoding: " + encoding)
    log("save_sync: ", save_sync)
    log("levels: ", levels)
    log("L1 color: {}, size: {}.".format(L1_color, L1_size))
    log("L2 color: {}, size: {}.".format(L2_color, L2_size))
    log("out_L1_utf8bom_srt: ", out_L1_utf8bom_srt)
    log("out_L2_utf8bom_srt: ", out_L2_utf8bom_srt)

    setSrtTemplates(L1_color, L1_size, L2_color, L2_size)

    # try to decode and save as utf8-bom
    L1_srt_bom = L1_srt + ".utf8bom"
    L2_srt_bom = L2_srt + ".utf8bom"
    makeFileUtf8Bom(L1_srt, L1_srt_bom)
    makeFileUtf8Bom(L2_srt, L2_srt_bom)

    subs_L1, dupes, fixed, subs_L2 = syncSrts(
        SubRipFile.open(L1_srt_bom), SubRipFile.open(L2_srt_bom))

    if save_sync:
        out_synced_L1 = L1_srt.replace(".srt", ".synced.srt")
        out_synced_L2 = L2_srt.replace(".srt", ".synced.srt")
        subs_L1.save(out_synced_L1, encoding=encoding)
        subs_L2.save(out_synced_L2, encoding=encoding)
        log("Saved {} and {}. Duplicate lines: {} Fixed: {}".format(
            out_synced_L1, out_synced_L2, dupes, fixed))

    # Per-level output path, output file and hidden-line counter.
    out_srts = {}
    outs = {}
    removed_lines = {}
    for level in levels:
        out_srts[level] = out_srt.replace("{{LEVEL}}", level)
        outs[level] = SubRipFile()
        removed_lines[level] = 0

    total = len(subs_L2)
    for position in range(total):
        processSub(subs_L1[position], subs_L2[position], levels, outs,
                   removed_lines, show_L2)

    for level in levels:
        criteria = 'none' if level == "0" else level_criterias[level]
        summary = "level_criteria: {}. Hidden L1 lines: {} out of {}".format(
            criteria, removed_lines[level], len(subs_L2))
        # A 1 ms item at the very start carries the summary text.
        target = outs[level]
        target.append(
            SubRipItem(1, {'milliseconds': 0}, {'milliseconds': 1}, summary))
        target.clean_indexes()
        target.save(path=out_srts[level], encoding=encoding)
        log("Saved {}. {} ".format(out_srts[level], summary))

    keep_or_discard(L1_srt_bom, out_L1_utf8bom_srt)
    keep_or_discard(L2_srt_bom, out_L2_utf8bom_srt)