Esempio n. 1
0
def merge_srt(chn_file, eng_file, output_file):
    """Merge two SRT files and write the result to ``output_file``.

    Both inputs are opened with ``SubRipFile.open`` and combined by
    ``merge_subtitle`` using a 500 ms ``SubRipTime`` delta.  A regular file
    already present at ``output_file`` is removed before the merged
    subtitles are saved with ``encoding='utf8'``.
    """
    tolerance = SubRipTime(milliseconds=500)
    first_track = SubRipFile.open(chn_file)
    second_track = SubRipFile.open(eng_file)
    merged = merge_subtitle(first_track, second_track, tolerance)

    # Remove a stale output file so the save starts from a clean slate.
    stale_output_exists = os.path.isfile(output_file)
    if stale_output_exists:
        os.remove(output_file)
    merged.save(output_file, encoding='utf8')
Esempio n. 2
0
def add_videos_to_index(subtitle_index, output_file, index):
	vindexReader = csv.reader(open(subtitle_index, 'rb'))
	vinfoWriter = csv.writer(open(output_file, 'wt'))
	vinfoWriter.writerow(['title', 'filename', 'id', 'views', 'type', 'url', 'text'])
	for row in vindexReader:
		try:
			filename = row[1] + '.en.srt'
			url = 'http://www.youtube.com/watch?v=' + row[2]
			text = open(filename).read()
			text_ascii = removeNonAscii(text)
			subtitles = SubRipFile.open(filename)
			vinfoWriter.writerow([row[0], row[1], row[2], row[3], row[4], url, text_ascii])
			punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
			stopwords = ['']
			with open('/Users/connormendenhall/Python/DaveDaveFind/DaveDaveFind/data/stopwords.csv', 'rb') as f:
				wordlist = csv.reader(f)
				for stopword in wordlist:
					stopwords.append(stopword[0])
			for sentence in subtitles:
				text = (sentence.text)
				wordlist = text.split()
				for word in wordlist:
					word = word.lstrip(punctuation)
					word = word.rstrip(punctuation)
					word = word.lower()
					if word not in stopwords:
						add_to_index(index, word, url)
				
		except:
			pass
	print "[add_videos_to_index()] Videos added."
	return index
Esempio n. 3
0
    def mostrarSubtitulos(self, escena, ruta):
            """Draw the subtitle from ``ruta`` that is due now, or clear it.

            Does nothing unless ``self.ok == 1``.  The subtitle file is
            re-opened on every call and ``self.tmp`` is used as the index of
            the subtitle to show.  The elapsed time is taken from
            ``pygame.time.get_ticks() - self.offset`` and compared with the
            subtitle's start/end expressed in milliseconds.

            NOTE(review): only minutes, seconds and milliseconds are used to
            build ``tics_ini``/``tics_fin``; the ``hours`` field is ignored.

            escena -- object stored on ``self.escena``; its ``draw()`` is
                      called before the text is printed.
            ruta   -- path of the SRT file to open.
            """
            if (self.ok==1):
                
                self.escena= escena
                
                #subs = SubRipFile.open(ruta, encoding='iso-8859-1')
                subs = SubRipFile.open(ruta, encoding='UTF-8') # With this encoding accented characters are displayed correctly
                
                #print("Hay" ,subs.__len__()," subtitulos")
                
                #print "SEGUNDOS=", cant_segs
                if (self.tmp== subs.__len__()): # reached the end of the subtitles
                    #self.tmp= subs.__len__()-1                
                    # Rewind to the first subtitle and disable further calls;
                    # the rest of this call still processes subs[0].
                    self.tmp= 0
                    self.ok= 0
                    #print("entro en tiempo " ,self.tiempoActual)
                    self.tiempoActual= 0

                linea= subs[self.tmp]
                # Start and end of the current subtitle in milliseconds.
                tics_ini = (linea.start.minutes*60*1000)+(linea.start.seconds*1000)+linea.start.milliseconds
                tics_fin = (linea.end.minutes*60*1000)+(linea.end.seconds*1000)+linea.end.milliseconds
                
                # Inside the subtitle's time window: draw it once, then
                # advance self.tmp to the next subtitle.
                if ((tics_ini<=(pygame.time.get_ticks()-self.offset)) and ((pygame.time.get_ticks()-self.offset)<=tics_fin)): 
                    if (self.imprimir==1):
                        self.escena.draw()          # redraw the scene
                        self.printTexto(linea.text) # print the message
                        self.imprimir= 0
                        self.tmp= self.tmp+1
                        self.entrar= 1
                        
                else:
                    # Outside the window: clear the text once and re-arm
                    # printing for the next subtitle.
                    if (self.entrar==1):   
                        self.printTexto("")                                   
                        self.imprimir= 1                
                        self.entrar=0
Esempio n. 4
0
 def test_save(self):
     """Saving the windows-1252 fixture as UTF-8 with LF line endings must
     reproduce the UTF-8 reference file byte for byte."""
     srt_file = SubRipFile.open(self.windows_path, encoding='windows-1252')
     srt_file.save(self.temp_path, eol='\n', encoding='utf-8')
     try:
         with open(self.temp_path, 'rb') as saved:
             saved_bytes = saved.read()
         with open(self.utf8_path, 'rb') as reference:
             reference_bytes = reference.read()
         self.assertEqual(saved_bytes, reference_bytes)
     finally:
         # Clean up the temporary file even when the comparison fails.
         os.remove(self.temp_path)
Esempio n. 5
0
    def __init__(self, filename):
        """Load the subtitle file ``filename`` into a ``Gtk.ListStore``.

        The file is first opened with pysrt's default encoding; on a
        ``UnicodeDecodeError`` it is retried as ISO-8859-1.  If the retry
        fails as well, ``self.model`` is set to ``None``.  An ``IOError`` is
        reported through ``info`` and leaves the model empty.

        Raises ``FileNameError`` when ``filename`` does not exist.
        """
        self.filename = filename

        self.model = Gtk.ListStore(object, str)
        self.srt_model = []
        if not os.path.exists(filename):
            raise FileNameError(filename)

        try:
            self.srt_model = SubRipFile.open(path=filename)
        except UnicodeDecodeError as unic:
            debug(unic)
            try:
                info("trying ...", "ISO-8859-1")
                # The bare SubRipFile(...) constructor only builds an empty
                # container; SubRipFile.open() is what actually parses the
                # file, so the fallback must go through it.
                self.srt_model = SubRipFile.open(path=filename,
                                                 encoding="iso-8859-1")
            except Exception as excep:
                debug(excep)
                self.model = None
        except IOError as error:
            info(
                "Impossible de lire le fichier de sous titre: error {}".format(
                    error))

        # When loading failed, srt_model is still the empty list set above,
        # so this loop is a no-op.
        for line in self.srt_model:
            self.model.append([line, line.text])
Esempio n. 6
0
 def test_windows1252(self):
     """The windows-1252 fixture parses to 1332 items with CRLF line
     endings, and decoding the UTF-8 fixture as ASCII raises."""
     srt_file = SubRipFile.open(self.windows_path, encoding='windows-1252')
     # assertEqual replaces the assertEquals alias, which newer unittest
     # versions no longer provide.
     self.assertEqual(len(srt_file), 1332)
     self.assertEqual(srt_file.eol, '\r\n')
     self.assertRaises(UnicodeDecodeError,
                       SubRipFile.open,
                       self.utf8_path,
                       encoding='ascii')
Esempio n. 7
0
 def input_file(self):
     """Return the parsed source subtitle file, opening it on first access.

     The encoding is guessed by passing the raw file contents to ``detect``
     and the result is cached on ``self._source_file``.  Parsing uses
     ``SubRipFile.ERROR_LOG`` as the error-handling mode.
     """
     if not hasattr(self, '_source_file'):
         # Read the raw bytes for detection and close the handle promptly;
         # text mode could alter the bytes the detector sees.
         with open(self.arguments.file, 'rb') as f:
             encoding = detect(f.read()).get('encoding')
         self._source_file = SubRipFile.open(
             self.arguments.file,
             encoding=encoding,
             error_handling=SubRipFile.ERROR_LOG)
     return self._source_file
Esempio n. 8
0
    def input_file(self):
        """Return the parsed source subtitle file, loading it on first use.

        The raw bytes of ``self.arguments.file`` are passed to ``detect``,
        the reported encoding is run through ``self.normalize_encoding``,
        and the file is then parsed with ``SubRipFile.ERROR_LOG`` error
        handling.  The result is cached on ``self._source_file``.
        """
        if hasattr(self, '_source_file'):
            return self._source_file

        with open(self.arguments.file, 'rb') as raw:
            payload = raw.read()
            guessed = detect(payload).get('encoding')
            encoding = self.normalize_encoding(guessed)

        self._source_file = SubRipFile.open(
            self.arguments.file,
            encoding=encoding,
            error_handling=SubRipFile.ERROR_LOG,
        )
        return self._source_file
Esempio n. 9
0
    def test_eol_conversion(self):
        """Saving with ``eol='\\n'`` converts the CRLF fixture to LF."""
        # 'rU' is kept: under Python 2 it is what makes ``newlines`` report
        # the line endings seen while reading.
        with open(self.windows_path, 'rU') as input_file:
            input_file.read()
            self.assertEqual(input_file.newlines, '\r\n')

        srt_file = SubRipFile.open(self.windows_path, encoding='windows-1252')
        srt_file.save(self.temp_path, eol='\n')

        with open(self.temp_path, 'rU') as output_file:
            output_file.read()
            self.assertEqual(output_file.newlines, '\n')
Esempio n. 10
0
    def test_eol_conversion(self):
        """The CRLF fixture must come out with LF endings after
        ``save(..., eol='\\n')``."""
        # Context managers close both handles; the original leaked them.
        # 'rU' is kept because Python 2 needs it for ``newlines`` tracking.
        with open(self.windows_path, 'rU') as input_file:
            input_file.read()
            self.assertEqual(input_file.newlines, '\r\n')

        srt_file = SubRipFile.open(self.windows_path, encoding='windows-1252')
        srt_file.save(self.temp_path, eol='\n')

        with open(self.temp_path, 'rU') as output_file:
            output_file.read()
            self.assertEqual(output_file.newlines, '\n')
Esempio n. 11
0
    def test_eol_conversion(self):
        """Check that saving with ``eol="\\n"`` rewrites CRLF endings as LF."""
        # Files are opened via ``with`` so the handles are released even if
        # an assertion fails; "rU" is kept for Python 2 newline tracking.
        with open(self.windows_path, "rU") as input_file:
            input_file.read()
            self.assertEqual(input_file.newlines, "\r\n")

        srt_file = SubRipFile.open(self.windows_path, encoding="windows-1252")
        srt_file.save(self.temp_path, eol="\n")

        with open(self.temp_path, "rU") as output_file:
            output_file.read()
            self.assertEqual(output_file.newlines, "\n")
Esempio n. 12
0
    def input_file(self):
        """Lazily parse ``self.arguments.file`` and cache the result.

        On first access the file's raw bytes are fed to ``detect``; the
        reported encoding is normalised with ``self.normalize_encoding`` and
        used to open the file with ``SubRipFile.ERROR_LOG`` error handling.
        Later calls return the cached ``self._source_file``.
        """
        already_loaded = hasattr(self, '_source_file')
        if not already_loaded:
            with open(self.arguments.file, 'rb') as handle:
                raw_bytes = handle.read()
                guessed = detect(raw_bytes).get('encoding')
                encoding = self.normalize_encoding(guessed)

            parsed = SubRipFile.open(self.arguments.file,
                                     encoding=encoding,
                                     error_handling=SubRipFile.ERROR_LOG)
            self._source_file = parsed
        return self._source_file
Esempio n. 13
0
    def save(self, *args, **kwargs):
        """Save the episode, then rebuild its subtitle rows from the SRT file.

        After the regular model save, the file at ``self.subtitles.path`` is
        parsed and one ``subtitle_set`` row is created per subtitle, storing
        the start/end ordinals and the text.
        """
        super(Episode, self).save(*args, **kwargs)

        # Parse first: if the file cannot be read, the exception propagates
        # before any existing subtitle row has been deleted.
        subs = SubRipFile.open(self.subtitles.path)

        # Delete and re-create inside one transaction so a failure midway
        # does not leave the episode with missing or partial subtitles.
        with transaction.commit_on_success():
            self.subtitle_set.all().delete()
            for sub in subs:
                self.subtitle_set.create(
                    start=sub.start.ordinal, end=sub.end.ordinal,
                    text=sub.text)
Esempio n. 14
0
    def generate_vocap_file(self):
        """Write the subtitles of ``self.srt_file`` in the vocap text format.

        Each subtitle becomes one UTF-8 line ``###<seconds> <text>`` in
        ``self.vocap_file`` (both files live under ``self.path``).
        ``<seconds>`` is the start time in whole seconds (milliseconds are
        dropped).  Newlines inside the text become spaces, and any literal
        ``###`` in the text is rewritten to ``#.#.#`` so it cannot be
        confused with the line marker.
        """
        subs = SubRipFile.open(self.path+"/"+self.srt_file, encoding="utf-8")
        # ``with`` guarantees the output file is closed even if a write fails.
        with codecs.open(self.path+"/"+self.vocap_file, "w", "utf-8") as fileobj:
            for sub in subs:
                text = sub.text
                text = text.replace(u"###", u"#.#.#")
                text = text.replace(u"\n", u" ")

                start = sub.start.seconds
                start += 60*sub.start.minutes
                start += 3600*sub.start.hours
                timestamp = unicode(str(start), "utf-8")

                fileobj.write(u"###"+timestamp+u" "+text+u"\n")
Esempio n. 15
0
def readSrt(input_file, output_file, input_language, output_language):
    """Translate an SRT file and replace it with a WebVTT version.

    Every subtitle text is passed to ``translate``.  Depending on the
    module-level ``both_language`` flag, the text becomes either
    ``"<original> (<translation>)"`` or just the translation.  The result is
    saved to ``output_file`` as UTF-8 and converted with ``WebVTT``.  The
    original input is then renamed to ``<input_file>.old``, the intermediate
    SRT is removed, and the generated ``.vtt`` file is renamed to
    ``input_file``.
    """
    print('processing file', input_file)
    subs = SubRipFile.open(input_file)
    print(">", "read file", input_file)

    for entry in subs:
        source_text = entry.text
        print(source_text)
        translated = translate(source_text, input_language, output_language)
        entry.text = (source_text + " (" + translated + ")"
                      if both_language else translated)
        print(entry.text)

    subs.save(output_file, 'utf-8')
    WebVTT().from_srt(output_file).save()

    # Swap the files: keep the original as *.old, drop the intermediate SRT,
    # and move the generated VTT into the original's place.
    vtt_path = output_file.replace(".srt", ".vtt")
    os.rename(input_file, input_file + ".old")
    os.remove(output_file)
    os.rename(vtt_path, input_file)
    print(">", output_file, "saved!")
Esempio n. 16
0
def add_videos_to_index(subtitle_index, output_file, index):
    vindexReader = csv.reader(open(subtitle_index, 'rb'))
    vinfoWriter = csv.writer(open(output_file, 'wt'))
    vinfoWriter.writerow(
        ['title', 'filename', 'id', 'views', 'type', 'url', 'text'])
    for row in vindexReader:
        try:
            filename = row[1] + '.en.srt'
            url = 'http://www.youtube.com/watch?v=' + row[2]
            text = open(filename).read()
            text_ascii = removeNonAscii(text)
            subtitles = SubRipFile.open(filename)
            vinfoWriter.writerow(
                [row[0], row[1], row[2], row[3], row[4], url, text_ascii])
            punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
            stopwords = ['']
            with open(
                    '/Users/connormendenhall/Python/DaveDaveFind/DaveDaveFind/data/stopwords.csv',
                    'rb') as f:
                wordlist = csv.reader(f)
                for stopword in wordlist:
                    stopwords.append(stopword[0])
            for sentence in subtitles:
                text = (sentence.text)
                wordlist = text.split()
                for word in wordlist:
                    word = word.lstrip(punctuation)
                    word = word.rstrip(punctuation)
                    word = word.lower()
                    if word not in stopwords:
                        add_to_index(index, word, url)

        except:
            pass
    print "[add_videos_to_index()] Videos added."
    return index
Esempio n. 17
0
	def __init__(self, filename):
		"""Load the subtitle file ``filename`` into a ``Gtk.ListStore``.

		The file is first opened with pysrt's default encoding; on a
		``UnicodeDecodeError`` it is retried as ISO-8859-1.  If the retry
		fails as well, ``self.model`` is set to ``None``.  An ``IOError`` is
		reported through ``info`` and leaves the model empty.

		Raises ``FileNameError`` when ``filename`` does not exist.
		"""
		self.filename = filename

		self.model = Gtk.ListStore(object, str)
		self.srt_model = []
		if not os.path.exists(filename):
			raise FileNameError(filename)

		try:
			self.srt_model = SubRipFile.open(path=filename)
		except UnicodeDecodeError as unic:
			debug(unic)
			try:
				info("trying ...", "ISO-8859-1")
				# The bare SubRipFile(...) constructor only builds an empty
				# container; SubRipFile.open() actually parses the file, so
				# the fallback must go through it.
				self.srt_model = SubRipFile.open(path=filename, encoding="iso-8859-1")
			except Exception as excep:
				debug(excep)
				self.model = None
		except IOError as error:
			info("Impossible de lire le fichier de sous titre: error {}".format(error))

		# When loading failed, srt_model is still the empty list set above,
		# so this loop is a no-op.
		for line in self.srt_model:
			self.model.append([line, line.text])
Esempio n. 18
0
 def test_utf8(self):
     """The UTF-8 fixture parses to 1332 items with the default encoding,
     while the windows-1252 fixture fails to decode."""
     # assertEqual replaces the assertEquals alias, which newer unittest
     # versions no longer provide.
     self.assertEqual(len(SubRipFile.open(self.utf8_path)), 1332)
     self.assertRaises(UnicodeDecodeError, SubRipFile.open,
                       self.windows_path)
Esempio n. 19
0
 def test_save(self):
     """Re-encoding the windows-1252 fixture to UTF-8 with LF endings must
     yield exactly the bytes of the UTF-8 reference file."""
     srt_file = SubRipFile.open(self.windows_path, encoding='windows-1252')
     srt_file.save(self.temp_path, eol='\n', encoding='utf-8')
     try:
         with open(self.temp_path, 'rb') as saved:
             saved_bytes = saved.read()
         with open(self.utf8_path, 'rb') as reference:
             reference_bytes = reference.read()
         self.assertEqual(saved_bytes, reference_bytes)
     finally:
         # Remove the temporary file whether or not the comparison passed.
         os.remove(self.temp_path)
Esempio n. 20
0
from pysrt import SubRipFile
import sys
import itertools
from collections import defaultdict

# Each stdin line names an SRT file.  The text of every subtitle is
# lower-cased, flattened to one line, stripped of commas, hyphens and full
# stops, and appended to the output file named by the first argument.
with open(sys.argv[1], "w+") as target_file:
	for srt_path in sys.stdin:
		srt_path = srt_path.strip()
		subtitles = SubRipFile.open(srt_path, encoding='iso-8859-1')

		for subtitle in subtitles:
			text = subtitle.text.lower().strip().replace("\n", " ").replace(",", "").replace("-", "").replace(".", "")
			if not text:
				continue
			try:
				target_file.write(text.strip() + "\n")
			except Exception:
				# Best effort, as in the original: a line that cannot be
				# written (presumably an encoding error) is skipped.
				# ``Exception`` no longer swallows KeyboardInterrupt.
				continue
Esempio n. 21
0
 def test_empty_file(self):
     """Opening an empty file in ERROR_RAISE mode yields zero items."""
     # Renamed from ``file`` to avoid shadowing the Python 2 builtin.
     srt_file = SubRipFile.open('/dev/null',
                                error_handling=SubRipFile.ERROR_RAISE)
     self.assertEqual(len(srt_file), 0)
Esempio n. 22
0
 def test_windows1252(self):
     """Parse the windows-1252 fixture (1332 items, CRLF endings) and check
     that the UTF-8 fixture cannot be decoded as ASCII."""
     srt_file = SubRipFile.open(self.windows_path, encoding="windows-1252")
     # assertEqual is the supported spelling; assertEquals is a deprecated
     # alias.
     self.assertEqual(len(srt_file), 1332)
     self.assertEqual(srt_file.eol, "\r\n")
     self.assertRaises(UnicodeDecodeError, SubRipFile.open, self.utf8_path, encoding="ascii")
Esempio n. 23
0
 def test_save(self):
     """The windows-1252 fixture saved as UTF-8 with LF endings must match
     the UTF-8 reference file exactly."""
     srt_file = SubRipFile.open(self.windows_path, encoding="windows-1252")
     srt_file.save(self.temp_path, eol="\n", encoding="utf-8")
     try:
         with open(self.temp_path, "rb") as saved:
             saved_bytes = saved.read()
         with open(self.utf8_path, "rb") as reference:
             reference_bytes = reference.read()
         self.assertEqual(saved_bytes, reference_bytes)
     finally:
         # Always delete the temporary file, even after a failed assertion.
         os.remove(self.temp_path)
Esempio n. 24
0
 def test_length(self):
     """``capability_tester.srt`` parses to 37 items."""
     path = os.path.join(self.base_path, "capability_tester.srt")
     # Renamed from ``file`` to avoid shadowing the Python 2 builtin.
     srt_file = SubRipFile.open(path)
     self.assertEqual(len(srt_file), 37)
Esempio n. 25
0
 def test_file_with_empty_items(self):
     """``empty.srt`` parses to 7 items."""
     path = os.path.join(self.base_path, "empty.srt")
     # Renamed from ``file`` to avoid shadowing the Python 2 builtin.
     srt_file = SubRipFile.open(path)
     self.assertEqual(len(srt_file), 7)
Esempio n. 26
0
 def test_compare_from_string_and_from_path(self):
     """Items parsed from the file path and from its decoded content must
     render to the same unicode text, pair by pair."""
     # Close the reader promptly instead of leaking the handle.
     with codecs.open(self.utf8_path, encoding='utf_8') as source:
         unicode_content = source.read()
     iterator = izip(SubRipFile.open(self.utf8_path),
                     SubRipFile.from_string(unicode_content))
     for file_item, string_item in iterator:
         self.assertEqual(unicode(file_item), unicode(string_item))
Esempio n. 27
0
import glob

# Change paths where applicable
path = "/home/elastictest/srt/"
# Every .srt file directly inside `path`.
x = glob.glob(path + '*.srt') 
# NOTE(review): EsPath is not referenced in the visible part of this script.
EsPath = "http://192.168.1.48:2600/subtitles/subtitle/"



for i in x:
	# This needs to change for the srt paths sonwell has as they are numbers (just get a key/val of the numbers)
	# Title = file name without the directory prefix and the 4-character extension.
	subsName = i
	subsName = subsName[:-4]
	subsName = subsName.replace(path, '')
	# // end needs to change
	subs = SubRipFile.open(i)
	# NOTE(review): this inner loop rebinds `i` (the file path above) to the
	# subtitle index; `val` is not used in the visible code.
	for i, val in enumerate(subs):
		# One dict per subtitle holding the title plus start/end timestamps
		# formatted as HH:MM:SS,mmm.
		# NOTE(review): `d` is not consumed in the visible part of the script.
		d = {}
		d['title'] = subsName
		h = str(subs[i].start.hours).zfill(2)
		m = str(subs[i].start.minutes).zfill(2)
		s = str(subs[i].start.seconds).zfill(2)
		ms = str(subs[i].start.milliseconds).zfill(3)
		hms = '%s:%s:%s,%s' % (h, m, s, ms)
		d['startTime'] = hms
		h = str(subs[i].end.hours).zfill(2)
		m = str(subs[i].end.minutes).zfill(2)
		s = str(subs[i].end.seconds).zfill(2)
		ms = str(subs[i].end.milliseconds).zfill(3)
		hms = '%s:%s:%s,%s' % (h, m, s, ms)
		d['endTime'] = hms
Esempio n. 28
0
 def input_file(self):
     """Return the parsed ``self.arguments.file``, opening it on first use.

     The file is parsed with ``SubRipFile.ERROR_LOG`` error handling and the
     result is cached on ``self._source_file``.
     """
     if hasattr(self, '_source_file'):
         return self._source_file

     source_path = self.arguments.file
     self._source_file = SubRipFile.open(
         source_path, error_handling=SubRipFile.ERROR_LOG)
     return self._source_file
Esempio n. 29
0
        opts, args = getopt.getopt(sys.argv[1:], 'hd:e:', ["help", "encoding=", "delta="])
    except getopt.GetoptError, err:
        print str(err)
        usage()
        sys.exit(2)

    #Settings default values
    delta = SubRipTime(milliseconds=500)
    encoding="utf_8"
    #-

    if len(args) <> 3:
        usage()
        sys.exit(2)

    for o, a in opts:
        if o in ("-d", "--delta"):
            delta = SubRipTime(milliseconds=int(a))
        elif o in ("-e", "--encoding"):
            encoding = a
        elif o in ("-h", "--help"):
            usage()
            sys.exit()

    subs_a = SubRipFile.open(args[0], encoding=encoding)
    subs_b = SubRipFile.open(args[1], encoding=encoding)
    out = merge_subtitle(subs_a, subs_b, delta)
    out.save(args[2], encoding=encoding)

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
Esempio n. 30
0
#!/usr/bin/python

from pysrt import SubRipFile
import sys

subs = SubRipFile.open(sys.argv[1])
for s in subs:
    print s.text
    print s.start.milliseconds
Esempio n. 31
0
def main():
    """Match IMDB quotes against subtitle timings and write ``quotes.txt``.

    Works inside the directory given as the first command-line argument and
    expects ``quotes.htm``, ``subtitles.srt`` and ``project.xml`` there.
    Quotes are extracted from the ``sodatext`` divs of the HTML file, matched
    against the first line of each subtitle, and written out as
    ``<relative position>#<quote>`` lines sorted by timecode.
    """
    os.chdir(sys.argv[1])

    # Blocks until the user presses Enter; the answer itself is discarded.
    raw_input("are the subtitle timings correct?".upper())

    # ##### extract quotes from IMDB html-file ###################################################
    f = open(r"quotes.htm", "r")
    parser = etree.HTMLParser()
    tree = etree.parse(f, parser)
    f.close()
    root = tree.getroot()

    quotes = []
    for div in root.xpath("//div"):
        try:
            c = div.attrib["class"]
            if c == "sodatext":
                # Strip the surrounding markup with a sequence of regex
                # substitutions; the order of these calls matters.
                s = etree.tostring(div)
                s = re.sub("\<div.*\>\n", "", s)
                s = re.sub("\</div.*\>", "", s)

                #s = re.sub("\<b\>.*\</b\>:\n", "- ", s) # names
                s = re.sub("\<b\>\<a.*\"\>", "", s)
                s = re.sub("\</a\>\</b\>:\n", ": ", s)

                # share this quote
                s = re.sub("\<p.*\>.*\</p\>", "", s)
                s = re.sub("\<span.*\>.*\</span\>", "", s)

                s = re.sub("\[.*\]", "", s)  # stage directions
                s = re.sub("\<br/\>", "", s)
                s = re.sub("  ", " ", s)
                lines = [line.strip() for line in s.split("\n")]
                lines = [line for line in lines if len(line) > 0]
                # A single-line quote loses its first character here.
                if len(lines) == 1:
                    lines[0] = lines[0][1:].strip()
                quote = "\n".join(lines)
                # #######
                '''if len(quote) >= QUOTE_MIN_LEN and len(quote) <= QUOTE_MAX_LEN:
					quotes.append(quote)'''
                quotes.append(quote)
                # #######
        except:
            # NOTE(review): bare except; divs without a "class" attribute
            # (KeyError) end up here, but so does every other error.
            continue

    # De-duplicate, then build a parallel list of normalised quotes
    # (punctuation removed, lower-cased) used only for matching.
    quotes = list(set(quotes))
    quotes_clean = [
        re.sub("[%s]+" % re.escape(string.punctuation), "", x) for x in quotes
    ]
    quotes_clean = [x.lower().strip() for x in quotes_clean]
    """for quote in quotes_clean:
		print quote, "\n" """

    # ##### read subtitles from srt-file ###################################################
    subs = SubRipFile.open('subtitles.srt')
    """for sub in subs:
		#print sub.from_string()
		print sub.index
		#print sub.shift()
		print sub.start
		print sub.end
		print sub.text
		print "\n" """
    #print dir(subs)

    # Maps str(start time of a subtitle) -> the original quote it matched.
    timecode_quote = {}
    for item in subs:
        # Normalise the subtitle text the same way as the quotes.
        item.text = re.sub("[%s]+" % re.escape(string.punctuation), "",
                           item.text)
        item.text = item.text.lower().strip()
        text = item.text.split("\n")[0]  # first line only

        for i, quote in enumerate(quotes_clean):
            if len(
                    text.split(" ")
            ) >= 3 and text in quote:  # we'll get a lot of false hits with only one word :/
                # Each quote is recorded at most once.
                if quotes[i] not in timecode_quote.values():
                    timecode_quote[str(item.start)] = quotes[i]

    # #####  ###################################################
    # Movie length in seconds from the project file's frame count and fps.
    tree = et.parse("project.xml")
    movie = tree.getroot()
    fps = float(movie.attrib["fps"])
    frames = float(movie.attrib["frames"])
    seconds = frames / fps
    #print seconds
    """start_frame = float( movie.attrib["start_frame"] )
	start_sec = startframe / fps"""

    # sort by timecode
    # (relies on Python 2, where dict.keys() returns a list)
    timecodes = timecode_quote.keys()
    timecodes.sort()

    f = open("quotes.txt", "w")
    for tc in timecodes:
        #print tc
        print "%.1f" % (100 * timecode_to_seconds(tc) / seconds) + "%", tc
        print timecode_quote[tc]
        print ""
        # One record per line: fraction of the movie length, then the quote
        # with its newlines replaced by '#'.
        f.write("%f#%s\n" % (timecode_to_seconds(tc) / seconds,
                             timecode_quote[tc].replace("\n", "#")))
    f.close()

    print "<<", len(timecodes), "QUOTES >>"

    #raw_input("- done -")
    return
Esempio n. 32
0
    except getopt.GetoptError, err:
        print str(err)
        usage()
        sys.exit(2)

    #Settings default values
    delta = SubRipTime(milliseconds=500)
    encoding = "utf_8"
    #-

    if len(args) <> 3:
        usage()
        sys.exit(2)

    for o, a in opts:
        if o in ("-d", "--delta"):
            delta = SubRipTime(milliseconds=int(a))
        elif o in ("-e", "--encoding"):
            encoding = a
        elif o in ("-h", "--help"):
            usage()
            sys.exit()

    subs_a = SubRipFile.open(args[0], encoding=encoding)
    subs_b = SubRipFile.open(args[1], encoding=encoding)
    out = merge_subtitle(subs_a, subs_b, delta)
    out.save(args[2], encoding=encoding)


# Script entry point: call main() only on direct execution, not on import.
if __name__ == "__main__":
    main()
Esempio n. 33
0
 def input_file(self):
     """Lazily open ``self.arguments.file`` and cache the parsed result.

     Parsing uses ``SubRipFile.ERROR_LOG`` as the error-handling mode; the
     parsed file is stored on ``self._source_file`` and reused afterwards.
     """
     cached = hasattr(self, '_source_file')
     if not cached:
         parsed = SubRipFile.open(self.arguments.file,
                                  error_handling=SubRipFile.ERROR_LOG)
         self._source_file = parsed
     return self._source_file
Esempio n. 34
0
from pysrt import SubRipFile, SubRipTime

# TODO: still need to work out how to use SubRipTime, which should simplify
# reading the timestamps.

subs = SubRipFile.open("14Blades.srt", encoding="iso-8859-1")

print("Hay", len(subs), " subtitulos")

# Show text and start/end seconds of the first two subtitles.
for position in (0, 1):
    linea = subs[position]
    print(linea.text)
    print("inicio", linea.start.seconds, " segundos.")
    print("fin", linea.end.seconds, " segundos.")

linea = subs[14]
print(linea.text)
print("inicio", linea.start.minutes, "minutos con", linea.start.seconds, "segundos.")
# The end line previously printed start.minutes, which is wrong whenever a
# subtitle crosses a minute boundary.
print("fin", linea.end.minutes, "minutos con", linea.end.seconds, "segundos.")

# equivalent
# part = subs.slice(ends_after=SubRipTime(0, 0, 40))
# part = subs.slice(ends_after=(0, 0, 40))
# part = subs.slice(ends_after={'seconds': 40})

# part.shift(seconds=-2)
# subs.save('other/path.srt', 'utf-8');
Esempio n. 35
0
 def __test_encoding(self, encoding):
     """Open the fixture named ``encoding`` under ``self.base_path`` and
     check it has 7 items, the first of which has index 1."""
     srt_file = SubRipFile.open(os.path.join(self.base_path, encoding))
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(len(srt_file), 7)
     self.assertEqual(srt_file[0].index, 1)
Esempio n. 36
0
def merge_video_subtitle(video_id):
    """
    Convert the Chinese and English VTT subtitles of ``video_id`` to SRT and
    merge them into a single SRT subtitle file.

    :param video_id: primary key of the ``Video`` whose subtitles are merged
    :return: path of the merged SRT file, or ``False`` when the Chinese or
        the English subtitle is missing
    """
    video = Video.objects.get(pk=video_id)

    # Settings default values
    delta = SubRipTime(milliseconds=500)
    encoding = "utf_8"

    # Both subtitle tracks are required.  A logical ``and`` replaces the
    # original bitwise ``&``.
    if not (video.subtitle_cn != '' and video.subtitle_en != ''):
        return False

    # Some YouTube titles contain non-ASCII characters, or characters such
    # as '/' that cannot appear in a file name, so Django's
    # get_valid_filename() is used to sanitise the title.
    # Note: the result differs from the name youtube-dl's restrictfilenames
    # option produces, so the merged file name may not match the names of
    # subtitle_cn / subtitle_en.  Dots in the title are kept.
    valid_title = get_valid_filename(video.title)

    # Convert the VTT subtitles to SRT.  (The earlier convert_file() based
    # conversion no longer works, hence convert_subtilte_format().)
    subs_cn_srt_filename = '%s-%s.cn.srt' % (valid_title, video.video_id)
    subs_cn_srt_path = os.path.join(YOUTUBE_DOWNLOAD_DIR,
                                    subs_cn_srt_filename)
    convert_subtilte_format(
        srt_file=video.subtitle_cn.path, ass_file=subs_cn_srt_path)

    subs_en_srt_filename = '%s-%s.en.srt' % (valid_title, video.video_id)
    subs_en_srt_path = os.path.join(YOUTUBE_DOWNLOAD_DIR,
                                    subs_en_srt_filename)
    # NOTE(review): unlike the Chinese track, the English path is rebound to
    # the helper's return value (kept as in the original) - confirm that
    # convert_subtilte_format() returns the output path.
    subs_en_srt_path = convert_subtilte_format(
        srt_file=video.subtitle_en.path, ass_file=subs_en_srt_path)

    subs_cn_srt = SubRipFile.open(subs_cn_srt_path, encoding=encoding)
    subs_en_srt = SubRipFile.open(subs_en_srt_path, encoding=encoding)
    merge_subs = merge_subtitle(subs_cn_srt, subs_en_srt, delta)

    merge_subs_filename = '%s-%s.zh-Hans.en.srt' % (valid_title,
                                                    video.video_id)
    merge_subs_path = os.path.join(YOUTUBE_DOWNLOAD_DIR,
                                   merge_subs_filename)

    merge_subs.save(merge_subs_path, encoding=encoding)

    video.subtitle_merge = merge_subs_path
    video.save(update_fields=['subtitle_merge'])
    return merge_subs_path
Esempio n. 37
0
 def test_empty_file(self):
     """An empty input parsed in ERROR_RAISE mode has no items."""
     # Renamed from ``file`` to avoid shadowing the Python 2 builtin.
     srt_file = SubRipFile.open("/dev/null", error_handling=SubRipFile.ERROR_RAISE)
     self.assertEqual(len(srt_file), 0)
Esempio n. 38
0
 def setUp(self):
     """Open the ``tests/static/utf-8.srt`` fixture as ``self.file``."""
     fixture_path = os.path.join(file_path, "tests", "static", "utf-8.srt")
     self.file = SubRipFile.open(fixture_path)
Esempio n. 39
0
 def test_utf8(self):
     """The UTF-8 fixture has 1332 items; decoding the windows-1252 fixture
     as UTF-8 raises ``UnicodeDecodeError``."""
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(len(SubRipFile.open(self.utf8_path)), 1332)
     self.assertRaises(UnicodeDecodeError, SubRipFile.open, self.windows_path, encoding="utf_8")
Esempio n. 40
0
 def input_file(self):
     """Return the parsed source subtitle file, opening it on first access.

     The encoding is guessed by passing the file's raw contents to
     ``detect``; the parsed file is cached on ``self._source_file`` and
     parsed with ``SubRipFile.ERROR_LOG`` error handling.
     """
     if not hasattr(self, '_source_file'):
         # Binary mode hands the detector the untouched bytes, and ``with``
         # closes the handle instead of leaking it.
         with open(self.arguments.file, 'rb') as f:
             encoding = detect(f.read()).get('encoding')
         self._source_file = SubRipFile.open(self.arguments.file,
             encoding=encoding, error_handling=SubRipFile.ERROR_LOG)
     return self._source_file
Esempio n. 41
0
 def test_compare_from_string_and_from_path(self):
     """Parsing from a path and from the decoded file content must give
     items with identical unicode renderings."""
     # Read through a context manager so the handle is closed right away.
     with codecs.open(self.utf8_path, encoding="utf_8") as source:
         unicode_content = source.read()
     iterator = izip(SubRipFile.open(self.utf8_path), SubRipFile.from_string(unicode_content))
     for file_item, string_item in iterator:
         self.assertEqual(unicode(file_item), unicode(string_item))
Esempio n. 42
0
 def test_length(self):
     """The ``capability_tester.srt`` fixture contains 37 items."""
     path = os.path.join(self.base_path, 'capability_tester.srt')
     # Renamed from ``file`` to avoid shadowing the Python 2 builtin.
     srt_file = SubRipFile.open(path)
     self.assertEqual(len(srt_file), 37)
Esempio n. 43
0
 def setUp(self):
     """Load the UTF-8 fixture from ``tests/static`` into ``self.file``."""
     path_parts = (file_path, 'tests', 'static', 'utf-8.srt')
     self.file = SubRipFile.open(os.path.join(*path_parts))
Esempio n. 44
0
     length1Time = get_video_length(args.inputVideo1[0])
     offset2Time = SubRipTime.from_string(zeroTime)
     inVid1 = args.inputVideo1[0]
 if args.offset2:
     offset2Time = SubRipTime.from_string(args.offset2[0])
     length1Time = SubRipTime.from_string(zeroTime)
     offset2 = args.offset2[0]
 inSubName1 = args.input1
 inSubName2 = args.input2
 outSubName = args.output
 if args.encoding:
     encoding = args.encoding[0]
 else:
     encoding = args.encoding
 try:
     inSub1 = SubRipFile.open(inSubName1,encoding)
 except AttributeError:
     print "No such file: ",inSubName1
     sys.exit(1)
 except LookupError:
     print "No such encoding: ",encoding
     sys.exit(1)
 except UnicodeDecodeError:
     print "Not encoded as utf-8"
     sys.exit(1)
 try:
     inSub2 = SubRipFile.open(inSubName2,encoding)
 except AttributeError:
     print "No such file: ",inSubName1
     sys.exit(1)
 except LookupError:
Esempio n. 45
0
#! /usr/bin/python

import sys
from pysrt import SubRipFile

if len(sys.argv) != 2:
    print "Usage: subfiller <file.srt>"
    sys.exit(1)

srt = SubRipFile.open(sys.argv[1], 'cp1250')

letter = 'A'

for s in srt:
    title_len = len(s.text.strip())

    if title_len == 0:
        s.text = letter + '\n'

        if letter == 'Z':
            letter = 'A'
        else:
            letter = chr(ord(letter) + 1)
    elif title_len == 1 and s.text[0].isupper():
        letter = chr(ord(s.text[0]) + 1)

srt.save(eol='\r\n')
Esempio n. 46
0
 def __test_encoding(self, encoding):
     """Check that the fixture file named ``encoding`` (under
     ``self.base_path``) parses to 7 items starting at index 1."""
     srt_file = SubRipFile.open(os.path.join(self.base_path, encoding))
     # assertEqual is the supported name; assertEquals is a deprecated alias.
     self.assertEqual(len(srt_file), 7)
     self.assertEqual(srt_file[0].index, 1)
Esempio n. 47
0
  def onInit( self ):
    """Locate the playing video's subtitle file and prepare its search index.

    The subtitle file is looked up next to the playing file and then under
    ``special://temp``.  Its MD5 hash is compared with the hash stored in
    the index directory: on a match the existing Nucular archive is opened
    read-only, otherwise the archive is rebuilt from the subtitles and the
    new hash is stored.

    If the subtitle file is missing or is not an ``.srt`` file, the user is
    told, the subtitles plugin is launched and the script exits.
    """
    subtitle_name = xbmc.Player().getSubtitles()
    filename = os.path.join(os.path.split(xbmc.Player().getPlayingFile())[0], subtitle_name)

    if not os.path.exists(filename):
      filename = os.path.join("special://temp", subtitle_name)

    if not os.path.exists(filename):
      xbmc.log(__scriptname__ + ": cannot find subtitle file!", xbmc.LOGERROR)
      dialog = xbmcgui.Dialog()
      dialog.ok('SubSeek', 'Sorry, the subtitle file could not be found...')
      xbmc.executebuiltin('XBMC.RunPlugin(plugin://script.xbmc.subtitles/)')
      self.exit_script()
      # Stop here: without a subtitle file the hashing below would fail.
      return

    if not subtitle_name.split('.')[-1] == "srt":
      xbmc.log(__scriptname__ + ": incompatible subtitles", xbmc.LOGERROR)
      dialog = xbmcgui.Dialog()
      dialog.ok('SubSeek', 'Sorry, the subtitle file is not compatible. Please load a .srt')
      xbmc.executebuiltin('XBMC.RunPlugin(plugin://script.xbmc.subtitles/)')
      self.exit_script()
      # Stop here: the file is not in a format the indexer should parse.
      return

    xbmc.log(__scriptname__ + ": Subtitle file: " + filename, xbmc.LOGDEBUG)

    index_dir = os.path.join("special://temp", "subseek-indexdir")
    hash_path = os.path.join(index_dir, "hash.txt")

    hashmatch = False
    pDialog = xbmcgui.DialogProgress()
    pDialog.create('SubSeek', 'Hashing subtitle file...')
    pDialog.update(0)
    m = md5py.md5()
    with open(filename, 'r') as f:
      for line in f:
        m.update(line)
    # Named ``digest`` rather than ``hash`` to avoid shadowing the builtin.
    digest = m.hexdigest()
    xbmc.log(__scriptname__ + ": Subtitle hash is " + digest, xbmc.LOGDEBUG)

    if os.path.exists(hash_path):
      with open(hash_path, 'r') as f:
        hashmatch = f.readline() == digest
      if hashmatch:
        xbmc.log(__scriptname__ + ": Subtitle hash matches stored database, reusing archive", xbmc.LOGDEBUG)
      else:
        xbmc.log(__scriptname__ + ": Subtitle hash does not match stored database, building new database", xbmc.LOGDEBUG)

    if not hashmatch:
      # The index directory does not exist on the very first run;
      # an unconditional rmtree would raise in that case.
      if os.path.isdir(index_dir):
        shutil.rmtree(index_dir)
      os.mkdir(index_dir)

      self.archive = Nucular.Nucular(index_dir)
      self.archive.create()

      pDialog = xbmcgui.DialogProgress()

      pDialog.create('SubSeek', 'Opening Subtitle File...')
      pDialog.update(0)

      subs = SubRipFile.open(filename, encoding='iso-8859-1')

      pDialog.create('SubSeek', 'Populating Database...')
      pDialog.update(0)

      total = len(subs)
      for i, sub in enumerate(subs):
        # "start" is formatted by building a datetime on a dummy date and
        # keeping only the time-of-day part of its string form.
        D = {   "content": sub.text.replace("\n", " ").replace("<i>", "[I]").replace("</i>", "[/I]"),
                "start": str(datetime.datetime(1,1,1,
                        sub.start.hours,
                        sub.start.minutes,
                        sub.start.seconds,
                        sub.start.milliseconds*1000)).split()[1]}
        self.archive.indexDictionary(str(uuid.uuid4()), D)
        pDialog.update(int(math.floor(100*i/total)))

      pDialog.update(100, 'Storing Database...')
      self.archive.store(lazy=False)

      # Remember which subtitle file this index was built from.
      with open(hash_path, "w") as f:
        f.write(digest)
    else:
      self.archive = Nucular.Nucular(index_dir, readOnly=True)

    pDialog.close()
Esempio n. 48
0
def _quote_from_markup(markup):
    """Reduce the serialized ``sodatext`` div of one IMDB quote to plain text.

    ``markup`` is the string produced by ``etree.tostring`` for the div.
    The result has the HTML tags and bracketed stage directions removed,
    speaker names rendered as ``Name: ``, and the remaining non-empty lines
    joined with ``\\n``.
    """
    s = re.sub(r"<div.*>\n", "", markup)
    s = re.sub(r"</div.*>", "", s)

    # speaker names: drop the link markup and keep "Name: "
    s = re.sub(r'<b><a.*">', "", s)
    s = re.sub(r"</a></b>:\n", ": ", s)

    # "share this quote" widgets
    s = re.sub(r"<p.*>.*</p>", "", s)
    s = re.sub(r"<span.*>.*</span>", "", s)

    s = re.sub(r"\[.*\]", "", s)  # stage directions
    s = re.sub(r"<br/>", "", s)
    s = s.replace("  ", " ")

    lines = [line.strip() for line in s.split("\n")]
    lines = [line for line in lines if line]
    if len(lines) == 1:
        # NOTE(review): a one-line quote loses its first character here,
        # presumably a leftover separator -- confirm against the page markup.
        lines[0] = lines[0][1:].strip()
    return "\n".join(lines)


def main():
    """Locate IMDB quotes in a movie's subtitles and write ``quotes.txt``.

    The working directory is switched to ``sys.argv[1]``, which is expected
    to contain:

    * ``quotes.htm``    -- saved IMDB quotes page; every div whose class is
      ``sodatext`` is treated as one quote.
    * ``subtitles.srt`` -- subtitles that supply the timecodes.
    * ``project.xml``   -- its root element carries ``fps`` and ``frames``.

    A quote is attached to a subtitle when the first line of the subtitle
    (lower-cased, punctuation removed, at least three words) occurs inside
    the equally normalized quote. Each line written to ``quotes.txt`` is
    ``<position>#<quote>`` where ``position`` is the subtitle start time
    divided by the movie length in seconds and line breaks inside the quote
    are replaced by ``#``. The same information is echoed to stdout.
    """
    os.chdir(sys.argv[1])

    # Pause for a manual confirmation; the answer itself is ignored.
    raw_input("are the subtitle timings correct?".upper())

    # ##### extract quotes from IMDB html-file #####
    with open(r"quotes.htm", "r") as html_file:
        root = etree.parse(html_file, etree.HTMLParser()).getroot()

    quotes = []
    for div in root.xpath("//div"):
        # Divs without a class attribute are simply not quote containers;
        # a lookup with a default avoids a blanket except around the parsing.
        if div.attrib.get("class") == "sodatext":
            quotes.append(_quote_from_markup(etree.tostring(div)))

    quotes = list(set(quotes))  # drop duplicates
    punctuation_run = re.compile("[%s]+" % re.escape(string.punctuation))
    quotes_clean = [punctuation_run.sub("", quote).lower().strip()
                    for quote in quotes]

    # ##### read subtitles from srt-file #####
    subs = SubRipFile.open('subtitles.srt')

    timecode_quote = {}
    for item in subs:
        normalized = punctuation_run.sub("", item.text).lower().strip()
        text = normalized.split("\n")[0]  # first line only
        if len(text.split(" ")) < 3:
            # we'd get a lot of false hits with only one or two words
            continue
        for original, cleaned in zip(quotes, quotes_clean):
            # A quote that is already placed keeps its first timecode.
            if text in cleaned and original not in timecode_quote.values():
                timecode_quote[str(item.start)] = original

    # ##### movie length #####
    movie = et.parse("project.xml").getroot()
    fps = float(movie.attrib["fps"])
    frames = float(movie.attrib["frames"])
    seconds = frames / fps

    # sort by timecode
    timecodes = sorted(timecode_quote)

    with open("quotes.txt", "w") as out_file:
        for tc in timecodes:
            quote = timecode_quote[tc]
            print("%.1f%% %s" % (100 * timecode_to_seconds(tc) / seconds, tc))
            print(quote)
            print("")
            out_file.write("%f#%s\n" % (timecode_to_seconds(tc) / seconds,
                                        quote.replace("\n", "#")))

    print("<< %d QUOTES >>" % len(timecodes))
    return
Esempio n. 49
0
def merge_video_subtitle(video_id):
    """
    Convert the Chinese and English vtt subtitles of a video to srt and
    merge them into a single srt subtitle file.

    The merged file is written to YOUTUBE_DOWNLOAD_DIR and its path is stored
    in ``video.subtitle_merge``.

    :param video_id: primary key of the Video whose subtitles are merged
    :return: path of the merged srt file, or False when the video lacks the
        Chinese or the English subtitle
    """
    video = Video.objects.get(pk=video_id)

    # Logical `and` instead of the original bitwise `&`: same outcome for
    # boolean comparisons, but it short-circuits and states the intent.
    if not (video.subtitle_cn != '' and video.subtitle_en != ''):
        return False

    # Settings default values
    delta = SubRipTime(milliseconds=500)
    encoding = "utf_8"

    # Some YouTube titles contain non-ASCII characters, or characters such as
    # "/" that cannot appear in a filename, so the title is passed through
    # Django's get_valid_filename().
    # Note: the result differs from what youtube-dl's restrictfilenames
    # produces, so these names may not match the ones stored in subtitle_cn /
    # subtitle_en. Dots in the title are kept.
    file_stem = '%s-%s' % (get_valid_filename(video.title), video.video_id)

    subs_cn_srt_path = os.path.join(YOUTUBE_DOWNLOAD_DIR,
                                    file_stem + '.cn.srt')
    subs_en_srt_path = os.path.join(YOUTUBE_DOWNLOAD_DIR,
                                    file_stem + '.en.srt')
    merge_subs_path = os.path.join(YOUTUBE_DOWNLOAD_DIR,
                                   file_stem + '.zh-Hans.en.srt')

    # Convert the vtt subtitles to srt. Both languages are handled the same
    # way: the converter is given the target path and that path is opened
    # afterwards. (Previously only the English branch replaced its path with
    # the converter's return value.)
    # NOTE(review): assumes convert_subtilte_format writes to `ass_file` --
    # the Chinese branch has always relied on that; confirm in the helper.
    convert_subtilte_format(srt_file=video.subtitle_cn.path,
                            ass_file=subs_cn_srt_path)
    convert_subtilte_format(srt_file=video.subtitle_en.path,
                            ass_file=subs_en_srt_path)

    subs_cn_srt = SubRipFile.open(subs_cn_srt_path, encoding=encoding)
    subs_en_srt = SubRipFile.open(subs_en_srt_path, encoding=encoding)
    merge_subs = merge_subtitle(subs_cn_srt, subs_en_srt, delta)

    merge_subs.save(merge_subs_path, encoding=encoding)

    video.subtitle_merge = merge_subs_path
    video.save(update_fields=['subtitle_merge'])
    return merge_subs_path
Esempio n. 50
0
def makeL1L2(L1_srt, L2_srt, out_srt, levels, save_sync, out_L1_utf8bom_srt, out_L2_utf8bom_srt,
             show_L2, encoding, L1_color, L1_size, L2_color, L2_size):
    """
    Combine the L1_srt and L2_srt subtitles and write one output file per level.

    The output path of a level is out_srt with "{{LEVEL}}" replaced by the level.
    When save_sync is truthy the synced L1/L2 subtitles are saved next to the
    inputs as "*.synced.srt".
    Each input is first copied to "<input>.utf8bom"; that copy is moved to
    out_L1_utf8bom_srt / out_L2_utf8bom_srt when the respective path is
    non-empty and deleted otherwise.
    L1_color, L1_size, L2_color and L2_size are handed to setSrtTemplates.
    """

    def settle_bom_copy(temp_path, requested_path):
        # Keep the temporary utf8-BOM copy under requested_path (replacing an
        # existing file) or throw it away when no path was requested.
        if not requested_path:
            os.remove(temp_path)
            return
        if os.path.isfile(requested_path):
            os.remove(requested_path)
        os.rename(temp_path, requested_path)

    for label, text in (("L1_srt: ", L1_srt), ("L2_srt: ", L2_srt),
                        ("show_L2: ", show_L2), ("encoding: ", encoding)):
        log(label + text)
    for label, value in (("save_sync: ", save_sync), ("levels: ", levels)):
        log(label, value)
    log("L1 color: {}, size: {}.".format(L1_color, L1_size))
    log("L2 color: {}, size: {}.".format(L2_color, L2_size))
    for label, value in (("out_L1_utf8bom_srt: ", out_L1_utf8bom_srt),
                         ("out_L2_utf8bom_srt: ", out_L2_utf8bom_srt)):
        log(label, value)

    setSrtTemplates(L1_color, L1_size, L2_color, L2_size)

    # try to decode and save as utf8-bom
    bom_copy_L1 = L1_srt + ".utf8bom"
    bom_copy_L2 = L2_srt + ".utf8bom"
    makeFileUtf8Bom(L1_srt, bom_copy_L1)
    makeFileUtf8Bom(L2_srt, bom_copy_L2)

    subs_L1, dupes, fixed, subs_L2 = syncSrts(SubRipFile.open(bom_copy_L1),
                                              SubRipFile.open(bom_copy_L2))

    if save_sync:
        synced_L1_path, synced_L2_path = [
            path.replace(".srt", ".synced.srt") for path in (L1_srt, L2_srt)]
        subs_L1.save(synced_L1_path, encoding=encoding)
        subs_L2.save(synced_L2_path, encoding=encoding)
        log("Saved {} and {}. Duplicate lines: {} Fixed: {}".format(
            synced_L1_path, synced_L2_path, dupes, fixed))

    # per-level bookkeeping: output path, output subtitles, hidden-line counter
    out_srts = {level: out_srt.replace("{{LEVEL}}", level) for level in levels}
    outs = {level: SubRipFile() for level in levels}
    removed_lines = {level: 0 for level in levels}

    for idx in range(len(subs_L2)):
        processSub(subs_L1[idx], subs_L2[idx], levels, outs, removed_lines,
                   show_L2)

    total_lines = len(subs_L2)
    for level in levels:
        if level != "0":
            criteria = level_criterias[level]
        else:
            criteria = 'none'
        summary = "level_criteria: {}. Hidden L1 lines: {} out of {}".format(
            criteria, removed_lines[level], total_lines)

        # the summary is appended as an extra subtitle shown for 1 ms at 0:00
        level_subs = outs[level]
        level_subs.append(
            SubRipItem(1, {'milliseconds': 0}, {'milliseconds': 1}, summary))
        level_subs.clean_indexes()
        level_subs.save(path=out_srts[level], encoding=encoding)
        log("Saved {}. {} ".format(out_srts[level], summary))

    settle_bom_copy(bom_copy_L1, out_L1_utf8bom_srt)
    settle_bom_copy(bom_copy_L2, out_L2_utf8bom_srt)