コード例 #1
ファイル: subtitle_sync.py プロジェクト: delameko/cinemetrics
def main():
	"""Re-time the project's subtitle file with a linear correction.

	Two hard-coded reference pairs give a timecode as it appears in the
	subtitle file (``tc_N``) and the timecode it should have
	(``tc_N_real``).  A straight line fitted through both pairs yields a
	``speed`` factor and an ``offset``; every cue's start and end are then
	mapped through ``corrected = original / speed + offset``.

	Reads ``projects/<PROJECT>/subtitles.srt~`` and writes the corrected
	cues to ``projects/<PROJECT>/subtitles.srt`` encoded as UTF-8.
	"""
	# Local import so the block stays self-contained; os.path.join replaces
	# the hard-coded "\\" separators, which only resolved on Windows.
	import os.path

	# Reference pair 1 (early in the movie) and pair 2 (late in the movie).
	tc_1 = timecode_to_seconds("00:00:53,840")
	tc_1_real = timecode_to_seconds("00:00:49,000")
	tc_2 = timecode_to_seconds("01:27:22,640")
	tc_2_real = timecode_to_seconds("01:27:18,000")

	# Slope of subtitle time against real time, then the intercept that
	# makes pair 1 map exactly onto its real value.
	speed = (tc_1 - tc_2) / (tc_1_real - tc_2_real)
	offset = tc_1_real - (tc_1 / speed)

	def resync(timecode):
		"""Map one cue time through the linear correction.

		The result is whatever seconds_to_timecode() produces for the
		corrected number of seconds.
		"""
		seconds = timecode_to_seconds(str(timecode))
		return seconds_to_timecode((seconds / speed) + offset)

	project_dir = os.path.join("projects", PROJECT)
	file_orig = pysrt.SubRipFile.open(os.path.join(project_dir, "subtitles.srt~"))
	for sub in file_orig:
		sub.start = resync(sub.start)
		sub.end = resync(sub.end)
	file_orig.save(os.path.join(project_dir, "subtitles.srt"), "utf-8")
コード例 #2
def main():
	# Python 2 code (raw_input, print statements).
	# Overview of what is visible in this excerpt:
	#   1. parse quotes.htm and reduce each <div class="sodatext"> to plain text,
	#   2. match the first line of every subtitle in subtitles.srt against those quotes,
	#   3. write each matched quote with its relative position in the movie to quotes.txt.
	# Shows the prompt in upper case and waits for Enter; the typed answer is discarded.
	raw_input("are the subtitle timings correct?".upper())
	# ##### extract quotes from IMDB html-file ###################################################
	# NOTE(review): no close() for this handle is visible in the excerpt; the name f is
	# rebound further down when quotes.txt is opened.
	f = open(r"quotes.htm", "r")
	parser = etree.HTMLParser()
	tree = etree.parse(f, parser)
	root = tree.getroot()

	quotes = []
	for div in root.xpath("//div"):
			# NOTE(review): indexing attrib looks like it would raise KeyError for a <div>
			# without a class attribute (presumably lxml's dict-like attrib) - confirm.
			c = div.attrib["class"]
			if c == "sodatext":
				# Serialise the element back to markup and peel the tags off with regexes.
				s = etree.tostring(div)
				# Remove the opening <div ...> line and the closing </div> tag.
				s = re.sub("\<div.*\>\n", "", s)
				s = re.sub("\</div.*\>", "", s)
				#s = re.sub("\<b\>.*\</b\>:\n", "- ", s) # names
				# Drop the <b><a ..."> in front of a name and turn the trailing
				# </a></b>:<newline> into ": " so name and line share one text line.
				s = re.sub("\<b\>\<a.*\"\>", "", s)
				s = re.sub("\</a\>\</b\>:\n", ": ", s)
				# share this quote
				# Remove whole <p>...</p> and <span>...</span> elements that sit on one line.
				s = re.sub("\<p.*\>.*\</p\>", "", s)
				s = re.sub("\<span.*\>.*\</span\>", "", s)
				s = re.sub("\[.*\]", "", s) # stage directions
				s = re.sub("\<br/\>", "", s)
				# Single pass: each pair of spaces becomes one space.
				s = re.sub("  ", " ", s)
				# Keep only the non-empty lines, stripped of surrounding whitespace.
				lines = [line.strip() for line in s.split("\n")]
				lines = [line for line in lines if len(line) > 0]
				# A quote that ends up as a single line loses its first character
				# (presumably a leading separator - verify against real input).
				if len(lines) == 1:
					lines[0] = lines[0][1:].strip()
				quote = "\n".join(lines)
				# NOTE(review): this excerpt appears truncated at this point. The
				# triple-quoted string opened below is never closed within the function
				# and no visible line appends `quote` to `quotes`.
				# #######
				'''if len(quote) >= QUOTE_MIN_LEN and len(quote) <= QUOTE_MAX_LEN:
				# #######
	quotes = list( set(quotes) )
	quotes_clean = [re.sub("[%s]+" % re.escape(string.punctuation), "", x) for x in quotes]
	quotes_clean = [x.lower().strip() for x in quotes_clean]
	"""for quote in quotes_clean:
		print quote, "\n" """
	# ##### read subtitles from srt-file ###################################################
	subs = SubRipFile.open('subtitles.srt')
	"""for sub in subs:
		#print sub.from_string()
		print sub.index
		#print sub.shift()
		print sub.start
		print sub.end
		print sub.text
		print "\n" """
	#print dir(subs)
	timecode_quote = {}
	for item in subs:
		item.text = re.sub("[%s]+" % re.escape(string.punctuation), "", item.text)
		item.text = item.text.lower().strip()
		text = item.text.split("\n")[0] # first line only
		for i, quote in enumerate(quotes_clean):
			if len(text.split(" ")) >= 3 and text in quote: # we'll get a lot of false hits with only one word :/
				if quotes[i] not in timecode_quote.values():
					timecode_quote[str(item.start)] = quotes[i]
	# #####  ###################################################
	tree = et.parse("project.xml")
	movie = tree.getroot()
	fps = float( movie.attrib["fps"] )
	frames = float( movie.attrib["frames"] )
	seconds = frames / fps
	#print seconds
	"""start_frame = float( movie.attrib["start_frame"] )
	start_sec = startframe / fps"""
	# sort by timecode
	timecodes = timecode_quote.keys()
	f = open("quotes.txt", "w")
	for tc in timecodes:
		#print tc
		print "%.1f" % (100 * timecode_to_seconds(tc) / seconds) + "%", tc
		print timecode_quote[tc]
		print ""
		f.write("%f#%s\n" % (timecode_to_seconds(tc) / seconds, timecode_quote[tc].replace("\n", "#")))
	print "<<", len(timecodes), "QUOTES >>"
	#raw_input("- done -")
コード例 #3
def main():
    # Python 2 code (raw_input, print statements).
    # Visible stages: reduce every <div class="sodatext"> in quotes.htm to plain
    # text, match subtitle first lines from subtitles.srt against those quotes,
    # then write the matches with their relative movie position to quotes.txt.
    # NOTE(review): this excerpt is not syntactically complete. The '''-string
    # opened in the div loop is never closed, no visible line appends `quote` to
    # `quotes`, the `quotes_clean = [` comprehension has no closing bracket, and
    # the `re.sub(` call in the subtitle loop is missing its last argument line.

    # Shows the prompt in upper case and waits for Enter; the answer is discarded.
    raw_input("are the subtitle timings correct?".upper())

    # ##### extract quotes from IMDB html-file ###################################################
    # NOTE(review): no close() for this handle is visible in the excerpt; the
    # name f is rebound further down when quotes.txt is opened.
    f = open(r"quotes.htm", "r")
    parser = etree.HTMLParser()
    tree = etree.parse(f, parser)
    root = tree.getroot()

    quotes = []
    for div in root.xpath("//div"):
            # NOTE(review): indexing attrib looks like it would raise KeyError for
            # a <div> without a class attribute (presumably lxml) - confirm.
            c = div.attrib["class"]
            if c == "sodatext":
                # Serialise the element to markup, then strip tags with regexes.
                s = etree.tostring(div)
                # Remove the opening <div ...> line and the closing </div> tag.
                s = re.sub("\<div.*\>\n", "", s)
                s = re.sub("\</div.*\>", "", s)

                #s = re.sub("\<b\>.*\</b\>:\n", "- ", s) # names
                # Drop the <b><a ..."> before a name; turn </a></b>:<newline>
                # into ": " so the name and its line share one text line.
                s = re.sub("\<b\>\<a.*\"\>", "", s)
                s = re.sub("\</a\>\</b\>:\n", ": ", s)

                # share this quote
                # Remove single-line <p>...</p> and <span>...</span> elements.
                s = re.sub("\<p.*\>.*\</p\>", "", s)
                s = re.sub("\<span.*\>.*\</span\>", "", s)

                s = re.sub("\[.*\]", "", s)  # stage directions
                s = re.sub("\<br/\>", "", s)
                # Single pass: each pair of spaces becomes one space.
                s = re.sub("  ", " ", s)
                # Keep only non-empty lines, stripped of surrounding whitespace.
                lines = [line.strip() for line in s.split("\n")]
                lines = [line for line in lines if len(line) > 0]
                # A one-line quote loses its first character (presumably a
                # leading separator - verify against real input).
                if len(lines) == 1:
                    lines[0] = lines[0][1:].strip()
                quote = "\n".join(lines)
                # #######
                '''if len(quote) >= QUOTE_MIN_LEN and len(quote) <= QUOTE_MAX_LEN:
                # #######

    quotes = list(set(quotes))
    quotes_clean = [
        re.sub("[%s]+" % re.escape(string.punctuation), "", x) for x in quotes
    quotes_clean = [x.lower().strip() for x in quotes_clean]
    """for quote in quotes_clean:
		print quote, "\n" """

    # ##### read subtitles from srt-file ###################################################
    subs = SubRipFile.open('subtitles.srt')
    """for sub in subs:
		#print sub.from_string()
		print sub.index
		#print sub.shift()
		print sub.start
		print sub.end
		print sub.text
		print "\n" """
    #print dir(subs)

    timecode_quote = {}
    for item in subs:
        item.text = re.sub("[%s]+" % re.escape(string.punctuation), "",
        item.text = item.text.lower().strip()
        text = item.text.split("\n")[0]  # first line only

        for i, quote in enumerate(quotes_clean):
            if len(
                    text.split(" ")
            ) >= 3 and text in quote:  # we'll get a lot of false hits with only one word :/
                if quotes[i] not in timecode_quote.values():
                    timecode_quote[str(item.start)] = quotes[i]

    # #####  ###################################################
    tree = et.parse("project.xml")
    movie = tree.getroot()
    fps = float(movie.attrib["fps"])
    frames = float(movie.attrib["frames"])
    seconds = frames / fps
    #print seconds
    """start_frame = float( movie.attrib["start_frame"] )
	start_sec = startframe / fps"""

    # sort by timecode
    timecodes = timecode_quote.keys()

    f = open("quotes.txt", "w")
    for tc in timecodes:
        #print tc
        print "%.1f" % (100 * timecode_to_seconds(tc) / seconds) + "%", tc
        print timecode_quote[tc]
        print ""
        f.write("%f#%s\n" % (timecode_to_seconds(tc) / seconds,
                             timecode_quote[tc].replace("\n", "#")))

    print "<<", len(timecodes), "QUOTES >>"

    #raw_input("- done -")