def clean(root, outputPath):
    """
    Clean every script yielded by utils.loadFiles(root) into outputPath.

    For each (file name, file object) pair the text is processed as
    follows: everything up to and including the "TIMESTAMPS" marker is
    dropped, lines containing "→" are removed, the text before the first
    match of nameRe is handed to getPreamble, removeFooter is applied, and
    the rest is split by splitDataBySpeechEvent.  The joined result is then
    run through the module-level and utils normalisation passes and written
    as UTF-8 to a file of the same name inside outputPath.

    Author's note: a little bit involved, but with a bit of planning it
    wasn't too difficult.  Fortunately, three games all used the same
    formatting, so this function yielded a lot of text.

    Args:
        root: passed to utils.loadFiles to locate the source files.
        outputPath: directory for the cleaned files; created if missing.

    Raises:
        ValueError: if a file has no text matching nameRe, i.e. the point
            where the preamble ends cannot be located.
    """
    END_OF_HEADER = "TIMESTAMPS"
    # Two or more newlines, one line, then a line starting with one of the
    # characters in the class.
    nameRe = re.compile("\n{2,}.*?\n[(「1]")

    # Creates missing parents and does not fail if the directory exists.
    os.makedirs(outputPath, exist_ok=True)

    for fn, fd in utils.loadFiles(root):
        data = fd.read()

        # Only cut the header when the marker is really there; a bare
        # find() result of -1 would otherwise chop arbitrary leading text.
        headerPos = data.find(END_OF_HEADER)
        if headerPos != -1:
            # The +1 also skips the single character following the marker.
            data = data[headerPos + len(END_OF_HEADER) + 1:]
        data = data.lstrip()

        # Remove lines containing an arrow in them
        data = removeLinesContaining(data, "→")

        match = nameRe.search(data)
        if match is None:
            raise ValueError(
                "%s: no text matching the first-speaker pattern" % fn
            )
        start = match.start()
        preamble = getPreamble(data[:start])
        data = data[start:]

        data = removeFooter(data)

        # Protect events by making them look like speech
        data = data.replace("\n・", "イベント\n「・")
        data = data.replace("\n\n1.", "\n\n選択\n「1.")

        epilogue = getEpilogue(data)

        outputList = [preamble] + splitDataBySpeechEvent(data)
        outputList.append(epilogue)

        outputData = "\n".join(outputList) + "\n"
        # NOTE(review): the original comment calls this "ideographic space"
        # (U+3000); confirm the literal below is that character and not an
        # ASCII space.
        outputData = outputData.replace(" ", "")  # Remove ideographic space

        outputData = setSpeakerBeforeDecisions(outputData)

        # Clean up the earlier protected 'events'
        outputData = removePlaceholderSpeech(outputData, "イベント:「・", "\n:・")
        outputData = removePlaceholderSpeech(outputData, "選択:「1.", "\n:1.")

        outputData = reduceWhitespaceBeforeNumbers(outputData)
        outputData = addColonBeforeDecisions(outputData)
        outputData = replaceSpecialCharacters(outputData)

        outputData = utils.removeQuoteMarksFromSpeech(outputData)
        outputData = utils.simplifyLongNewlineSequences(outputData)
        outputData = utils.addLeadingWhitespaceWhenSameSpeaker(
            outputData, True)
        outputData = utils.addSpaceAfterChoiceMarker(outputData)

        # Separate name for the output handle so the input handle from
        # utils.loadFiles is not rebound while the loop is running.
        with io.open(os.path.join(outputPath, fn), "w",
                     encoding="utf-8") as writeFd:
            writeFd.write(outputData)
def clean(root, outputPath):
    """
    Clean every script yielded by utils.loadFiles(root) into outputPath.

    For each file the first line is dropped, every 「...」 span is joined
    onto a single line, a line starting with 「 is attached to the previous
    line with ":", and everything from the first "TO BE" onwards is cut.
    The result goes through changeEventMarking,
    removeCharactersSurroundingSpeaker and the shared utils passes, and is
    written as UTF-8 under the same file name in outputPath.

    Author's post-clean-up thoughts:
    This has a lot going on.
    Changes in the scene are demarcated with < >.
    Keywords, sometimes the current speaker, and maybe narrations are
    decorated with 【 】
    A blank line represents a break in continuity or passage of time maybe?
    All dialog can be extracted by picking lines with ":" in them.

    Args:
        root: passed to utils.loadFiles to locate the source files.
        outputPath: directory for the cleaned files; created if missing.
    """
    # Creates missing parents and does not fail if the directory exists.
    os.makedirs(outputPath, exist_ok=True)

    for fn, fd in utils.loadFiles(root):
        data = fd.read()
        data = data.lstrip()

        # Drop the first line.  partition() yields "" when the file has no
        # newline at all, whereas slicing from find() == -1 would keep the
        # last character of the text.
        data = data.partition("\n")[2].strip()

        # Place quoted sections onto a single line
        start = 0
        while True:
            start = data.find("「", start)
            if start == -1:
                break
            end = data.find("」", start)
            if end == -1:
                # Unclosed quote: leave the rest alone instead of removing
                # every newline up to the end of the file.
                break
            speech = data[start:end].replace("\n", "")
            data = data[:start] + speech + data[end:]
            start = start + 1

        data = data.replace("\n「", ":「")
        # Speech directly following speech gets its own line with an empty
        # speaker field.
        data = data.replace("」:「", "」\n:「")
        data = data.split("TO BE")[0]
        data = data.strip() + "\n"

        data = changeEventMarking(data)
        data = removeCharactersSurroundingSpeaker(data)

        data = utils.removeQuoteMarksFromSpeech(data)
        data = utils.simplifyLongNewlineSequences(data)
        data = utils.addSpaceAfterChoiceMarker(data)
        data = utils.addLeadingWhitespaceWhenSameSpeaker(data, True)

        # Separate name for the output handle so the input handle from
        # utils.loadFiles is not rebound while the loop is running.
        with io.open(os.path.join(outputPath, fn), "w",
                     encoding="utf-8") as writeFd:
            writeFd.write(data)
def clean(root, outputPath):
    """
    Clean every page yielded by utils.loadFiles(root) into outputPath.

    The first line of each file is taken as the page title.  The text
    between the "セリフ&ナレーション" marker and "Back Next" is read as
    flattened table rows: every `step` lines form one row, whose first line
    is the speaker and whose last line is the text.  Files that do not
    contain the marker exactly once are skipped.  The rows are emitted as
    `speaker:「text」` lines under a ": ++<title>" heading, run through the
    shared utils passes and written as UTF-8 under the same file name.

    Author's post-clean-up thoughts: had I parsed the page as a table,
    rather than do a text dump, the process would have been simpler.  I had
    to manually pre-process the last page (credits).  Otherwise, some pages
    have a table with 2 columns and others have 3 columns.

    Args:
        root: passed to utils.loadFiles to locate the source files.
        outputPath: directory for the cleaned files; created if missing.
    """
    # Creates missing parents and does not fail if the directory exists.
    os.makedirs(outputPath, exist_ok=True)

    for fn, fd in utils.loadFiles(root):
        data = fd.read()
        data = data.lstrip()

        # partition() never raises: a file without any newline gives an
        # empty body, which is then skipped by the marker check below
        # instead of aborting the whole run with a ValueError.
        pageTitle, _, data = data.partition("\n")

        dataList = data.split("セリフ&ナレーション")
        if len(dataList) != 2:
            continue
        data = dataList[1]
        data = data.split("Back Next")[0].strip()
        data = data.replace("\n ", " ")

        # Rows span 2 lines by default; more than 5 "\n \n" sequences
        # switches to 3 lines per row (presumably the 3-column tables
        # mentioned in the author's note -- confirm).
        step = 2
        if data.count("\n \n") > 5:
            step = 3

        outputRows = [f": ++{pageTitle}"]
        dataList = data.split("\n")
        # First line of each row is the speaker, last line is the text;
        # zip() drops a trailing incomplete row.
        speakers = dataList[::step]
        texts = dataList[step - 1::step]
        outputRows.extend(
            f"{speaker}:「{text}」" for speaker, text in zip(speakers, texts)
        )

        data = "\n".join(outputRows) + "\n"

        data = utils.removeQuoteMarksFromSpeech(data)
        data = utils.simplifyLongNewlineSequences(data)
        data = utils.addSpaceAfterChoiceMarker(data)
        data = utils.removeReundantSpeakers(data)
        data = utils.addLeadingWhitespaceWhenSameSpeaker(data, True)

        # Separate name for the output handle so the input handle from
        # utils.loadFiles is not rebound while the loop is running.
        with io.open(os.path.join(outputPath, fn), "w",
                     encoding="utf-8") as writeFd:
            writeFd.write(data)
def _cleanSingleSection(text):
    """Apply the whole clean-up sequence to one section and return it."""
    # Module-level structural passes, in the order they must run.
    for structuralPass in (
        simpleCondenseLines,
        markChoices,
        condenseLines,
        isolateChoices,
        separateSpeakersAndSpeech,
    ):
        text = structuralPass(text)

    # Literal substitutions, applied top to bottom.
    substitutions = (
        (" ", ""),  # Scrub whitespace
        ("[END]", ""),
        (":\n:", ":"),  # Happens sometimes
        ("「", ""),  # No longer needed
        ("\n [", "\n["),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    text = fixDialogueSpacing(text)
    text = utils.simplifyLongNewlineSequences(text)
    text = utils.removeReundantSpeakers(text)
    text = utils.addSpaceAfterChoiceMarker(text)
    return utils.addLeadingWhitespaceWhenSameSpeaker(text, False)


def cleanSectionScript(sections):
    """
    Return a new list holding the cleaned version of every section.

    Each item of `sections` is passed through the module-level structural
    passes, a fixed set of literal replacements, fixDialogueSpacing and the
    shared utils passes.  The input order is preserved and an empty input
    yields an empty list.
    """
    return [_cleanSingleSection(section) for section in sections]
def clean(sourceFolder, outputFolder):
    """
    Clean every file yielded by utils.loadFiles(sourceFolder).

    Each file's text goes through the module-level passes
    (condenseWhitespace, removeNewlinesInContinuousSpeech, markEvents,
    separateSpeakersAndSpeech), has its spaces removed and is stripped,
    then goes through the shared utils passes.  The result is written as
    UTF-8 under the same file name inside outputFolder, which is created
    first if it does not exist.
    """
    if not os.path.exists(outputFolder):
        os.mkdir(outputFolder)

    for fileName, sourceFd in utils.loadFiles(sourceFolder):
        text = sourceFd.read()

        # Script-specific passes; the order matters.
        for scriptPass in (
            condenseWhitespace,
            removeNewlinesInContinuousSpeech,
            markEvents,
            separateSpeakersAndSpeech,
        ):
            text = scriptPass(text)

        text = text.replace(" ", "").strip()

        # Shared normalisation passes.
        for sharedPass in (
            utils.simplifyLongNewlineSequences,
            utils.addSpaceAfterChoiceMarker,
        ):
            text = sharedPass(text)
        text = utils.addLeadingWhitespaceWhenSameSpeaker(text, False)

        # `join` is whatever the file has in scope under that name
        # (presumably os.path.join -- confirm against the file's imports).
        destination = join(outputFolder, fileName)
        with io.open(destination, "w", encoding="utf-8") as writeFd:
            writeFd.write(text)
def clean(root, outputPath):
    """
    Clean every script yielded by utils.loadFiles(root) into outputPath.

    For each file: lines matched by locationRe (a line set off by blank
    lines that contains none of the characters in the pattern's class) get
    a "++" prefix, a line starting with "(" is attached to the previous
    line with ":", every 「...」 span is joined onto a single line, and a
    line starting with 「 is attached to the previous line with ":".  After
    markChoices, correctNumberOfQuotes and the shared utils passes, the
    text is cut at the first "*****" and at the first "++私的好き台詞++",
    and the first line is wrapped as "++<line>++".  The result is written
    as UTF-8 under the same file name in outputPath.

    Author's post-clean-up thoughts:
    Seems similar to FF7 but with more whitespace.
    Lots of inner monologue from Squall?

    Args:
        root: passed to utils.loadFiles to locate the source files.
        outputPath: directory for the cleaned files; created if missing.
    """
    locationRe = re.compile("\n\n[^\n((『「①②③+]+\n\n")

    # Creates missing parents and does not fail if the directory exists.
    os.makedirs(outputPath, exist_ok=True)

    for fn, fd in utils.loadFiles(root):
        data = fd.read()
        data = data.strip()

        # For unmarked locations, mark them.  A manual search loop is used
        # (rather than one sub() call) so the blank line that ends one
        # match can also begin the next match.
        start = 0
        while True:
            match = locationRe.search(data, start)
            if match is None:
                break
            start = match.start() + 2  # pass the 2 leading newlines
            data = data[:start] + "++" + data[start:]
            start += 2  # pass the 2 inserted characters

        data = data.replace("\n(", ":(")

        # Place quoted sections onto a single line
        start = 0
        while True:
            start = data.find("「", start)
            if start == -1:
                break
            end = data.find("」", start)
            if end == -1:
                # Unclosed quote: leave the rest alone instead of removing
                # every newline up to the end of the file.
                break
            speech = data[start:end].replace("\n", "")
            data = data[:start] + speech + data[end:]
            start = start + 1

        data = data.replace("\n「", ":「")
        # Speech directly following speech gets its own line with an empty
        # speaker field.
        data = data.replace("」:「", "」\n:「")
        data = data.replace("\n\n", "\n")

        data = markChoices(data)
        data = correctNumberOfQuotes(data)

        data = utils.removeQuoteMarksFromSpeech(data)
        data = utils.simplifyLongNewlineSequences(data)
        data = utils.addSpaceAfterChoiceMarker(data)
        data = utils.addLeadingWhitespaceWhenSameSpeaker(data, False)

        # Keep only the text before these trailing-section markers.
        data = data.split("*****")[0]
        data = data.split("++私的好き台詞++")[0]
        data = data.strip() + "\n"

        # Wrap the first line as "++<line>++".
        data = "++" + data
        data = data.replace("\n", "++\n", 1)

        # Separate name for the output handle so the input handle from
        # utils.loadFiles is not rebound while the loop is running.
        with io.open(os.path.join(outputPath, fn), "w",
                     encoding="utf-8") as writeFd:
            writeFd.write(data)