Exemple #1
0
def clean(root, outputPath):
    """
    Clean every raw script under root and write the result into outputPath.

    A little bit involved but with a bit of planning, it wasn't too difficult.
    Fortunately, three games all used the same formatting, so this function
    yielded a lot of text.

    For each (file name, file object) pair yielded by utils.loadFiles(root)
    the text is read, everything up to the "TIMESTAMPS" header marker is
    dropped, the leading section before the first speaker is split off and
    passed through getPreamble(), and the remainder goes through the chain
    of clean-up helpers below.  The output is written as UTF-8 to a file of
    the same name inside outputPath.

    Raises ValueError if a file has no recognizable start of dialogue.
    """
    END_OF_HEADER = "TIMESTAMPS"
    # Two or more newlines, one line of text, then a line that opens with
    # one of the characters in the bracket class.
    nameRe = re.compile("\n{2,}.*?\n[(「1]")

    # makedirs creates missing parent folders and tolerates an existing one
    os.makedirs(outputPath, exist_ok=True)

    for fn, fd in utils.loadFiles(root):
        data = fd.read()

        # Only cut the header when the marker is really present: str.find()
        # returns -1 on a miss, and slicing from that offset would silently
        # eat the first few characters of the script.
        headerEnd = data.find(END_OF_HEADER)
        if headerEnd != -1:
            data = data[headerEnd + len(END_OF_HEADER) + 1:]
        data = data.lstrip()

        # Remove lines containing an arrow in them
        data = removeLinesContaining(data, "→")

        match = nameRe.search(data)
        if match is None:
            # Fail with the offending file name instead of an AttributeError
            raise ValueError(
                f"{fn}: could not find the start of the dialogue")
        start = match.start()
        preamble = getPreamble(data[:start])
        data = data[start:]

        data = removeFooter(data)

        # Protect events by making them look like speech
        data = data.replace("\n・", "イベント\n「・")
        data = data.replace("\n\n1.", "\n\n選択\n「1.")

        epilogue = getEpilogue(data)
        outputList = [preamble]
        outputList.extend(splitDataBySpeechEvent(data))
        outputList.append(epilogue)
        outputData = "\n".join(outputList) + "\n"
        outputData = outputData.replace(" ", "")  # Remove ideographic space

        outputData = setSpeakerBeforeDecisions(outputData)

        # Clean up the earlier protected 'events'
        outputData = removePlaceholderSpeech(outputData, "イベント:「・", "\n:・")
        outputData = removePlaceholderSpeech(outputData, "選択:「1.", "\n:1.")

        outputData = reduceWhitespaceBeforeNumbers(outputData)
        outputData = addColonBeforeDecisions(outputData)
        outputData = replaceSpecialCharacters(outputData)

        outputData = utils.removeQuoteMarksFromSpeech(outputData)
        outputData = utils.simplifyLongNewlineSequences(outputData)
        outputData = utils.addLeadingWhitespaceWhenSameSpeaker(
            outputData, True)
        outputData = utils.addSpaceAfterChoiceMarker(outputData)

        # Separate name for the output handle so the input handle from the
        # loop is not shadowed.
        with io.open(os.path.join(outputPath, fn), "w",
                     encoding="utf-8") as outFd:
            outFd.write(outputData)
def clean(root, outputPath):
    """
    Clean every raw script under root and write the result into outputPath.

    For each (file name, file object) pair yielded by utils.loadFiles(root)
    the first line is dropped, quoted speech is joined onto a single line
    and attached to the line before it with ":", everything from the first
    "TO BE" onward is cut, and the text is passed through the remaining
    clean-up helpers.  The output is written as UTF-8 to a file of the same
    name inside outputPath.

    Post-clean up thoughts
        This has a lot going on.  Changes in the scene are demarcated with < >.
        Keywords, sometimes the current speaker, and maybe narrations are decorated
        with 【 】
        A blank line represents a break in continuity or passage of time maybe?

        All dialog can be extracted by picking lines with ":" in them.
    """

    # makedirs creates missing parent folders and tolerates an existing one
    os.makedirs(outputPath, exist_ok=True)

    for fn, fd in utils.loadFiles(root):
        data = fd.read()

        # Drop the first line.  partition() yields an empty remainder for a
        # single-line file, where slicing from find()'s -1 would have kept
        # a stray last character.
        _, _, data = data.lstrip().partition("\n")
        data = data.strip()

        # Place quoted sections onto a single line
        start = 0
        while True:
            start = data.find("「", start)
            if start == -1:
                break
            end = data.find("」", start)
            if end == -1:
                # Unterminated quote: no closing mark exists past this
                # point, so stop rather than let the -1 slice collapse the
                # whole rest of the file onto one line.
                break

            speech = data[start:end].replace("\n", "")
            data = data[:start] + speech + data[end:]

            start += 1

        # A quote that starts a line belongs to the line before it
        data = data.replace("\n「", ":「")
        # Back-to-back quotes: put the second one on its own line
        data = data.replace("」:「", "」\n:「")
        data = data.split("TO BE")[0]
        data = data.strip() + "\n"

        data = changeEventMarking(data)
        data = removeCharactersSurroundingSpeaker(data)

        data = utils.removeQuoteMarksFromSpeech(data)
        data = utils.simplifyLongNewlineSequences(data)
        data = utils.addSpaceAfterChoiceMarker(data)
        data = utils.addLeadingWhitespaceWhenSameSpeaker(data, True)

        # Separate name for the output handle so the input handle from the
        # loop is not shadowed.
        with io.open(os.path.join(outputPath, fn), "w",
                     encoding="utf-8") as outFd:
            outFd.write(data)
Exemple #3
0
def clean(root, outputPath):
    """
    Clean every raw page dump under root and write the result into outputPath.

    For each (file name, file object) pair yielded by utils.loadFiles(root)
    the first line is taken as the page title, the text between the
    "セリフ&ナレーション" marker and "Back  Next" is split into
    fixed-size groups of lines, and each group becomes one
    speaker:「text」 row.  Files that do not contain the marker exactly
    once are skipped.  The output is written as UTF-8 to a file of the same
    name inside outputPath.

    Post-clean up thoughts
    -- had I parsed the page as a table, rather than do a text dump,
       the process would have been simpler. I had to manually
       pre-process the last page (credits). Otherwise, some pages have
       a table with 2 columns and others have 3 columns.
    """

    # makedirs creates missing parent folders and tolerates an existing one
    os.makedirs(outputPath, exist_ok=True)

    for fn, fd in utils.loadFiles(root):
        data = fd.read()

        # partition() never raises: a file without any newline just yields
        # an empty body, which the marker check below then skips like any
        # other unusable file.
        pageTitle, _, data = data.lstrip().partition("\n")
        dataList = data.split("セリフ&ナレーション")
        if len(dataList) != 2:
            continue
        data = dataList[1]
        data = data.split("Back  Next")[0].strip()
        data = data.replace("\n  ", "  ")

        # Number of lines per record; many "\n \n" sequences switch to
        # 3-line records (presumably the 3-column table layout -- confirm).
        step = 2
        if data.count("\n \n") > 5:
            step = 3

        outputRows = [f": ++{pageTitle}"]
        dataList = data.split("\n")
        # The first line of each record is the speaker, the last line is
        # the text; a trailing partial record is ignored.
        for i in range(0, len(dataList) - step + 1, step):
            speaker = dataList[i]
            text = dataList[i + step - 1]
            outputRows.append(f"{speaker}:「{text}」")

        data = "\n".join(outputRows) + "\n"

        data = utils.removeQuoteMarksFromSpeech(data)
        data = utils.simplifyLongNewlineSequences(data)
        data = utils.addSpaceAfterChoiceMarker(data)
        data = utils.removeReundantSpeakers(data)
        data = utils.addLeadingWhitespaceWhenSameSpeaker(data, True)

        # Separate name for the output handle so the input handle from the
        # loop is not shadowed.
        with io.open(os.path.join(outputPath, fn), "w",
                     encoding="utf-8") as outFd:
            outFd.write(data)
def cleanSectionScript(sections):
    """
    Run every section through the clean-up pipeline.

    Takes an iterable of section strings and returns a new list holding the
    cleaned version of each one, in the same order.
    """

    def _cleanOne(text):
        # Structural passes, applied in this exact order
        for step in (
            simpleCondenseLines,
            markChoices,
            condenseLines,
            isolateChoices,
            separateSpeakersAndSpeech,
        ):
            text = step(text)

        # Literal substitutions, also order-sensitive
        for old, new in (
            (" ", ""),  # Scrub whitespace
            ("[END]", ""),
            (":\n:", ":"),  # Happens sometimes
            ("「", ""),  # No longer needed
            ("\n [", "\n["),
        ):
            text = text.replace(old, new)

        text = fixDialogueSpacing(text)
        text = utils.simplifyLongNewlineSequences(text)
        text = utils.removeReundantSpeakers(text)
        text = utils.addSpaceAfterChoiceMarker(text)
        return utils.addLeadingWhitespaceWhenSameSpeaker(text, False)

    return [_cleanOne(section) for section in sections]
Exemple #5
0
def clean(sourceFolder, outputFolder):
    """
    Clean every script in sourceFolder and write the result to outputFolder.

    Each (file name, file object) pair yielded by utils.loadFiles() is read,
    passed through the clean-up steps below and written as UTF-8 to a file
    of the same name inside outputFolder, which is created when missing.
    """
    if not os.path.exists(outputFolder):
        os.mkdir(outputFolder)

    for fn, fd in utils.loadFiles(sourceFolder):
        text = fd.read()

        # Structural passes, applied in this exact order
        for step in (
            condenseWhitespace,
            removeNewlinesInContinuousSpeech,
            markEvents,
            separateSpeakersAndSpeech,
        ):
            text = step(text)

        text = text.replace(" ", "").strip()

        text = utils.simplifyLongNewlineSequences(text)
        text = utils.addSpaceAfterChoiceMarker(text)
        text = utils.addLeadingWhitespaceWhenSameSpeaker(text, False)

        targetPath = join(outputFolder, fn)
        with io.open(targetPath, "w", encoding="utf-8") as writeFd:
            writeFd.write(text)
def clean(root, outputPath):
    """
    Clean every raw script under root and write the result into outputPath.

    For each (file name, file object) pair yielded by utils.loadFiles(root)
    unmarked location lines are prefixed with "++", quoted speech is joined
    onto a single line and attached to the line before it with ":", the
    text is passed through the remaining clean-up helpers, and the first
    line is wrapped in "++ ... ++".  The output is written as UTF-8 to a
    file of the same name inside outputPath.

    Post-clean up thoughts
        Seems similar to FF7 but with more whitespace.
        Lots of inner monologue from Squall?
    """
    # A stand-alone line between blank lines that contains none of the
    # excluded characters is treated as a location.
    locationRe = re.compile("\n\n[^\n((『「①②③+]+\n\n")

    # makedirs creates missing parent folders and tolerates an existing one
    os.makedirs(outputPath, exist_ok=True)

    for fn, fd in utils.loadFiles(root):
        data = fd.read()

        data = data.strip()

        # For unmarked locations, mark them
        start = 0
        while True:
            match = locationRe.search(data, start)
            if match is None:
                break
            start = match.start() + 2  # pass the 2 leading newlines

            data = data[:start] + "++" + data[start:]
            start += 2  # pass the 2 inserted characters

        data = data.replace("\n(", ":(")

        # Place quoted sections onto a single line
        start = 0
        while True:
            start = data.find("「", start)
            if start == -1:
                break
            end = data.find("」", start)
            if end == -1:
                # Unterminated quote: no closing mark exists past this
                # point, so stop rather than let the -1 slice collapse the
                # whole rest of the file onto one line.
                break

            speech = data[start:end].replace("\n", "")
            data = data[:start] + speech + data[end:]

            start += 1

        # A quote that starts a line belongs to the line before it
        data = data.replace("\n「", ":「")
        # Back-to-back quotes: put the second one on its own line
        data = data.replace("」:「", "」\n:「")
        data = data.replace("\n\n", "\n")

        data = markChoices(data)
        data = correctNumberOfQuotes(data)
        data = utils.removeQuoteMarksFromSpeech(data)
        data = utils.simplifyLongNewlineSequences(data)
        data = utils.addSpaceAfterChoiceMarker(data)
        data = utils.addLeadingWhitespaceWhenSameSpeaker(data, False)

        # Cut everything from either of these markers onward
        data = data.split("*****")[0]
        data = data.split("++私的好き台詞++")[0]
        data = data.strip() + "\n"

        # Wrap the first line in "++ ... ++"
        data = "++" + data
        data = data.replace("\n", "++\n", 1)

        # Separate name for the output handle so the input handle from the
        # loop is not shadowed.
        with io.open(os.path.join(outputPath, fn), "w",
                     encoding="utf-8") as outFd:
            outFd.write(data)