def extractDialogWithSudachi(rootDir):
    # Requires SudachiPy: from sudachipy import dictionary, tokenizer
    outputDir = utils.getOutputPath(rootDir, "stats")
    tokenizer_obj = dictionary.Dictionary().create()
    mode = tokenizer.Tokenizer.SplitMode.C  # mode C: longest-unit segmentation

    unigrams = []
    bigrams = []
    trigrams = []
    fourgrams = []
    # Keep only content words: noun, verb, adverb, adjective, adnominal,
    # and adjectival noun
    POS_LIST = ["名詞", "動詞", "副詞", "形容詞", "連体詞", "形状詞"]
    for fn, fd in utils.loadFiles(rootDir):
        for line in fd:
            line = line.strip()
            wordList = []
            for word in tokenizer_obj.tokenize(line, mode):
                if word.part_of_speech()[0] not in POS_LIST:
                    continue
                wordList.append(
                    (word.dictionary_form(), word.part_of_speech()[0]))
                # Debug output: surface form, dictionary form, and POS
                print([
                    word.surface(),
                    word.dictionary_form(),
                    word.part_of_speech()[0]
                ])

            unigrams.extend(getChunks(wordList, 1))
            bigrams.extend(getChunks(wordList, 2))
            trigrams.extend(getChunks(wordList, 3))
            fourgrams.extend(getChunks(wordList, 4))

    _output(outputDir, unigrams, bigrams, trigrams, fourgrams)
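
# Both extractDialogWithSudachi and extractDialogWithMecab (below) rely on a
# getChunks helper that is not shown here. A minimal sketch of what its usage
# implies, namely a sliding-window n-gram chunker; this implementation is an
# assumption, not the repository's actual code:
def getChunks(wordList, n):
    # Assumed behaviour: return every contiguous n-gram in wordList as a tuple
    return [tuple(wordList[i:i + n]) for i in range(len(wordList) - n + 1)]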
Example #2
    def collect_tg_execution(self, _dirpath, _outputname, _args):
        # Requires: from tqdm import tqdm
        # parameter passing
        _pattern = _args.pattern if _args.pattern is not None else r'(\d+)-TG_ESAIL_SAFE_T(\d)_parallel\.log'
        # _pattern = _args.pattern if _args.pattern is not None else r'(\d+)-TG_ESAIL_SAFE_bf(\d)_parallel\.log'
        _nPartition = _args.Nums if _args.Nums is not None else 5

        # list the target log files matching the pattern
        files = utils.loadFiles([{'path':_dirpath}], _ptn=_pattern, _sort=True)

        # output
        output = open(_outputname, "w")
        output.write("JobID,PartitionID,seq,time\n")

        # progress reporting
        progress = tqdm(desc='Collecting data', total=len(files), unit=' #', postfix=None)
        prev = 0
        cnt = 0
        for item in files:
            # cnt counts successive runs within the same partition,
            # resetting whenever the partition index changes
            if prev == int(item[1]):
                cnt += 1
            else:
                cnt = 0
                prev = int(item[1])

            try:
                header = '%d, %s' % (item['jobID'], item[1])
                self.collect_logtime(item['path'], output, header, cnt*10)
            except Exception as e:
                print('Failed to get information: run%02d (%s)' % (int(item['Run']), e))
            progress.update(1)
            progress.set_postfix_str(item['path'])
        progress.close()
        output.close()
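
# How the default filename pattern above decomposes a log name; the filename
# here is hypothetical, invented only to illustrate the two capture groups:
import re
m = re.match(r'(\d+)-TG_ESAIL_SAFE_T(\d)_parallel\.log',
             '0042-TG_ESAIL_SAFE_T3_parallel.log')
print(m.group(1), m.group(2))  # '0042' (job ID) and '3' (partition index)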
def extractDialog(rootDir):

    outputDir = utils.getOutputPath(rootDir, "dialog")

    for fn, fd in utils.loadFiles(rootDir):
        sentenceList = []
        for line in fd:
            if ":「" not in line:
                continue
            line = line.split(":「")[1].rstrip().rstrip("」")

            for punctuation in NON_FINAL_PUNCTUATION:
                line = line.replace(punctuation, "")

            tmpSentenceList = [line]
            for punctuation in SENTENCE_FINAL_PUNCTUATION:
                subTmpSentenceList = []
                for sentence in tmpSentenceList:
                    subTmpSentenceList.extend(sentence.split(punctuation))
                tmpSentenceList = [
                    s.strip() for s in subTmpSentenceList if s != ""
                ]
            sentenceList.extend(tmpSentenceList)

        with io.open(os.path.join(outputDir, fn), "w", encoding="utf-8") as outFd:
            for line in sentenceList:
                outFd.write(line + "\n")
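
# A toy trace of the punctuation cascade above, assuming
# SENTENCE_FINAL_PUNCTUATION (defined elsewhere in the module) contains
# "。" and "!"; note that each split consumes the punctuation mark itself:
#   ["おはよう。元気!また明日"]
#   split on "。" -> ["おはよう", "元気!また明日"]
#   split on "!"  -> ["おはよう", "元気", "また明日"]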
Example #4
def clean(root, outputPath):
    """
    A little bit involved but with a bit of planning, it wasn't too difficult.
    Fortunately, three games all used the same formatting, so this function
    yielded a lot of text.
    """
    END_OF_HEADER = "TIMESTAMPS"
    nameRe = re.compile("\n{2,}.*?\n[(「1]")

    if not os.path.exists(outputPath):
        os.mkdir(outputPath)

    for fn, fd in utils.loadFiles(root):
        data = fd.read()

        start = data.find(END_OF_HEADER) + len(END_OF_HEADER) + 1
        data = data[start:].lstrip()

        # Remove lines containing an arrow
        data = removeLinesContaining(data, "→")

        start = nameRe.search(data).start()
        preamble = getPreamble(data[:start])
        data = data[start:]

        data = removeFooter(data)

        # Protect events by making them look like speech
        data = data.replace("\n・", "イベント\n「・")
        data = data.replace("\n\n1.", "\n\n選択\n「1.")

        epilogue = getEpilogue(data)
        outputList = [
            preamble,
        ] + splitDataBySpeechEvent(data)
        outputList.append(epilogue)
        outputData = "\n".join(outputList) + "\n"
        outputData = outputData.replace("\u3000", "")  # Remove ideographic space

        outputData = setSpeakerBeforeDecisions(outputData)

        # Clean up the earlier protected 'events'
        outputData = removePlaceholderSpeech(outputData, "イベント:「・", "\n:・")
        outputData = removePlaceholderSpeech(outputData, "選択:「1.", "\n:1.")

        outputData = reduceWhitespaceBeforeNumbers(outputData)
        outputData = addColonBeforeDecisions(outputData)
        outputData = replaceSpecialCharacters(outputData)

        outputData = utils.removeQuoteMarksFromSpeech(outputData)
        outputData = utils.simplifyLongNewlineSequences(outputData)
        outputData = utils.addLeadingWhitespaceWhenSameSpeaker(
            outputData, True)
        outputData = utils.addSpaceAfterChoiceMarker(outputData)

        with io.open(os.path.join(outputPath, fn), "w",
                     encoding="utf-8") as outFd:
            outFd.write(outputData)
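
# The protect/unprotect trick above disguises event bullets and numbered
# choices as speech so that the speech-oriented passes leave them intact,
# then restores them. A plausible sketch of removePlaceholderSpeech, assuming
# it is a plain substitution; the real helper lives in utils:
def removePlaceholderSpeech(text, placeholder, replacement):
    # Assumed behaviour: swap the protective placeholder back to its
    # unprotected form
    return text.replace(placeholder, replacement)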
Example #5
def extractDialogAsJson(rootDir, outputFn, builder, title, version):
    sectionNum = 0
    sectionsByFile = []
    for fn, fd in utils.loadFiles(rootDir):
        sections, sectionNum = compileSections(fd, builder, sectionNum)
        sectionsByFile.append({"source": fn, "sections": sections})

    root = {"game_content": title, "version": version, "content": sectionsByFile}

    with io.open(outputFn, "w", encoding="utf-8") as fd:
        json.dump(root, fd, ensure_ascii=False)
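
# Shape of the emitted JSON, read directly off the code above (field values
# are placeholders):
# {
#   "game_content": "<title>",
#   "version": "<version>",
#   "content": [
#     {"source": "<input filename>", "sections": [...]},
#     ...
#   ]
# }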
def clean(root, outputPath):
    """

    Post-cleanup thoughts
        This has a lot going on.  Scene changes are demarcated with < >.
        Keywords, sometimes the current speaker, and perhaps narration are
        decorated with 【 】.
        A blank line represents a break in continuity, or perhaps the passage
        of time.

        All dialog can be extracted by picking the lines with ":" in them.
    """

    if not os.path.exists(outputPath):
        os.mkdir(outputPath)

    for fn, fd in utils.loadFiles(root):
        data = fd.read()

        data = data.lstrip()
        data = data[data.find("\n"):].strip()

        # Place each quoted section onto a single line
        start = 0
        while True:
            start = data.find("「", start)
            if start == -1:
                break
            end = data.find("」", start)
            if end == -1:
                break  # unbalanced quote; leave the remainder untouched

            speech = data[start:end].replace("\n", "")
            data = data[:start] + speech + data[end:]

            start = start + 1

        data = data.replace("\n「", ":「")
        data = data.replace("」:「", "」\n:「")
        data = data.split("TO BE")[0]
        data = data.strip() + "\n"

        data = changeEventMarking(data)
        data = removeCharactersSurroundingSpeaker(data)

        data = utils.removeQuoteMarksFromSpeech(data)
        data = utils.simplifyLongNewlineSequences(data)
        data = utils.addSpaceAfterChoiceMarker(data)
        data = utils.addLeadingWhitespaceWhenSameSpeaker(data, True)

        with io.open(os.path.join(outputPath, fn), "w",
                     encoding="utf-8") as outFd:
            outFd.write(data)
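
# What the quote-joining pass and the subsequent replaces do, on a toy input:
#   "アリス\n「こんにちは\n世界」"
#   after the loop:      "アリス\n「こんにちは世界」"
#   after the replaces:  "アリス:「こんにちは世界」"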
Example #7
def clean(root, outputPath):
    """

    Post-cleanup thoughts
    -- had I parsed the page as a table rather than doing a text dump,
       the process would have been simpler. I had to manually
       pre-process the last page (credits). Beyond that, some pages have
       a table with 2 columns and others have 3 columns.

    if not os.path.exists(outputPath):
        os.mkdir(outputPath)

    for fn, fd in utils.loadFiles(root):
        data = fd.read()

        data = data.lstrip()
        pageTitle, data = data.split("\n", 1)
        dataList = data.split("セリフ&ナレーション")
        if len(dataList) != 2:
            continue
        data = dataList[1]
        data = data.split("Back  Next")[0].strip()
        data = data.replace("\n  ", "  ")

        # Heuristic: pages whose table had 3 columns leave many "\n \n" runs,
        # so step over 3 lines per row instead of 2
        step = 2
        if data.count("\n \n") > 5:
            step = 3

        outputRows = [f": ++{pageTitle}"]
        dataList = data.split("\n")
        for i in range(0, len(dataList) - step + 1, step):
            speaker = dataList[i]
            text = dataList[i + step - 1]
            outputRows.append("%s:「%s」" % (speaker, text))

        data = "\n".join(outputRows) + "\n"

        data = utils.removeQuoteMarksFromSpeech(data)
        data = utils.simplifyLongNewlineSequences(data)
        data = utils.addSpaceAfterChoiceMarker(data)
        data = utils.removeReundantSpeakers(data)
        data = utils.addLeadingWhitespaceWhenSameSpeaker(data, True)

        with io.open(os.path.join(outputPath, fn), "w",
                     encoding="utf-8") as outFd:
            outFd.write(data)
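
# How the row-stride logic above pairs speakers with text on a two-column
# page (step == 2); the rows here are invented for illustration:
#   dataList = ["アリス", "こんにちは", "ボブ", "やあ"]
#   i = 0 -> "アリス:「こんにちは」"
#   i = 2 -> "ボブ:「やあ」"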
def extractDialogWithMecab(rootDir):
    # Requires mecab-python3: import MeCab
    outputDir = utils.getOutputPath(rootDir, "stats")
    # "-Owakati" makes MeCab emit surface forms separated by spaces,
    # so a plain str.split() suffices to tokenize each line
    parser = MeCab.Tagger("-Owakati")
    unigrams = []
    bigrams = []
    trigrams = []
    fourgrams = []
    for fn, fd in utils.loadFiles(rootDir):
        for line in fd:
            wordList = parser.parse(line).split()
            unigrams.extend(getChunks(wordList, 1))
            bigrams.extend(getChunks(wordList, 2))
            trigrams.extend(getChunks(wordList, 3))
            fourgrams.extend(getChunks(wordList, 4))

    _output(outputDir, unigrams, bigrams, trigrams, fourgrams)
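
# Quick illustration of the wakati output format; the exact tokens depend on
# the dictionary MeCab was installed with:
#   MeCab.Tagger("-Owakati").parse("今日は天気がいい").split()
#   -> ['今日', 'は', '天気', 'が', 'いい']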
def extractDialogAsHtml(rootDir,
                        outputDir,
                        builder,
                        tocTitle,
                        getTitleFromFile=None):
    if not os.path.exists(outputDir):
        os.mkdir(outputDir)

    sectionNum = 0
    for fn, fd in utils.loadFiles(rootDir):
        sections, sectionNum = compileSections(fd, builder, sectionNum)
        sectionsText = "\n\n".join(sections)
        html = builder.makeDocument(sectionsText)

        outputFn = os.path.splitext(fn)[0] + ".html"
        with io.open(os.path.join(outputDir, outputFn), "w",
                     encoding="utf-8") as outFd:
            outFd.write(html)

    generateDirectoryListing(outputDir, tocTitle, getTitleFromFile)
Example #10
def clean(sourceFolder, outputFolder):

    if not os.path.exists(outputFolder):
        os.mkdir(outputFolder)

    for fn, fd in utils.loadFiles(sourceFolder):
        section = fd.read()

        section = condenseWhitespace(section)
        section = removeNewlinesInContinuousSpeech(section)
        section = markEvents(section)
        section = separateSpeakersAndSpeech(section)

        section = section.replace(" ", "")
        section = section.strip()

        section = utils.simplifyLongNewlineSequences(section)
        section = utils.addSpaceAfterChoiceMarker(section)
        section = utils.addLeadingWhitespaceWhenSameSpeaker(section, False)

        with io.open(join(outputFolder, fn), "w", encoding="utf-8") as writeFd:
            writeFd.write(section)
def clean(root, outputPath):
    """

    Post-cleanup thoughts
        Seems similar to FF7 but with more whitespace.
        Lots of inner monologue from Squall?
    """
    locationRe = re.compile("\n\n[^\n((『「①②③+]+\n\n")

    if not os.path.exists(outputPath):
        os.mkdir(outputPath)

    for fn, fd in utils.loadFiles(root):
        data = fd.read()

        data = data.strip()

        # For unmarked locations, mark them
        start = 0
        while True:
            match = locationRe.search(data, start)
            if match is None:
                break
            start = match.start() + 2  # pass the 2 leading newlines
            # text = data[match.start():match.end()]
            # if text.strip() == '':
            #     continue

            data = data[:start] + "++" + data[start:]
            start += 2  # pass the 2 inserted characters

        data = data.replace("\n(", ":(")

        # Place quoted sections onto a single line
        start = 0
        while True:
            start = data.find("「", start)
            if start == -1:
                break
            end = data.find("」", start)
            if end == -1:
                break  # unbalanced quote; leave the remainder untouched

            speech = data[start:end].replace("\n", "")
            data = data[:start] + speech + data[end:]

            start = start + 1

        data = data.replace("\n「", ":「")
        data = data.replace("」:「", "」\n:「")
        data = data.replace("\n\n", "\n")

        data = markChoices(data)
        data = correctNumberOfQuotes(data)
        data = utils.removeQuoteMarksFromSpeech(data)
        data = utils.simplifyLongNewlineSequences(data)
        data = utils.addSpaceAfterChoiceMarker(data)
        data = utils.addLeadingWhitespaceWhenSameSpeaker(data, False)

        data = data.split("*****")[0]
        data = data.split("++私的好き台詞++")[0]
        data = data.strip() + "\n"

        data = "++" + data
        data = data.replace("\n", "++\n", 1)

        with io.open(os.path.join(outputPath, fn), "w", encoding="utf-8") as outFd:
            outFd.write(data)
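
# Effect of the location-marking loop above, on a toy fragment: lines matched
# by locationRe (no quotes, brackets, or choice digits between blank lines)
# are treated as unmarked location headings and prefixed with "++":
#   "...\n\nバラムガーデン\n\n..."  ->  "...\n\n++バラムガーデン\n\n..."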