def extractDialogWithSudachi(rootDir):
    # Requires: from sudachipy import dictionary, tokenizer
    outputDir = utils.getOutputPath(rootDir, "stats")
    tokenizer_obj = dictionary.Dictionary().create()
    mode = tokenizer.Tokenizer.SplitMode.C
    unigrams = []
    bigrams = []
    trigrams = []
    fourgrams = []
    # Content words to keep: noun, verb, adverb, adjective,
    # adnominal, and adjectival noun
    POS_LIST = ["名詞", "動詞", "副詞", "形容詞", "連体詞", "形状詞"]
    for fn, fd in utils.loadFiles(rootDir):
        for line in fd:
            line = line.strip()
            wordList = []
            for word in tokenizer_obj.tokenize(line, mode):
                if word.part_of_speech()[0] not in POS_LIST:
                    continue
                wordList.append(
                    (word.dictionary_form(), word.part_of_speech()[0]))
                # Debug output
                print([
                    word.surface(),
                    word.dictionary_form(),
                    word.part_of_speech()[0]
                ])
            unigrams.extend(getChunks(wordList, 1))
            bigrams.extend(getChunks(wordList, 2))
            trigrams.extend(getChunks(wordList, 3))
            fourgrams.extend(getChunks(wordList, 4))
    _output(outputDir, unigrams, bigrams, trigrams, fourgrams)

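# Both n-gram extractors (the Sudachi one above and the MeCab one below)
# call a getChunks helper defined elsewhere in this repo. A minimal sketch
# of what it plausibly does, inferred from how its output is used --
# assuming it returns every contiguous n-gram as a tuple; this is not the
# repo's actual implementation:
def getChunks(wordList, n):
    # e.g. getChunks(["a", "b", "c"], 2) -> [("a", "b"), ("b", "c")]
    return [tuple(wordList[i:i + n]) for i in range(len(wordList) - n + 1)]
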
def collect_tg_execution(self, _dirpath, _outputname, _args):
    # Parameter passing
    _pattern = _args.pattern if _args.pattern is not None else r'(\d+)-TG_ESAIL_SAFE_T(\d)_parallel.log'
    # _pattern = _args.pattern if _args.pattern is not None else r'(\d+)-TG_ESAIL_SAFE_bf(\d)_parallel.log'
    _nPartition = _args.Nums if _args.Nums is not None else 5

    # Listing target files
    files = utils.loadFiles([{'path': _dirpath}], _ptn=_pattern, _sort=True)

    # Output
    output = open(_outputname, "w")
    output.write("JobID,PartitionID,seq,time\n")

    # Progress
    progress = tqdm(desc='Collecting data', total=len(files), unit=' #',
                    postfix=None)
    prev = 0
    cnt = 0
    for item in files:
        # Restart the sequence counter whenever the partition ID changes
        if prev == int(item[1]):
            cnt += 1
        else:
            cnt = 0
            prev = int(item[1])
        try:
            header = '%d, %s' % (item['jobID'], item[1])
            self.collect_logtime(item['path'], output, header, cnt * 10)
        except Exception:
            print('Failed to get information: run%02d' % int(item['Run']))
        progress.update(1)
        progress.set_postfix_str(item['path'])
    progress.close()
    output.close()

def extractDialog(rootDir):
    outputDir = utils.getOutputPath(rootDir, "dialog")
    for fn, fd in utils.loadFiles(rootDir):
        sentenceList = []
        for line in fd:
            # Keep only dialog lines of the form speaker:「speech」
            if ":「" not in line:
                continue
            line = line.split(":「")[1].rstrip().rstrip("」")
            for punctuation in NON_FINAL_PUNCTUATION:
                line = line.replace(punctuation, "")
            # Split the speech into sentences on final punctuation
            tmpSentenceList = [line]
            for punctuation in SENTENCE_FINAL_PUNCTUATION:
                subTmpSentenceList = []
                for sentence in tmpSentenceList:
                    subTmpSentenceList.extend(sentence.split(punctuation))
                tmpSentenceList = [
                    line.strip() for line in subTmpSentenceList if line != ""
                ]
            sentenceList.extend(tmpSentenceList)
        with io.open(os.path.join(outputDir, fn), "w",
                     encoding="utf-8") as fd:
            for line in sentenceList:
                fd.write(line + "\n")

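# Worked example of the punctuation handling in extractDialog. Both
# constants are defined elsewhere in the module; the values below are
# assumed for illustration only:
NON_FINAL_PUNCTUATION = ["、", "…"]            # removed outright
SENTENCE_FINAL_PUNCTUATION = ["。", "!", "?"]  # act as sentence boundaries

line = "やあ、こんにちは。元気?"
for punctuation in NON_FINAL_PUNCTUATION:
    line = line.replace(punctuation, "")
sentences = [line]
for punctuation in SENTENCE_FINAL_PUNCTUATION:
    sentences = [s for part in sentences for s in part.split(punctuation)]
print([s.strip() for s in sentences if s != ""])
# -> ['やあこんにちは', '元気']
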
def clean(root, outputPath):
    """
    A little bit involved but with a bit of planning, it wasn't too
    difficult. Fortunately, three games all used the same formatting,
    so this function yielded a lot of text.
    """
    END_OF_HEADER = "TIMESTAMPS"
    nameRe = re.compile("\n{2,}.*?\n[(「1]")
    if not os.path.exists(outputPath):
        os.mkdir(outputPath)
    for fn, fd in utils.loadFiles(root):
        data = fd.read()
        start = data.find(END_OF_HEADER) + len(END_OF_HEADER) + 1
        data = data[start:].lstrip()

        # Remove lines containing an arrow
        data = removeLinesContaining(data, "→")

        start = nameRe.search(data).start()
        preamble = getPreamble(data[:start])
        data = data[start:]
        data = removeFooter(data)

        # Protect events by making them look like speech
        data = data.replace("\n・", "イベント\n「・")
        data = data.replace("\n\n1.", "\n\n選択\n「1.")

        epilogue = getEpilogue(data)
        outputList = [preamble] + splitDataBySpeechEvent(data)
        outputList.append(epilogue)
        outputData = "\n".join(outputList) + "\n"
        outputData = outputData.replace(" ", "")  # Remove ideographic space
        outputData = setSpeakerBeforeDecisions(outputData)

        # Clean up the earlier protected 'events'
        outputData = removePlaceholderSpeech(outputData, "イベント:「・", "\n:・")
        outputData = removePlaceholderSpeech(outputData, "選択:「1.", "\n:1.")

        outputData = reduceWhitespaceBeforeNumbers(outputData)
        outputData = addColonBeforeDecisions(outputData)
        outputData = replaceSpecialCharacters(outputData)
        outputData = utils.removeQuoteMarksFromSpeech(outputData)
        outputData = utils.simplifyLongNewlineSequences(outputData)
        outputData = utils.addLeadingWhitespaceWhenSameSpeaker(
            outputData, True)
        outputData = utils.addSpaceAfterChoiceMarker(outputData)
        with io.open(os.path.join(outputPath, fn), "w",
                     encoding="utf-8") as fd:
            fd.write(outputData)

def extractDialogAsJson(rootDir, outputFn, builder, title, version):
    sectionNum = 0
    sectionsByFile = []
    for fn, fd in utils.loadFiles(rootDir):
        sections, sectionNum = compileSections(fd, builder, sectionNum)
        sectionsByFile.append({"source": fn, "sections": sections})
    root = {
        "game_content": title,
        "version": version,
        "content": sectionsByFile
    }
    with io.open(outputFn, "w", encoding="utf-8") as fd:
        json.dump(root, fd, ensure_ascii=False)

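# For reference, extractDialogAsJson emits a document shaped like the
# example below. The literal values here are made up; the section
# contents depend entirely on the builder passed in.
exampleRoot = {
    "game_content": "Some Game",   # hypothetical title
    "version": "1.0",              # hypothetical version string
    "content": [
        {"source": "chapter01.txt", "sections": ["..."]},
    ],
}
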
def clean(root, outputPath):
    """
    Post-clean up thoughts

    This has a lot going on.
    Changes in the scene are demarcated with < >.
    Keywords, sometimes the current speaker, and maybe narrations are
    decorated with 【 】.
    A blank line represents a break in continuity or passage of time maybe?

    All dialog can be extracted by picking lines with ":" in them.
    """
    if not os.path.exists(outputPath):
        os.mkdir(outputPath)
    for fn, fd in utils.loadFiles(root):
        data = fd.read()
        data = data.lstrip()
        data = data[data.find("\n"):].strip()

        # Place quoted sections onto a single line
        start = 0
        while True:
            start = data.find("「", start)
            if start == -1:
                break
            end = data.find("」", start)
            speech = data[start:end].replace("\n", "")
            data = data[:start] + speech + data[end:]
            start = start + 1

        data = data.replace("\n「", ":「")
        data = data.replace("」:「", "」\n:「")
        data = data.split("TO BE")[0]
        data = data.strip() + "\n"
        data = changeEventMarking(data)
        data = removeCharactersSurroundingSpeaker(data)
        data = utils.removeQuoteMarksFromSpeech(data)
        data = utils.simplifyLongNewlineSequences(data)
        data = utils.addSpaceAfterChoiceMarker(data)
        data = utils.addLeadingWhitespaceWhenSameSpeaker(data, True)
        with io.open(os.path.join(outputPath, fn), "w",
                     encoding="utf-8") as fd:
            fd.write(data)

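# The while-loop above joins multi-line 「...」 speech onto a single line;
# the same inlined loop reappears in the FF8-style cleaner further down.
# Factored into a helper it would look roughly like this -- a sketch, not
# code from this repo:
def joinQuotedSpeech(data):
    # Assumes quote marks are balanced, as in the source dumps
    start = 0
    while True:
        start = data.find("「", start)
        if start == -1:
            break
        end = data.find("」", start)
        # Drop newlines between the quote marks
        data = data[:start] + data[start:end].replace("\n", "") + data[end:]
        start += 1
    return data
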
def clean(root, outputPath):
    """
    Post-clean up thoughts -- had I parsed the page as a table, rather
    than doing a text dump, the process would have been simpler.

    I had to manually pre-process the last page (credits). Otherwise,
    some pages have a table with 2 columns and others have 3 columns.
    """
    if not os.path.exists(outputPath):
        os.mkdir(outputPath)
    for fn, fd in utils.loadFiles(root):
        data = fd.read()
        data = data.lstrip()
        pageTitle, data = data.split("\n", 1)
        dataList = data.split("セリフ&ナレーション")
        if len(dataList) != 2:
            continue
        data = dataList[1]
        data = data.split("Back Next")[0].strip()
        data = data.replace("\n ", " ")

        # Guess the table width: many single-space lines imply a third
        # (middle) column
        step = 2
        if data.count("\n \n") > 5:
            step = 3

        outputRows = [f": ++{pageTitle}"]
        dataList = data.split("\n")
        for i in range(0, len(dataList) - step + 1, step):
            speaker = dataList[i]
            text = dataList[i + step - 1]
            outputRows.append("%s:「%s」" % (speaker, text))
        data = "\n".join(outputRows) + "\n"

        data = utils.removeQuoteMarksFromSpeech(data)
        data = utils.simplifyLongNewlineSequences(data)
        data = utils.addSpaceAfterChoiceMarker(data)
        data = utils.removeReundantSpeakers(data)
        data = utils.addLeadingWhitespaceWhenSameSpeaker(data, True)
        with io.open(os.path.join(outputPath, fn), "w",
                     encoding="utf-8") as fd:
            fd.write(data)

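# Toy illustration of the stride-based pairing above, using made-up rows
# from a flattened 2-column page (step == 2): even lines are speakers,
# odd lines are their text.
dataList = ["クラウド", "……興味ないね", "ティファ", "そんな言い方しなくても"]
step = 2
for i in range(0, len(dataList) - step + 1, step):
    print("%s:「%s」" % (dataList[i], dataList[i + step - 1]))
# -> クラウド:「……興味ないね」
# -> ティファ:「そんな言い方しなくても」
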
def extractDialogWithMecab(rootDir):
    outputDir = utils.getOutputPath(rootDir, "stats")
    parser = MeCab.Tagger("-Owakati")
    unigrams = []
    bigrams = []
    trigrams = []
    fourgrams = []
    for fn, fd in utils.loadFiles(rootDir):
        for line in fd:
            wordList = parser.parse(line).split()
            unigrams.extend(getChunks(wordList, 1))
            bigrams.extend(getChunks(wordList, 2))
            trigrams.extend(getChunks(wordList, 3))
            fourgrams.extend(getChunks(wordList, 4))
    _output(outputDir, unigrams, bigrams, trigrams, fourgrams)

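# Quick sanity check of the -Owakati mode used above: parse() returns the
# sentence as space-separated surface forms. Exact segmentation depends on
# the installed dictionary, so the sample output is indicative only.
import MeCab

parser = MeCab.Tagger("-Owakati")
print(parser.parse("今日はいい天気ですね").split())
# e.g. ['今日', 'は', 'いい', '天気', 'です', 'ね']
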
def extractDialogAsHtml(rootDir, outputDir, builder, tocTitle,
                        getTitleFromFile=None):
    if not os.path.exists(outputDir):
        os.mkdir(outputDir)
    sectionNum = 0
    for fn, fd in utils.loadFiles(rootDir):
        sections, sectionNum = compileSections(fd, builder, sectionNum)
        sectionsText = "\n\n".join(sections)
        html = builder.makeDocument(sectionsText)
        outputFn = os.path.splitext(fn)[0] + ".html"
        with io.open(os.path.join(outputDir, outputFn), "w",
                     encoding="utf-8") as fd:
            fd.write(html)
    generateDirectoryListing(outputDir, tocTitle, getTitleFromFile)

def clean(sourceFolder, outputFolder):
    if not os.path.exists(outputFolder):
        os.mkdir(outputFolder)
    for fn, fd in utils.loadFiles(sourceFolder):
        section = fd.read()
        section = condenseWhitespace(section)
        section = removeNewlinesInContinuousSpeech(section)
        section = markEvents(section)
        section = separateSpeakersAndSpeech(section)
        section = section.replace(" ", "")  # Remove ideographic space
        section = section.strip()
        section = utils.simplifyLongNewlineSequences(section)
        section = utils.addSpaceAfterChoiceMarker(section)
        section = utils.addLeadingWhitespaceWhenSameSpeaker(section, False)
        with io.open(join(outputFolder, fn), "w",
                     encoding="utf-8") as writeFd:
            writeFd.write(section)

def clean(root, outputPath):
    """
    Post-clean up thoughts

    Seems similar to FF7 but with more whitespace. Lots of inner
    monologue from Squall?
    """
    locationRe = re.compile("\n\n[^\n((『「①②③+]+\n\n")
    if not os.path.exists(outputPath):
        os.mkdir(outputPath)
    for fn, fd in utils.loadFiles(root):
        data = fd.read()
        data = data.strip()

        # For unmarked locations, mark them
        start = 0
        while True:
            match = locationRe.search(data, start)
            if match is None:
                break
            start = match.start() + 2  # pass the 2 leading newlines
            # text = data[match.start():match.end()]
            # if text.strip() == '':
            #     continue
            data = data[:start] + "++" + data[start:]
            start += 2  # pass the 2 inserted characters

        data = data.replace("\n(", ":(")

        # Place quoted sections onto a single line
        start = 0
        while True:
            start = data.find("「", start)
            if start == -1:
                break
            end = data.find("」", start)
            speech = data[start:end].replace("\n", "")
            data = data[:start] + speech + data[end:]
            start = start + 1

        data = data.replace("\n「", ":「")
        data = data.replace("」:「", "」\n:「")
        data = data.replace("\n\n", "\n")
        data = markChoices(data)
        data = correctNumberOfQuotes(data)
        data = utils.removeQuoteMarksFromSpeech(data)
        data = utils.simplifyLongNewlineSequences(data)
        data = utils.addSpaceAfterChoiceMarker(data)
        data = utils.addLeadingWhitespaceWhenSameSpeaker(data, False)
        data = data.split("*****")[0]
        data = data.split("++私的好き台詞++")[0]
        data = data.strip() + "\n"
        data = "++" + data
        data = data.replace("\n", "++\n", 1)
        with io.open(os.path.join(outputPath, fn), "w",
                     encoding="utf-8") as fd:
            fd.write(data)