def main(_):
    setup_seed()
    setup_logging(FLAGS.verbose)
    tagger: fugashi.Tagger = fugashi.Tagger("-Owakati")
    config: TfRecordConfig = TfRecordConfig(
        train_path=FLAGS.train_path,
        eval_path=FLAGS.eval_path,
        example_fields=[
            TfRecordField(
                name="keyword",
                proerty_name="keyword.search_word",  # sic: keyword spelled as defined by TfRecordField
                analyzer=lambda s: japanese_tokenizer(tagger, s),
                featurizer=get_str_feature,
                dictionary=FLAGS.keyword_path,
            )
        ],
        context_fields=[
            TfRecordField(
                name="title",
                proerty_name="keyword.digest",
                analyzer=lambda s: japanese_tokenizer(tagger, s),
                featurizer=get_str_feature,
                dictionary=FLAGS.title_path,
            )
        ],
        to_ndjson=FLAGS.to_ndjson,
    )
    reader: SearchLogReader = SearchLogReader(FLAGS.searchlog_path)
    reader.to_tfrecords(config)
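# The japanese_tokenizer helper referenced by the analyzers above is not shown
# in this snippet. A minimal sketch of what it might look like, assuming it
# simply wraps the wakati-mode tagger (hypothetical; the real helper may
# normalize further):
from typing import List

import fugashi

def japanese_tokenizer(tagger: fugashi.Tagger, text: str) -> List[str]:
    # "-Owakati" makes MeCab emit whitespace-separated surface forms,
    # so a plain split recovers the token list.
    return tagger.parse(text).split()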
def extractVocab(filePath):
    tagger = fugashi.Tagger()
    with open(filePath, "r", encoding="utf-8") as f:
        text = f.read()
    # Lemmatize; fall back to the surface form when UniDic has no lemma.
    lemmas = [word.feature.lemma or word.surface for word in tagger(text)]
    # Counter keys are already unique, so sorting by count yields a
    # deduplicated, frequency-ordered vocabulary.
    res = Counter(lemmas)
    output = [lemma for lemma, _ in res.most_common()]
    # Places the output into a String variable
    outputText = "\n".join(str(value) for value in output)
    outputText += "\nWords: " + str(len(output))
    return outputText
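# Example usage (hypothetical path): print the frequency-sorted lemma list
# for a UTF-8 text file.
if __name__ == "__main__":
    print(extractVocab("corpus/sample.txt"))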
def __init__(self):
    if JapaneseTagger.__instance__ is None:
        JapaneseTagger.__instance__ = self
        # MeCab expects the dictionary directory via the -d flag; passing the
        # bare path as the argument string would fail.
        self.tagger = fugashi.Tagger(f'-d "{unidic.DICDIR}"')
        log.debug('%s: Tagger initialised.', PLUGIN_NAME)
        _kks = pykakasi.kakasi()
        for mode in ['H', 'K', 'J']:
            _kks.setMode(mode, 'a')
        self.conv = _kks.getConverter()
    else:
        raise Exception(
            'Tagger object cannot be initialised more than once.')
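# A minimal sketch of how the singleton above is typically consumed, assuming
# the legacy pykakasi converter API (setMode/getConverter is deprecated in
# pykakasi 2.x in favour of kakasi.convert):
tagger = JapaneseTagger()
tokens = [w.surface for w in tagger.tagger("日本語のテキスト")]
romaji = tagger.conv.do("日本語")  # hiragana/katakana/kanji -> romaji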
def print_matching_sub_lines(subsdir, selection, value, sublogger=None):
    tagger = fugashi.Tagger()
    if sublogger is None:
        sublogger = open("sublines.log", "w", encoding="utf-8")
    if "srt" in selection.lower():
        with open(f"{subsdir}/{selection}", "r", encoding="utf-8") as file:
            subtitle = file.read()
        sub_gen = srt.parse(subtitle)
        subs = list(sub_gen)
        for sen in subs:
            sentence_tokens = [
                word.feature.lemma if word.feature.lemma else word.surface
                for word in tagger(kana.markup_book_html(sen.content))
            ]
            sentence_tokens = [
                kana.clean_lemma(token) for token in sentence_tokens
                if not kana.is_single_kana(token)
            ]
            if value in sentence_tokens:
                print_console_and_file(f"{sen.start} {sen.end}", sublogger)
                print_console_and_file(sen.content, sublogger)
    else:
        subsrtfiles = [
            f"{selection}/{f.name}"
            for f in os.scandir(f"{subsdir}/{selection}")
            if f.is_file() and f.name[0] != "$"
        ]
        print(subsrtfiles)
        for srtfile in subsrtfiles:
            print_console_and_file(srtfile, sublogger)
            print_matching_sub_lines(subsdir, selection=srtfile, value=value,
                                     sublogger=sublogger)
    return sublogger
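# Example invocation (hypothetical paths): log every subtitle line whose
# lemmatized tokens contain 勉強, then close the shared log file the function
# returns.
log = print_matching_sub_lines("subs", "episode01.srt", "勉強")
log.close()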
def report_function(booklist):
    from collections import Counter

    import pandas as pd

    OLD_LIB = False
    tagger = fugashi.Tagger()
    reportdf = pd.DataFrame()
    reportdir = f"{os.path.dirname(booklist[0])}/$_report"
    reportname = "$report.csv"
    if not os.path.isdir(reportdir):
        os.mkdir(reportdir)
    if os.path.isfile(f"{reportdir}/{reportname}"):
        lib_df = pd.read_csv(f"{reportdir}/{reportname}", index_col=0)
        OLD_LIB = True
    for novel in tqdm(booklist, ascii=True, desc="Creating Report"):
        reportfile = f"{reportdir}/{os.path.basename(novel)}.zip"
        if OLD_LIB:
            if lib_df["Name"].isin([os.path.basename(novel)]).any():
                continue
        if os.path.isfile(reportfile):
            # A report archive already exists: reuse its cached token counts.
            with zipfile.ZipFile(reportfile) as myzip:
                with myzip.open(f"{os.path.basename(novel)}.txt") as file:
                    rtxt = file.read().decode("utf-8").splitlines()
            rdict = get_rdict(rtxt)
            all_kanji = kana.remove_non_kanji(kana.getrdictstring(rdict))
            uniq_kanji = set(all_kanji)
            kanji_counter = Counter(all_kanji)
            # Kanji that appear at least 2, 5, and 10 times.
            n2plus = sum(k >= 2 for k in kanji_counter.values())
            n5plus = sum(k >= 5 for k in kanji_counter.values())
            n10plus = sum(k >= 10 for k in kanji_counter.values())
            add_data = [{
                "Name": os.path.basename(novel),
                "Number Tokens": sum(rdict.values()),
                "Total Words": len(rdict),
                "Total Kanji": len(uniq_kanji),
                "Kanji 10+": n10plus,
                "Kanji 5+": n5plus,
                "Kanji 2+": n2plus,
            }]
            # DataFrame.append was removed in pandas 2.0; pd.concat is the
            # supported equivalent.
            reportdf = pd.concat([reportdf, pd.DataFrame(add_data)],
                                 ignore_index=True)
        else:
            # No cached report: tokenize the book from its HTML source.
            with open(f"{novel}/{os.path.basename(novel)}.html", "r",
                      encoding="utf-8") as file:
                raw_book = file.read()
            cleaned_book = kana.markup_book_html(raw_book)
            cleaned_book = kana.reduce_new_lines(cleaned_book)
            sentences = cleaned_book.split("\n")
            token_words = []
            token_extend = token_words.extend
            for sen in sentences:
                sentence_tokens = [
                    word.feature.lemma if word.feature.lemma else word.surface
                    for word in tagger(sen)
                ]
                sentence_tokens = [
                    kana.clean_lemma(token) for token in sentence_tokens
                    if not kana.is_single_kana(token)
                ]
                sentence_tokens = kana.get_unique_token_words(sentence_tokens)
                token_extend(sentence_tokens)
            token_counter = Counter(token_words)
            all_kanji = kana.remove_non_kanji("".join(token_words))
            token_words = set(token_words)
            uniq_kanji = set(all_kanji)
            kanji_counter = Counter(all_kanji)
            # Kanji that appear at least 2, 5, and 10 times.
            n2plus = sum(k >= 2 for k in kanji_counter.values())
            n5plus = sum(k >= 5 for k in kanji_counter.values())
            n10plus = sum(k >= 10 for k in kanji_counter.values())
            add_data = [{
                "Name": os.path.basename(novel),
                "Number Tokens": sum(token_counter.values()),
                "Total Words": len(token_words),
                "Total Kanji": len(uniq_kanji),
                "Kanji 10+": n10plus,
                "Kanji 5+": n5plus,
                "Kanji 2+": n2plus,
            }]
            reportdf = pd.concat([reportdf, pd.DataFrame(add_data)],
                                 ignore_index=True)
            counterstr = ""
            for k, v in token_counter.most_common():
                counterstr += f"{k}, {v}\n"
            with open(f"{reportdir}/{os.path.basename(novel)}.txt", "w",
                      encoding="utf-8") as wr:
                wr.write(counterstr)
            with zipfile.ZipFile(f"{reportdir}/{os.path.basename(novel)}.zip",
                                 "w", zipfile.ZIP_LZMA) as myzip:
                myzip.write(
                    f"{reportdir}/{os.path.basename(novel)}.txt",
                    f"{os.path.basename(novel)}.txt",
                )
            if os.path.exists(f"{reportdir}/{os.path.basename(novel)}.txt"):
                os.remove(f"{reportdir}/{os.path.basename(novel)}.txt")
    if OLD_LIB:
        lib_df = pd.concat([lib_df, reportdf], ignore_index=True)
        lib_df.to_csv(f"{reportdir}/{reportname}", index_label="Index")
    else:
        reportdf.to_csv(f"{reportdir}/{reportname}", index_label="Index")
def srt_processing(subtitledir, reportdir=None):
    import pandas as pd
    import srt

    OLD_LIB = False
    tagger = fugashi.Tagger()
    reportdf = pd.DataFrame()
    if reportdir is None:
        reportdir = f"{subtitledir}/$_report"
    reportname = "$report.csv"
    if not os.path.isdir(reportdir):
        os.mkdir(reportdir)
    if os.path.isfile(f"{reportdir}/{reportname}"):
        lib_df = pd.read_csv(f"{reportdir}/{reportname}", index_col=0)
        OLD_LIB = True
    srtdirs = [
        f"{subtitledir}/{f.name}" for f in os.scandir(subtitledir)
        if f.is_dir() and f.name[0] != "$"
    ]
    srtfiles = [
        f"{subtitledir}/{f.name}" for f in os.scandir(subtitledir)
        if f.is_file() and f.name[0] != "$"
    ]
    for subdir in tqdm(srtdirs, ascii=True, desc="Creating Directory Report"):
        reportfile = f"{reportdir}/{os.path.basename(subdir)}.zip"
        if OLD_LIB:
            if lib_df["Name"].isin([os.path.basename(subdir)]).any():
                continue
        subsrtfiles = [
            f"{subdir}/{f.name}" for f in os.scandir(subdir)
            if f.is_file() and f.name[0] != "$"
        ]
        if os.path.isfile(reportfile):
            # A report archive already exists: reuse its cached token counts.
            with zipfile.ZipFile(reportfile) as myzip:
                with myzip.open(f"{os.path.basename(subdir)}.txt") as file:
                    rtxt = file.read().decode("utf-8").splitlines()
            rdict = get_rdict(rtxt)
            all_kanji = kana.remove_non_kanji(kana.getrdictstring(rdict))
            uniq_kanji = set(all_kanji)
            kanji_counter = Counter(all_kanji)
            # Kanji that appear at least 2, 5, and 10 times.
            n2plus = sum(k >= 2 for k in kanji_counter.values())
            n5plus = sum(k >= 5 for k in kanji_counter.values())
            n10plus = sum(k >= 10 for k in kanji_counter.values())
            add_data = [{
                "Name": os.path.basename(subdir),
                "Number Tokens": sum(rdict.values()),
                "Total Words": len(rdict),
                "Total Kanji": len(uniq_kanji),
                "Kanji 10+": n10plus,
                "Kanji 5+": n5plus,
                "Kanji 2+": n2plus,
            }]
            # DataFrame.append was removed in pandas 2.0; pd.concat is the
            # supported equivalent.
            reportdf = pd.concat([reportdf, pd.DataFrame(add_data)],
                                 ignore_index=True)
        else:
            # Concatenate all subtitle files of the series, then tokenize.
            concatsubs = ""
            for subf in subsrtfiles:
                with open(f"{subf}", "r", encoding="utf-8") as file:
                    concatsubs += file.read()
            subs = list(srt.parse(concatsubs))
            token_words = []
            token_extend = token_words.extend
            for sen in subs:
                sentence_tokens = [
                    word.feature.lemma if word.feature.lemma else word.surface
                    for word in tagger(
                        kana.markup_book_html(remove_names(sen.content)))
                ]
                sentence_tokens = [
                    kana.clean_lemma(token) for token in sentence_tokens
                    if not kana.is_single_kana(token)
                ]
                sentence_tokens = kana.get_unique_token_words(sentence_tokens)
                token_extend(sentence_tokens)
            token_counter = Counter(token_words)
            all_kanji = kana.remove_non_kanji("".join(token_words))
            uni_token_words = set(token_words)
            uniq_kanji = set(all_kanji)
            kanji_counter = Counter(all_kanji)
            # Kanji that appear at least 2, 5, and 10 times.
            n2plus = sum(k >= 2 for k in kanji_counter.values())
            n5plus = sum(k >= 5 for k in kanji_counter.values())
            n10plus = sum(k >= 10 for k in kanji_counter.values())
            add_data = [{
                "Name": os.path.basename(subdir),
                "Number Tokens": sum(token_counter.values()),
                "Total Words": len(uni_token_words),
                "Total Kanji": len(uniq_kanji),
                "Kanji 10+": n10plus,
                "Kanji 5+": n5plus,
                "Kanji 2+": n2plus,
            }]
            reportdf = pd.concat([reportdf, pd.DataFrame(add_data)],
                                 ignore_index=True)
            counterstr = ""
            for k, v in token_counter.most_common():
                counterstr += f"{k}, {v}\n"
            with open(f"{reportdir}/{os.path.basename(subdir)}.txt", "w",
                      encoding="utf-8") as wr:
                wr.write(counterstr)
            with zipfile.ZipFile(f"{reportdir}/{os.path.basename(subdir)}.zip",
                                 "w", zipfile.ZIP_LZMA) as myzip:
                myzip.write(
                    f"{reportdir}/{os.path.basename(subdir)}.txt",
                    f"{os.path.basename(subdir)}.txt",
                )
            if os.path.exists(f"{reportdir}/{os.path.basename(subdir)}.txt"):
                os.remove(f"{reportdir}/{os.path.basename(subdir)}.txt")
    for subf in tqdm(srtfiles, ascii=True, desc="Creating File Report"):
        reportfile = f"{reportdir}/{os.path.basename(subf)}.zip"
        if OLD_LIB:
            if lib_df["Name"].isin([os.path.basename(subf)]).any():
                continue
        if os.path.isfile(reportfile):
            with zipfile.ZipFile(reportfile) as myzip:
                with myzip.open(f"{os.path.basename(subf)}.txt") as file:
                    rtxt = file.read().decode("utf-8").splitlines()
            rdict = get_rdict(rtxt)
            all_kanji = kana.remove_non_kanji(kana.getrdictstring(rdict))
            uniq_kanji = set(all_kanji)
            kanji_counter = Counter(all_kanji)
            n2plus = sum(k >= 2 for k in kanji_counter.values())
            n5plus = sum(k >= 5 for k in kanji_counter.values())
            n10plus = sum(k >= 10 for k in kanji_counter.values())
            add_data = [{
                "Name": os.path.basename(subf),
                "Number Tokens": sum(rdict.values()),
                "Total Words": len(rdict),
                "Total Kanji": len(uniq_kanji),
                "Kanji 10+": n10plus,
                "Kanji 5+": n5plus,
                "Kanji 2+": n2plus,
            }]
            reportdf = pd.concat([reportdf, pd.DataFrame(add_data)],
                                 ignore_index=True)
        else:
            with open(f"{subf}", "r", encoding="utf-8") as file:
                subtitle = file.read()
            subs = list(srt.parse(subtitle))
            token_words = []
            token_extend = token_words.extend
            for sen in subs:
                sentence_tokens = [
                    word.feature.lemma if word.feature.lemma else word.surface
                    for word in tagger(kana.markup_book_html(sen.content))
                ]
                sentence_tokens = [
                    kana.clean_lemma(token) for token in sentence_tokens
                    if not kana.is_single_kana(token)
                ]
                sentence_tokens = kana.get_unique_token_words(sentence_tokens)
                token_extend(sentence_tokens)
            token_counter = Counter(token_words)
            all_kanji = kana.remove_non_kanji("".join(token_words))
            uni_token_words = set(token_words)
            uniq_kanji = set(all_kanji)
            kanji_counter = Counter(all_kanji)
            n2plus = sum(k >= 2 for k in kanji_counter.values())
            n5plus = sum(k >= 5 for k in kanji_counter.values())
            n10plus = sum(k >= 10 for k in kanji_counter.values())
            add_data = [{
                "Name": os.path.basename(subf),
                "Number Tokens": sum(token_counter.values()),
                "Total Words": len(uni_token_words),
                "Total Kanji": len(uniq_kanji),
                "Kanji 10+": n10plus,
                "Kanji 5+": n5plus,
                "Kanji 2+": n2plus,
            }]
            reportdf = pd.concat([reportdf, pd.DataFrame(add_data)],
                                 ignore_index=True)
            counterstr = ""
            for k, v in token_counter.most_common():
                counterstr += f"{k}, {v}\n"
            with open(f"{reportdir}/{os.path.basename(subf)}.txt", "w",
                      encoding="utf-8") as wr:
                wr.write(counterstr)
            with zipfile.ZipFile(f"{reportdir}/{os.path.basename(subf)}.zip",
                                 "w", zipfile.ZIP_LZMA) as myzip:
                myzip.write(
                    f"{reportdir}/{os.path.basename(subf)}.txt",
                    f"{os.path.basename(subf)}.txt",
                )
            if os.path.exists(f"{reportdir}/{os.path.basename(subf)}.txt"):
                os.remove(f"{reportdir}/{os.path.basename(subf)}.txt")
    if OLD_LIB:
        lib_df = pd.concat([lib_df, reportdf], ignore_index=True)
        lib_df.to_csv(f"{reportdir}/{reportname}", index_label="Index")
    else:
        reportdf.to_csv(f"{reportdir}/{reportname}", index_label="Index")
def main(no_kanjigrid, user):
    try:
        import fugashi

        tagger = fugashi.Tagger()
        EXTRA = True
    except ModuleNotFoundError:
        EXTRA = False
    __loc__ = os.path.dirname(os.path.realpath(__file__))
    DISREGARD_OLD_KNOWN = False
    ADD_NX_SUP = False
    CREATE_KANJIGRID = not no_kanjigrid
    COUNT_NEW_LEECHES = True
    write_to_file_text = ""
    col = Collection(user=user)
    notes = col.cards.merge_notes()
    path = __loc__ + "\\resources"
    kw_path = path + "\\.known_words.txt"
    if os.path.isfile(path + "\\.previous.txt"):
        with open(path + "\\.previous.txt", "r", encoding="utf-8") as file:
            print("Previous known words:")
            print(file.read())
            print("_" * 50 + "\n" * 2)
    print("Current known words:")
    with open(path + "\\anki_cards.txt", "r", encoding="utf-8") as file:
        card_list = file.read().splitlines()
    words = []
    for cards in card_list:
        card, field = cards.split(":")
        field = int(field)
        selection = notes.query(
            f"nmodel == '{card}' and cqueue == 'due' "
            f"or nmodel == '{card}' and cqueue == 'suspended'"
        )
        sellist = selection["nflds"].tolist()
        if COUNT_NEW_LEECHES:
            mask = notes.ntags.apply(lambda x: "leech" in x)
            leech_sel = notes[mask]
            sel = leech_sel.query(f"nmodel == '{card}' and cqueue == 'new'")
            sellist.extend(sel["nflds"].tolist())
        print(f"card model {card} found:")
        write_to_file_text = write_to_file_text + f"card model {card} found:" + "\n"
        print(len(sellist))
        write_to_file_text = write_to_file_text + str(len(sellist)) + "\n"
        for w in sellist:
            if not kana.is_single_kana(w[field - 1]):
                words.append(w[field - 1])
    uniq_w = set(words)
    # Lemmatizing every known word arguably gives a better representation of
    # what is actually known, but doing it here (rather than right before any
    # other processing) inflates the numbers; still unsure about this (21.01).
    if EXTRA:
        extra = set()
        for w in uniq_w:
            w = kana.markup_book_html(w)
            tags = [
                word.feature.lemma if word.feature.lemma else word.surface
                for word in tagger(w)
            ]
            tags = [
                kana.clean_lemma(token) for token in tags
                if not kana.is_single_kana(token)
            ]
            tags = kana.get_unique_token_words(tags)
            extra.update(tags)
        uniq_w.update(extra)
    if not DISREGARD_OLD_KNOWN:
        if os.path.isfile(kw_path):
            with open(kw_path, "r", encoding="utf-8") as file:
                previous_known = file.read().splitlines()
            previous_known = [
                word for word in previous_known
                if not kana.is_single_kana(word) and word
            ]
            uniq_w.update(previous_known)
    if ADD_NX_SUP:
        nx_sup = []
        for i in range(1, 6):
            if os.path.isfile("n" + str(i) + ".txt"):
                with open("n" + str(i) + ".txt", "r", encoding="utf-8") as file:
                    nx_sup.extend(list(file.read().split("\n")))
        uniq_w.update(nx_sup)
    muniq = {w for w in kana.markup_known_words("\n".join(uniq_w)) if w != ""}
    muniq = sorted(muniq)
    uniqK = kana.get_unique_kanji(muniq)
    print(f"found a total of {len(muniq)} words")
    print(f"with a total of {len(uniqK)} unique kanji")
    write_to_file_text = (
        write_to_file_text + f"found a total of {len(muniq)} words" + "\n"
    )
    write_to_file_text = (
        write_to_file_text + f"with a total of {len(uniqK)} unique kanji" + "\n"
    )
    with open(kw_path, "w", encoding="utf-8") as wr:
        wr.write("\n".join(muniq))
    with open(path + "\\.previous.txt", "w", encoding="utf-8") as wr:
        wr.write(write_to_file_text)
    add_data = [
        {
            "Date": current_date,
            "Time": current_time,
            "Words": len(muniq),
            "Kanji": len(uniqK),
        }
    ]
    if os.path.isfile(path + "\\.progress.csv"):
        prog_df = pd.read_csv(path + "\\.progress.csv", index_col=0)
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported equivalent.
        prog_df = pd.concat([prog_df, pd.DataFrame(add_data)],
                            ignore_index=True)
        prog_df.to_csv(path + "\\.progress.csv", index_label="Index")
    else:
        prog_df = pd.DataFrame(add_data)
        prog_df.to_csv(path + "\\.progress.csv", index_label="Index")
    if CREATE_KANJIGRID:
        kj.main()
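# Example invocation, assuming an Anki profile named "User 1" (the Collection
# object appears to resolve the profile by that name):
main(no_kanjigrid=False, user="User 1")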
def __init__(self):
    self.specialchars = [
        "«", "»", "—", "‘", "’", "“", "”", "„",
    ]
    self.subs = [
        "■'s", "■'ll", "■'t", "■'cause", "■'d", "■'em", "■'ve", "■'dn",
        "■'m", "■'n", "■'re", "■'til", "■'tween", "■'all", "ol'■",
    ]
    # Matches digit runs with comma/period separators, e.g. "12,345.67"
    # (the escapes on , and . were redundant inside a character class).
    self.re_num = re.compile(r'[\d,.]+')
    self.tagger = fugashi.Tagger()
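# A minimal usage sketch, assuming the instance's compiled pattern is used to
# strip number runs before tagging (the class name TextCleaner is
# hypothetical — the snippet above does not show the class definition):
cleaner = TextCleaner()
text = cleaner.re_num.sub("", "2,000円のランチ")
tokens = [w.surface for w in cleaner.tagger(text)]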
def main(_):
    setup_seed()
    setup_logging(FLAGS.verbose)
    tagger: fugashi.Tagger = fugashi.Tagger("-Owakati")
    config: TfRecordConfig = TfRecordConfig(
        example_fields=[
            TfRecordField(
                name="keyword",
                proerty_name="keyword.keyword",  # sic: keyword spelled as defined by TfRecordField
                analyzer=lambda s: japanese_tokenizer(tagger, s),
                featurizer=get_str_feature,
                dictionary=FLAGS.keyword_path,
            )
        ],
        context_fields=[
            TfRecordField(
                name="title",
                proerty_name="keyword.title",
                analyzer=lambda s: japanese_tokenizer(tagger, s),
                featurizer=get_str_feature,
                dictionary=FLAGS.title_path,
            )
        ],
    )

    def convert_func(data: Dict[str, Any]) -> Dict[str, Any]:
        results: Dict[str, Any] = {}
        if "contexts" in data:
            for i, doc in enumerate(data["contexts"]):
                # Default to 0 so documents without a relevance judgement are
                # treated as unclicked instead of raising on None > 0.
                relevance: int = doc.get("relevance", 0)
                results[f"doc_{i+1}"] = {
                    "id": f"{i+1}",
                    "keyword": {
                        "title": doc.get("title"),
                    },
                    "boolean": {
                        "clicked": relevance > 0,
                    },
                    "integer": {
                        "relevance": relevance,
                    },
                }
        return {
            "request": {
                "id": {
                    "query": data.get("_id"),
                },
                "conditions": {
                    "keyword": {
                        "keyword": data.get("keyword"),
                    },
                },
            },
            "response": {
                "results": results,
            },
            "@timestamp": "now",
        }

    def dump_func(example: Dict[str, Any], context: Dict[str, Any],
                  score: float) -> Dict[str, Any]:
        return {
            "keyword": example.get("keyword"),
            "title": context.get("title"),
            "score": score,
            "relevance": context.get("relevance"),
        }

    save_predictions(
        config,
        FLAGS.model_path,
        FLAGS.ndjson_path,
        FLAGS.output_path,
        convert_func,
        dump_func,
    )
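# A hypothetical NDJSON record in the shape convert_func expects, inferred
# from the field accesses above (one query with two judged documents):
sample = {
    "_id": "q-001",
    "keyword": "東京 ホテル",
    "contexts": [
        {"title": "東京のホテル10選", "relevance": 2},
        {"title": "大阪観光ガイド", "relevance": 0},
    ],
}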