def print_matching_sub_lines(subsdir, selection, value, sublogger=None):
    """Search subtitle files for a token and print/log every matching cue.

    If *selection* is an ``.srt`` file, each cue whose lemmatized content
    contains *value* is printed (and logged) together with its start/end
    timestamps.  If *selection* is a directory, recurse into every
    non-internal file it contains, sharing one log file.

    Parameters
    ----------
    subsdir : str
        Root directory holding the subtitle files/folders.
    selection : str
        File or directory name relative to *subsdir*.
    value : str
        Lemma/token to search for.
    sublogger : file object, optional
        Open log file; created as ``sublines.log`` on the first call and
        passed down to recursive calls.

    Returns
    -------
    file object
        The log file handle, so the caller can close it.
    """
    tagger = fugashi.Tagger()
    if sublogger is None:
        # First (non-recursive) call: open the shared log file.  It is
        # returned to the caller, who is responsible for closing it.
        sublogger = open("sublines.log", "w", encoding="utf-8")
    # Was `"srt" in selection.lower()`, which also matched directories or
    # files that merely contain "srt" in their name; test the extension.
    if selection.lower().endswith(".srt"):
        with open(f"{subsdir}/{selection}", "r", encoding="utf-8") as file:
            subtitle = file.read()
        subs = list(srt.parse(subtitle))
        for sen in subs:
            # Lemmatize; fall back to the surface form when no lemma exists.
            sentence_tokens = [
                word.feature.lemma if word.feature.lemma else word.surface
                for word in tagger(kana.markup_book_html(sen.content))
            ]
            sentence_tokens = [
                kana.clean_lemma(token)
                for token in sentence_tokens
                if not kana.is_single_kana(token)
            ]
            if value in sentence_tokens:
                print_console_and_file(f"{sen.start} {sen.end}", sublogger)
                print_console_and_file(sen.content, sublogger)
    else:
        # Treat the selection as a directory; entries starting with "$"
        # are internal (reports etc.) and are skipped.
        subsrtfiles = [
            f"{selection}/{f.name}"
            for f in os.scandir(f"{subsdir}/{selection}")
            if f.is_file() and f.name[0] != "$"
        ]
        print(subsrtfiles)
        for srtfile in subsrtfiles:
            print_console_and_file(srtfile, sublogger)
            print_matching_sub_lines(subsdir, selection=srtfile, value=value,
                                     sublogger=sublogger)
    return sublogger
# Optionally load city names so their kanji can be treated as known.
# NOTE(review): collapsed formatting made the original nesting ambiguous;
# the article loop is assumed to be top-level, not inside this `if` —
# confirm against the original file.
if IGNORE_CITIES:
    with open(path + "\\citynames.txt", "r", encoding="utf-8") as file:
        cities = file.read()
    cities = kana.markup_known_words(cities)
    # Reduce the city-name list to its set of unique kanji.
    cities = kana.get_unique_kanji(cities)
# Mark known/unknown words and kanji in every downloaded article.
for article in tqdm(subfolders, ascii=True, desc="sorting the articles", ncols=100):
    if article[0] == "$":
        # Folders starting with "$" are internal (reports etc.) — skip.
        continue
    with open(nhkdir + "\\" + article + "\\story.html", "r", encoding="utf-8") as file:
        booktml = file.read()
    # Strip HTML down to tokenizable text, then tokenize with fugashi.
    cleaned_book = kana.markup_book_html(booktml)
    token_words = [word.surface for word in tagger(cleaned_book)]
    uniq_words = list(kana.get_unique_token_words(token_words))
    # Mark words already known (by surface, kanji content, or lemma);
    # `disable=True` presumably silences per-call progress output — verify.
    booktml, kanjiwords, lemmawords, unknown_words = kana.mark_known_words_sbl(
        booktml, uniq_words, known_words, tagger, disable=True)
    booktml = kana.mark_kanjiwords(booktml, kanjiwords, known_words, disable=True)
    booktml = kana.mark_lemmawords(booktml, lemmawords, known_words, disable=True)
    booktml = kana.mark_known_kanji(booktml, known_kanji, disable=True)
    # Kanji present in the article but not in the known set.
    uniq_kanji = kana.get_unique_kanji(uniq_words)
    unknown_kanji = uniq_kanji.difference(known_kanji)
def download_story(story):
    """Download one NHK Easy story, clean its HTML, and save it to disk.

    Fetches ``nhkeasy_prefix + story``, strips blacklisted tags, unwraps
    furigana, rewrites relative image/audio URLs to absolute
    ``nhkeasier.com`` URLs, injects local styling, and writes the result to
    ``<nhkdir><story_dir_prefix><story>/story.html``.  When
    ``DO_KANJI_ANALYZE`` is set, the story is additionally tokenized and
    written out with known/unknown words and kanji marked up, plus word
    frequency lists.

    Parameters
    ----------
    story : int or str
        Story id; appended to the NHK Easy URL and used as directory suffix.
    """
    response = requests.get(nhkeasy_prefix + str(story))
    soup = BeautifulSoup(response.text, "lxml")
    soup = hpre.delete_tags(soup, blacklist)
    soup = hpre.strip_tags_and_font(soup, whitelist)
    # Drop ruby (furigana) markup but keep its base text.
    for tag in soup.find_all("ruby"):
        tag.unwrap()
    soup = hpre.pop_img_width(soup)
    for tag in soup.find_all("img"):
        if tag.get("alt") == "Story illustration":
            # Story image: make the relative src absolute.
            locsrc = tag.get("src")
            tag.attrs["src"] = "https://nhkeasier.com" + locsrc
        elif tag.get("title") is None:  # was `== None`
            pass
        elif "furigana" in tag.get("title"):
            # Furigana rendered as images — remove entirely.
            tag.replace_with("")  # modern bs4 name for replaceWith
    for tag in soup.find_all("audio"):
        audiosrc = tag.get("src")
        tag.attrs["src"] = "https://nhkeasier.com" + audiosrc
        tag.attrs["preload"] = "auto"
    for tag in soup.find_all("p"):  # modern bs4 name for findAll
        tag.string = tag.text.replace(" ", "")
    # Flatten the pretty-printed HTML and normalize the site chrome.
    teststr = soup.prettify()
    teststr = teststr.replace("\n", "")
    teststr = teststr.replace(" ", "")
    teststr = teststr.replace(
        '<h1> <a href="/">NHK News WebEasier </a> </h1>',
        '<h2> <a href="https://nhkeasier.com/">NHK News WebEasier </a> </h2>',
    )
    teststr = teststr.replace("<h2> Single Story </h2>", "")
    teststr = teststr.replace("<link/>", "")
    soup = BeautifulSoup(teststr, "lxml")
    with open(path + "\\styling.txt", "r", encoding="utf-8") as file:
        styletag = file.read()
    soup = hpre.add_style(soup, styletag)
    # Move the audio element directly in front of the story image;
    # AttributeError means one of the two is missing — nothing to move.
    try:
        soup.img.insert_before(soup.audio)
    except AttributeError:
        pass
    # TODO: archive/story_xxxx would be a better directory layout.
    os.mkdir(nhkdir + story_dir_prefix + str(story))
    with open(nhkdir + story_dir_prefix + str(story) + "\\story.html", "w",
              encoding="utf-8") as wr:
        wr.write(soup.prettify())
    if DO_KANJI_ANALYZE:
        # Prefer the word list exported by ankipandas when it exists.
        path_ankipanda = os.path.expanduser("~") + "\\ankipandas_words.txt"
        if os.path.isfile(path_ankipanda):
            with open(path_ankipanda, "r", encoding="utf-8") as file:
                known_words = file.read()
        else:
            with open("known_words.txt", "r", encoding="utf-8") as file:
                known_words = file.read()
            with open("known_supplement.txt", "r", encoding="utf-8") as file:
                known_words2 = file.read()
            known_words = known_words + "\n" + known_words2
        tagger = fugashi.Tagger()
        known_words = kana.markup_known_words(known_words)
        known_kanji = kana.get_unique_kanji(known_words)
        booktml = soup.prettify()
        cleaned_book = kana.markup_book_html(booktml)
        token_words = [word.surface for word in tagger(cleaned_book)]
        uniq_words = kana.get_unique_token_words(token_words)
        # Mark words known by surface/kanji/lemma, then individual kanji.
        booktml, kanjiwords, lemmawords, unknown_words = kana.mark_known_words_sbl(
            booktml, uniq_words, known_words, tagger)
        booktml = kana.mark_kanjiwords(booktml, kanjiwords, known_words)
        booktml = kana.mark_lemmawords(booktml, lemmawords, known_words)
        booktml = kana.mark_known_kanji(booktml, known_kanji)
        uniq_kanji = kana.get_unique_kanji(uniq_words)
        unknown_kanji = uniq_kanji.difference(known_kanji)
        booktml = kana.mark_unknown_kanji(booktml, unknown_kanji)
        with open(
                nhkdir + story_dir_prefix + str(story) + "\\story_marked.html",
                "w",
                encoding="utf-8",
        ) as wr:
            wr.write(booktml)
        freq_list, unknown_freq_list = kana.get_freq_lists(
            token_words, unknown_words)
        with open(
                nhkdir + story_dir_prefix + str(story) + "\\story_freq.txt",
                "w",
                encoding="utf-8",
        ) as wr:
            for w, f in freq_list:
                wr.write(f"{w}, {f}\n")
        with open(
                nhkdir + story_dir_prefix + str(story) + "\\story_unknown_freq.txt",
                "w",
                encoding="utf-8",
        ) as wr:
            for w, f in unknown_freq_list:
                wr.write(f"{w}, {f}\n")
def report_function(booklist):
    """Build a word/kanji statistics report for a list of book directories.

    For each book, token and kanji counts are appended to ``$report.csv``
    in a ``$_report`` directory next to the books, and the full word
    frequency list is stored as an LZMA zip.  Books already present in an
    existing report are skipped; existing zips are reused instead of
    re-tokenizing the book.

    Parameters
    ----------
    booklist : list[str]
        Paths to book directories; each must contain ``<name>.html``.
    """
    from collections import Counter

    import pandas as pd

    OLD_LIB = False
    tagger = fugashi.Tagger()
    reportdf = pd.DataFrame()
    reportdir = f"{os.path.dirname(booklist[0])}/$_report"
    reportname = "$report.csv"
    if not os.path.isdir(reportdir):
        os.mkdir(reportdir)
    if os.path.isfile(f"{reportdir}/{reportname}"):
        lib_df = pd.read_csv(f"{reportdir}/{reportname}", index_col=0)
        OLD_LIB = True
    for novel in tqdm(booklist, ascii=True, desc="Creating Report"):
        reportfile = f"{reportdir}/{os.path.basename(novel)}.zip"
        if OLD_LIB:
            # Already in the library report — nothing to do.
            if lib_df["Name"].isin([os.path.basename(novel)]).any():
                continue
        if os.path.isfile(reportfile):
            # Reuse the frequency list stored in the existing zip.
            with zipfile.ZipFile(reportfile) as myzip:
                with myzip.open(f"{os.path.basename(novel)}.txt") as file:
                    rtxt = file.read().decode("utf-8").splitlines()
            rdict = get_rdict(rtxt)
            all_kanji = kana.remove_non_kanji(kana.getrdictstring(rdict))
            uniq_kanji = set(all_kanji)
            kanji_counter = Counter(all_kanji)
            # Kanji appearing at least 2 / 5 / 10 times.
            n2plus = sum(c >= 2 for c in kanji_counter.values())
            n5plus = sum(c >= 5 for c in kanji_counter.values())
            n10plus = sum(c >= 10 for c in kanji_counter.values())
            add_data = [{
                "Name": os.path.basename(novel),
                "Number Tokens": sum(rdict.values()),
                "Total Words": len(rdict),
                "Total Kanji": len(uniq_kanji),
                "Kanji 10+": n10plus,
                "Kanji 5+": n5plus,
                "Kanji 2+": n2plus,
            }]
            # DataFrame.append was removed in pandas 2.0 — use concat.
            reportdf = pd.concat([reportdf, pd.DataFrame(add_data)],
                                 ignore_index=True, sort=False)
        else:
            with open(f"{novel}/{os.path.basename(novel)}.html", "r",
                      encoding="utf-8") as file:
                raw_book = file.read()
            cleaned_book = kana.markup_book_html(raw_book)
            cleaned_book = kana.reduce_new_lines(cleaned_book)
            sentences = cleaned_book.split("\n")
            token_words = []
            token_extend = token_words.extend
            for sen in sentences:
                # Lemmatize; fall back to surface form when no lemma exists.
                sentence_tokens = [
                    word.feature.lemma if word.feature.lemma else word.surface
                    for word in tagger(sen)
                ]
                sentence_tokens = [
                    kana.clean_lemma(token)
                    for token in sentence_tokens
                    if not kana.is_single_kana(token)
                ]
                sentence_tokens = kana.get_unique_token_words(sentence_tokens)
                token_extend(sentence_tokens)
            token_counter = Counter(token_words)
            all_kanji = kana.remove_non_kanji("".join(token_words))
            token_words = set(token_words)
            uniq_kanji = set(all_kanji)
            kanji_counter = Counter(all_kanji)
            # Kanji appearing at least 2 / 5 / 10 times.
            n2plus = sum(c >= 2 for c in kanji_counter.values())
            n5plus = sum(c >= 5 for c in kanji_counter.values())
            n10plus = sum(c >= 10 for c in kanji_counter.values())
            add_data = [{
                "Name": os.path.basename(novel),
                "Number Tokens": sum(token_counter.values()),
                "Total Words": len(token_words),
                "Total Kanji": len(uniq_kanji),
                "Kanji 10+": n10plus,
                "Kanji 5+": n5plus,
                "Kanji 2+": n2plus,
            }]
            reportdf = pd.concat([reportdf, pd.DataFrame(add_data)],
                                 ignore_index=True, sort=False)
            # One C-level join instead of quadratic string +=.
            counterstr = "".join(
                f"{k}, {v}\n" for k, v in token_counter.most_common())
            with open(f"{reportdir}/{os.path.basename(novel)}.txt", "w",
                      encoding="utf-8") as wr:
                wr.write(counterstr)
            with zipfile.ZipFile(f"{reportdir}/{os.path.basename(novel)}.zip",
                                 "w", zipfile.ZIP_LZMA) as myzip:
                myzip.write(
                    f"{reportdir}/{os.path.basename(novel)}.txt",
                    f"{os.path.basename(novel)}.txt",
                )
            # The plain-text list only exists to feed the zip — remove it.
            if os.path.exists(f"{reportdir}/{os.path.basename(novel)}.txt"):
                os.remove(f"{reportdir}/{os.path.basename(novel)}.txt")
    if OLD_LIB:
        lib_df = pd.concat([lib_df, reportdf], ignore_index=True, sort=False)
        lib_df.to_csv(f"{reportdir}/{reportname}", index_label="Index")
    else:
        reportdf.to_csv(f"{reportdir}/{reportname}", index_label="Index")
def srt_processing(subtitledir, reportdir=None):
    """Create word/kanji frequency reports for a directory of subtitles.

    Runs two passes: one over subdirectories (a series — all contained
    subtitle files concatenated, with character names removed) and one over
    loose subtitle files.  For each item, token and kanji statistics are
    appended to ``$report.csv`` and the full word frequency list is stored
    as an LZMA zip next to it.  Items already present in an existing report
    are skipped; existing zips are reused instead of re-tokenizing.

    Parameters
    ----------
    subtitledir : str
        Directory containing subtitle folders and/or ``.srt`` files.
    reportdir : str, optional
        Where to write the reports (default ``<subtitledir>/$_report``).
    """
    import srt
    from collections import Counter

    import pandas as pd

    OLD_LIB = False
    tagger = fugashi.Tagger()
    reportdf = pd.DataFrame()
    if reportdir is None:
        reportdir = f"{subtitledir}/$_report"
    reportname = "$report.csv"
    if not os.path.isdir(reportdir):
        os.mkdir(reportdir)
    if os.path.isfile(f"{reportdir}/{reportname}"):
        lib_df = pd.read_csv(f"{reportdir}/{reportname}", index_col=0)
        OLD_LIB = True
    # Entries starting with "$" are internal (e.g. the report dir itself).
    srtdirs = [
        f"{subtitledir}/{f.name}" for f in os.scandir(subtitledir)
        if f.is_dir() and f.name[0] != "$"
    ]
    srtfiles = [
        f"{subtitledir}/{f.name}" for f in os.scandir(subtitledir)
        if f.is_file() and f.name[0] != "$"
    ]
    for subdir in tqdm(srtdirs, ascii=True, desc="Creating Directory Report"):
        reportfile = f"{reportdir}/{os.path.basename(subdir)}.zip"
        if OLD_LIB:
            # Already in the library report — nothing to do.
            if lib_df["Name"].isin([os.path.basename(subdir)]).any():
                continue
        subsrtfiles = [
            f"{subdir}/{f.name}" for f in os.scandir(subdir)
            if f.is_file() and f.name[0] != "$"
        ]
        if os.path.isfile(reportfile):
            # Reuse the frequency list stored in the existing zip.
            with zipfile.ZipFile(reportfile) as myzip:
                with myzip.open(f"{os.path.basename(subdir)}.txt") as file:
                    rtxt = file.read().decode("utf-8").splitlines()
            rdict = get_rdict(rtxt)
            all_kanji = kana.remove_non_kanji(kana.getrdictstring(rdict))
            uniq_kanji = set(all_kanji)
            kanji_counter = Counter(all_kanji)
            # Kanji appearing at least 2 / 5 / 10 times.
            n2plus = sum(c >= 2 for c in kanji_counter.values())
            n5plus = sum(c >= 5 for c in kanji_counter.values())
            n10plus = sum(c >= 10 for c in kanji_counter.values())
            add_data = [{
                "Name": os.path.basename(subdir),
                "Number Tokens": sum(rdict.values()),
                "Total Words": len(rdict),
                "Total Kanji": len(uniq_kanji),
                "Kanji 10+": n10plus,
                "Kanji 5+": n5plus,
                "Kanji 2+": n2plus,
            }]
            # DataFrame.append was removed in pandas 2.0 — use concat.
            reportdf = pd.concat([reportdf, pd.DataFrame(add_data)],
                                 ignore_index=True, sort=False)
        else:
            # Concatenate the whole series into one subtitle stream.
            concatsubs = ""
            for subf in subsrtfiles:
                with open(f"{subf}", "r", encoding="utf-8") as file:
                    concatsubs += file.read()
            subs = list(srt.parse(concatsubs))
            token_words = []
            token_extend = token_words.extend
            for sen in subs:
                # Lemmatize; fall back to surface form when no lemma exists.
                sentence_tokens = [
                    word.feature.lemma if word.feature.lemma else word.surface
                    for word in tagger(
                        kana.markup_book_html(remove_names(sen.content)))
                ]
                sentence_tokens = [
                    kana.clean_lemma(token)
                    for token in sentence_tokens
                    if not kana.is_single_kana(token)
                ]
                sentence_tokens = kana.get_unique_token_words(sentence_tokens)
                token_extend(sentence_tokens)
            token_counter = Counter(token_words)
            all_kanji = kana.remove_non_kanji("".join(token_words))
            uni_token_words = set(token_words)
            uniq_kanji = set(all_kanji)
            kanji_counter = Counter(all_kanji)
            # Kanji appearing at least 2 / 5 / 10 times.
            n2plus = sum(c >= 2 for c in kanji_counter.values())
            n5plus = sum(c >= 5 for c in kanji_counter.values())
            n10plus = sum(c >= 10 for c in kanji_counter.values())
            add_data = [{
                "Name": os.path.basename(subdir),
                "Number Tokens": sum(token_counter.values()),
                "Total Words": len(uni_token_words),
                "Total Kanji": len(uniq_kanji),
                "Kanji 10+": n10plus,
                "Kanji 5+": n5plus,
                "Kanji 2+": n2plus,
            }]
            reportdf = pd.concat([reportdf, pd.DataFrame(add_data)],
                                 ignore_index=True, sort=False)
            counterstr = "".join(
                f"{k}, {v}\n" for k, v in token_counter.most_common())
            with open(f"{reportdir}/{os.path.basename(subdir)}.txt", "w",
                      encoding="utf-8") as wr:
                wr.write(counterstr)
            with zipfile.ZipFile(f"{reportdir}/{os.path.basename(subdir)}.zip",
                                 "w", zipfile.ZIP_LZMA) as myzip:
                myzip.write(
                    f"{reportdir}/{os.path.basename(subdir)}.txt",
                    f"{os.path.basename(subdir)}.txt",
                )
            # The plain-text list only exists to feed the zip — remove it.
            if os.path.exists(f"{reportdir}/{os.path.basename(subdir)}.txt"):
                os.remove(f"{reportdir}/{os.path.basename(subdir)}.txt")
    for subf in tqdm(srtfiles, ascii=True, desc="Creating File Report"):
        reportfile = f"{reportdir}/{os.path.basename(subf)}.zip"
        if OLD_LIB:
            if lib_df["Name"].isin([os.path.basename(subf)]).any():
                continue
        if os.path.isfile(reportfile):
            with zipfile.ZipFile(reportfile) as myzip:
                with myzip.open(f"{os.path.basename(subf)}.txt") as file:
                    rtxt = file.read().decode("utf-8").splitlines()
            rdict = get_rdict(rtxt)
            all_kanji = kana.remove_non_kanji(kana.getrdictstring(rdict))
            uniq_kanji = set(all_kanji)
            kanji_counter = Counter(all_kanji)
            # Kanji appearing at least 2 / 5 / 10 times.
            n2plus = sum(c >= 2 for c in kanji_counter.values())
            n5plus = sum(c >= 5 for c in kanji_counter.values())
            n10plus = sum(c >= 10 for c in kanji_counter.values())
            add_data = [{
                "Name": os.path.basename(subf),
                "Number Tokens": sum(rdict.values()),
                "Total Words": len(rdict),
                "Total Kanji": len(uniq_kanji),
                "Kanji 10+": n10plus,
                "Kanji 5+": n5plus,
                "Kanji 2+": n2plus,
            }]
            reportdf = pd.concat([reportdf, pd.DataFrame(add_data)],
                                 ignore_index=True, sort=False)
        else:
            with open(f"{subf}", "r", encoding="utf-8") as file:
                subtitle = file.read()
            subs = list(srt.parse(subtitle))
            token_words = []
            token_extend = token_words.extend
            for sen in subs:
                sentence_tokens = [
                    word.feature.lemma if word.feature.lemma else word.surface
                    for word in tagger(kana.markup_book_html(sen.content))
                ]
                sentence_tokens = [
                    kana.clean_lemma(token)
                    for token in sentence_tokens
                    if not kana.is_single_kana(token)
                ]
                sentence_tokens = kana.get_unique_token_words(sentence_tokens)
                token_extend(sentence_tokens)
            token_counter = Counter(token_words)
            all_kanji = kana.remove_non_kanji("".join(token_words))
            uni_token_words = set(token_words)
            uniq_kanji = set(all_kanji)
            kanji_counter = Counter(all_kanji)
            # Kanji appearing at least 2 / 5 / 10 times.
            n2plus = sum(c >= 2 for c in kanji_counter.values())
            n5plus = sum(c >= 5 for c in kanji_counter.values())
            n10plus = sum(c >= 10 for c in kanji_counter.values())
            add_data = [{
                "Name": os.path.basename(subf),
                "Number Tokens": sum(token_counter.values()),
                "Total Words": len(uni_token_words),
                "Total Kanji": len(uniq_kanji),
                "Kanji 10+": n10plus,
                "Kanji 5+": n5plus,
                "Kanji 2+": n2plus,
            }]
            reportdf = pd.concat([reportdf, pd.DataFrame(add_data)],
                                 ignore_index=True, sort=False)
            counterstr = "".join(
                f"{k}, {v}\n" for k, v in token_counter.most_common())
            with open(f"{reportdir}/{os.path.basename(subf)}.txt", "w",
                      encoding="utf-8") as wr:
                wr.write(counterstr)
            with zipfile.ZipFile(f"{reportdir}/{os.path.basename(subf)}.zip",
                                 "w", zipfile.ZIP_LZMA) as myzip:
                myzip.write(
                    f"{reportdir}/{os.path.basename(subf)}.txt",
                    f"{os.path.basename(subf)}.txt",
                )
            if os.path.exists(f"{reportdir}/{os.path.basename(subf)}.txt"):
                os.remove(f"{reportdir}/{os.path.basename(subf)}.txt")
    if OLD_LIB:
        lib_df = pd.concat([lib_df, reportdf], ignore_index=True, sort=False)
        lib_df.to_csv(f"{reportdir}/{reportname}", index_label="Index")
    else:
        reportdf.to_csv(f"{reportdir}/{reportname}", index_label="Index")
def main(no_kanjigrid, user):
    """Collect known words from an Anki collection and update progress files.

    Reads the card models listed in ``resources/anki_cards.txt``, extracts
    the relevant field of all due/suspended (and optionally new leech)
    cards, optionally lemmatizes them, merges them with previously known
    words, writes the updated known-word list plus a summary, appends a row
    to the progress CSV, and optionally regenerates the kanji grid.

    Parameters
    ----------
    no_kanjigrid : bool
        When true, skip regenerating the kanji grid.
    user : str
        Anki profile name to load the collection from.
    """
    try:
        import fugashi

        tagger = fugashi.Tagger()
        EXTRA = True
    except ModuleNotFoundError:
        # fugashi is optional; without it no lemmatization is performed.
        EXTRA = False
    # (A dead `os.path.abspath("")` assignment was removed here.)
    __loc__ = os.path.dirname(os.path.realpath(__file__))
    DISREGARD_OLD_KNOWN = False
    ADD_NX_SUP = False
    CREATE_KANJIGRID = not no_kanjigrid
    COUNT_NEW_LEECHES = True
    write_to_file_text = ""
    col = Collection(user=user)
    notes = col.cards.merge_notes()
    path = __loc__ + "\\resources"
    kw_path = path + "\\.known_words.txt"
    if os.path.isfile(path + "\\.previous.txt"):
        with open(path + "\\.previous.txt", "r", encoding="utf-8") as file:
            print("Previous known words:")
            print(file.read())
        print("_" * 50 + "\n" * 2)
        print("Current known words:")
    # was missing an explicit encoding — every other open() here uses utf-8
    with open(path + "\\anki_cards.txt", "r", encoding="utf-8") as file:
        card_list = file.read().splitlines()
    words = []
    for cards in card_list:
        # Each line is "<model name>:<1-based field index>".
        card, field = cards.split(":")
        field = int(field)
        selection = notes.query(
            f"nmodel == '{card}' and cqueue == 'due' "
            f"or nmodel == '{card}' and cqueue == 'suspended'"
        )
        sellist = selection["nflds"].tolist()
        if COUNT_NEW_LEECHES:
            # Leech-tagged cards still in the "new" queue also count.
            mask = notes.ntags.apply(lambda x: "leech" in x)
            leech_sel = notes[mask]
            sel = leech_sel.query(f"nmodel == '{card}' and cqueue == 'new'")
            sellist.extend(sel["nflds"].tolist())
        print(f"card model {card} found:")
        write_to_file_text = write_to_file_text + f"card model {card} found:" + "\n"
        print(len(sellist))
        write_to_file_text = write_to_file_text + str(len(sellist)) + "\n"
        for w in sellist:
            if not kana.is_single_kana(w[field - 1]):
                words.append(w[field - 1])
    uniq_w = set(words)
    # For a better representation of what is actually known it would
    # probably be better to lemmatize right before any processing and not
    # now, which just inflates the numbers.  (21.01: still unsure.)
    if EXTRA:
        extra = set()
        for w in uniq_w:
            w = kana.markup_book_html(w)
            # Lemmatize; fall back to surface form when no lemma exists.
            tags = [
                word.feature.lemma if word.feature.lemma else word.surface
                for word in tagger(w)
            ]
            tags = [
                kana.clean_lemma(token)
                for token in tags
                if not kana.is_single_kana(token)
            ]
            tags = kana.get_unique_token_words(tags)
            extra.update(tags)
        uniq_w.update(extra)
    if not DISREGARD_OLD_KNOWN:
        if os.path.isfile(kw_path):
            with open(kw_path, "r", encoding="utf-8") as file:
                previous_known = file.read().splitlines()
            previous_known = [
                word for word in previous_known
                if not kana.is_single_kana(word) and word
            ]
            uniq_w.update(previous_known)
    if ADD_NX_SUP:
        # Optional JLPT N1–N5 supplement lists.
        nx_sup = []
        for i in range(1, 6):
            if os.path.isfile("n" + str(i) + ".txt"):
                with open("n" + str(i) + ".txt", "r", encoding="utf-8") as file:
                    nx_sup.extend(list(file.read().split("\n")))
        uniq_w.update(nx_sup)
    muniq = {w for w in kana.markup_known_words("\n".join(uniq_w)) if w != ""}
    muniq = sorted(muniq)
    uniqK = kana.get_unique_kanji(muniq)
    print(f"found a total of {len(muniq)} words")
    print(f"with a total of {len(uniqK)} unique kanji")
    write_to_file_text = (
        write_to_file_text + f"found a total of {len(muniq)} words" + "\n"
    )
    write_to_file_text = (
        write_to_file_text + f"with a total of {len(uniqK)} unique kanji" + "\n"
    )
    with open(kw_path, "w", encoding="utf-8") as wr:
        wr.write("\n".join(muniq))
    with open(path + "\\.previous.txt", "w", encoding="utf-8") as wr:
        wr.write(write_to_file_text)
    add_data = [
        {
            "Date": current_date,
            "Time": current_time,
            "Words": len(muniq),
            "Kanji": len(uniqK),
        }
    ]
    if os.path.isfile(path + "\\.progress.csv"):
        prog_df = pd.read_csv(path + "\\.progress.csv", index_col=0)
        # DataFrame.append was removed in pandas 2.0 — use concat.
        prog_df = pd.concat([prog_df, pd.DataFrame(add_data)],
                            ignore_index=True, sort=False)
        prog_df.to_csv(path + "\\.progress.csv", index_label="Index")
    else:
        prog_df = pd.DataFrame(add_data)
        prog_df.to_csv(path + "\\.progress.csv", index_label="Index")
    if CREATE_KANJIGRID:
        kj.main()