sorted_path = nhkdir + "\\$_sorted_by_kanji"
story_dir_prefix = "\\Story_"
tagger = fugashi.Tagger()
if os.path.isdir(sorted_path):
    shutil.rmtree(sorted_path)
os.mkdir(sorted_path)
kw_path = path + "\\.known_words.txt"
if os.path.isfile(kw_path):
    with open(kw_path, "r", encoding="utf-8") as file:
        known_words = file.read()
    known_words = list(kana.markup_known_words(known_words))
    known_kanji = kana.get_unique_kanji(known_words)
subfolders = [
    f.name for f in os.scandir(nhkdir) if f.is_dir() and f.name[0] != "$"
]
if IGNORE_CITIES:
    with open(path + "\\citynames.txt", "r", encoding="utf-8") as file:
        cities = file.read()
    cities = kana.markup_known_words(cities)
    cities = kana.get_unique_kanji(cities)
for article in tqdm(subfolders, ascii=True, desc="sorting the articles", ncols=100):
def download_story(story):
    response = requests.get(nhkeasy_prefix + str(story))
    soup = BeautifulSoup(response.text, "lxml")
    soup = hpre.delete_tags(soup, blacklist)
    soup = hpre.strip_tags_and_font(soup, whitelist)
    for tag in soup.find_all("ruby"):
        tag.unwrap()
    soup = hpre.pop_img_width(soup)
    for tag in soup.find_all("img"):
        if tag.get("alt") == "Story illustration":
            locsrc = tag.get("src")
            tag.attrs["src"] = "https://nhkeasier.com" + locsrc
        elif tag.get("title") is None:
            pass
        elif "furigana" in tag.get("title"):
            tag.replace_with("")
    for tag in soup.find_all("audio"):
        locsrc = tag.get("src")
        tag.attrs["src"] = "https://nhkeasier.com" + locsrc
        tag.attrs["preload"] = "auto"
    for tag in soup.find_all("p"):
        tag.string = tag.text.replace(" ", "")
    teststr = soup.prettify()
    teststr = teststr.replace("\n", "")
    teststr = teststr.replace(" ", "")
    teststr = teststr.replace(
        '<h1> <a href="/">NHK News WebEasier </a> </h1>',
        '<h2> <a href="https://nhkeasier.com/">NHK News WebEasier </a> </h2>',
    )
    teststr = teststr.replace("<h2> Single Story </h2>", "")
    teststr = teststr.replace("<link/>", "")
    soup = BeautifulSoup(teststr, "lxml")
    with open(path + "\\styling.txt", "r", encoding="utf-8") as file:
        styletag = file.read()
    soup = hpre.add_style(soup, styletag)
    try:
        # move the audio player directly above the story illustration
        soup.img.insert_before(soup.audio)
    except AttributeError:
        pass
    # TODO: an archive/story_xxxx layout would be better than Story_xxxx
    os.mkdir(nhkdir + story_dir_prefix + str(story))
    with open(
        nhkdir + story_dir_prefix + str(story) + "\\story.html",
        "w",
        encoding="utf-8",
    ) as wr:
        wr.write(soup.prettify())
    if DO_KANJI_ANALYZE:
        path_ankipanda = os.path.expanduser("~") + "\\ankipandas_words.txt"
        if os.path.isfile(path_ankipanda):
            with open(path_ankipanda, "r", encoding="utf-8") as file:
                known_words = file.read()
        else:
            with open("known_words.txt", "r", encoding="utf-8") as file:
                known_words = file.read()
            with open("known_supplement.txt", "r", encoding="utf-8") as file:
                known_words2 = file.read()
            known_words = known_words + "\n" + known_words2
        tagger = fugashi.Tagger()
        known_words = kana.markup_known_words(known_words)
        known_kanji = kana.get_unique_kanji(known_words)
        booktml = soup.prettify()
        cleaned_book = kana.markup_book_html(booktml)
        token_words = [word.surface for word in tagger(cleaned_book)]
        uniq_words = kana.get_unique_token_words(token_words)
        booktml, kanjiwords, lemmawords, unknown_words = kana.mark_known_words_sbl(
            booktml, uniq_words, known_words, tagger)
        booktml = kana.mark_kanjiwords(booktml, kanjiwords, known_words)
        booktml = kana.mark_lemmawords(booktml, lemmawords, known_words)
        booktml = kana.mark_known_kanji(booktml, known_kanji)
        uniq_kanji = kana.get_unique_kanji(uniq_words)
        unknown_kanji = uniq_kanji.difference(known_kanji)
        booktml = kana.mark_unknown_kanji(booktml, unknown_kanji)
        with open(
            nhkdir + story_dir_prefix + str(story) + "\\story_marked.html",
            "w",
            encoding="utf-8",
        ) as wr:
            wr.write(booktml)
        freq_list, unknown_freq_list = kana.get_freq_lists(
            token_words, unknown_words)
        with open(
            nhkdir + story_dir_prefix + str(story) + "\\story_freq.txt",
            "w",
            encoding="utf-8",
        ) as wr:
            for w, f in freq_list:
                wr.write(f"{w}, {f}\n")
        with open(
            nhkdir + story_dir_prefix + str(story) + "\\story_unknown_freq.txt",
            "w",
            encoding="utf-8",
        ) as wr:
            for w, f in unknown_freq_list:
                wr.write(f"{w}, {f}\n")
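# A minimal usage sketch for download_story (it relies on the module-level
# configuration above -- nhkeasy_prefix, nhkdir, story_dir_prefix, path,
# blacklist, whitelist, DO_KANJI_ANALYZE; the story id below is hypothetical):
#
#     download_story(12345)
#     # -> writes nhkdir\Story_12345\story.html, plus story_marked.html and
#     #    the two frequency lists when DO_KANJI_ANALYZE is enabled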
def main(print_unknown_kanji):
    __loc__ = os.path.dirname(os.path.realpath(__file__))
    path = __loc__ + "\\resources"
    kw_path = path + "\\.known_words.txt"
    notopath = (
        os.path.expanduser("~")
        + r"\AppData\Local\Microsoft\Windows\Fonts\NotoSansJP-Regular.otf"
    )
    try:
        fontjp = ImageFont.truetype(notopath, size=KANJISIZE)
    except OSError:
        try:
            # alternatives: 'msgothic.ttc', 'msmincho.ttc'
            fontjp = ImageFont.truetype("YuGothM.ttc", size=KANJISIZE)
        except OSError:
            print("could not load a Japanese font")
    try:
        headerfont = ImageFont.truetype("cambria.ttc", size=HEADERFONTSIZE)
    except OSError:
        print("could not load the header font")
    print("Used Font: " + fontjp.font.family)
    with open(kw_path, "r", encoding="utf-8") as file:
        known_words = file.read()
    kanjistring = kana.remove_non_kanji(known_words)
    kanji_freq = Counter(kanjistring)
    known_kanji = kana.get_unique_kanji(known_words)
    all_kanji_in_grading = set()
    columns = COLUMNS
    tempimg = Image.new("RGB", (fontjp.size * columns, 0))
    for grade in grades:
        with open(path + "/Grade_" + grade + ".txt", "r", encoding="utf-8") as file:
            grade_list = file.read().split("\n")
        all_kanji_in_grading.update(grade_list)
        grid = get_kanji_grid_wcounter(grade_list, kanji_freq, fontjp, columns)
        # generate the header for this grade and stack it on top of the grid
        head = get_header(f"Grade {grade}", headerfont, headerfont.size, columns)
        concat = get_vert_cat(head, grid)
        tempimg = get_vert_cat(tempimg, concat)
        known_kanji.difference_update(set(grade_list))
    if known_kanji:
        head = get_header(
            "Known Kanji outside the Grading", headerfont, headerfont.size, columns
        )
        kanjilist = sorted(known_kanji)
        grid = get_kanji_grid_wcounter(kanjilist, kanji_freq, fontjp, columns)
        concat = get_vert_cat(head, grid)
        tempimg = get_vert_cat(tempimg, concat)
    known_kanji = kana.get_unique_kanji(known_words)
    unknown_kanji = all_kanji_in_grading.difference(known_kanji)
    total = len(all_kanji_in_grading)
    stats = get_stats(
        kanji_freq, unknown_kanji, total, headerfont, headerfont.size, columns
    )
    tempimg = get_vert_cat(tempimg, stats)
    # add white padding on both sides
    pad_sides = Image.new("RGB", (KANJISIZE, tempimg.height), color="#FFFFFF")
    tempimg = get_hori_cat(tempimg, pad_sides)
    tempimg = get_hori_cat(pad_sides, tempimg)
    now = datetime.now()
    current_time = now.strftime("_%H_%M")
    current_date = now.strftime("%Y_%m_%d")
    tempimg.save(path + f"/Kanjigrid_{current_date}{current_time}.png")
    if print_unknown_kanji:
        print(unknown_kanji)
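# get_vert_cat / get_hori_cat are defined elsewhere in this module. A
# minimal sketch of a vertical-concatenation helper consistent with how it
# is called above (an assumption for illustration, not the actual
# implementation) could look like:
#
#     def get_vert_cat(top, bottom):
#         out = Image.new(
#             "RGB",
#             (max(top.width, bottom.width), top.height + bottom.height),
#             "#FFFFFF",
#         )
#         out.paste(top, (0, 0))
#         out.paste(bottom, (0, top.height))
#         return out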
for v in val:
    sel_ser = shorted_subfol[v - 1]
    cur_sel = [vol for vol in subfolders if sel_ser in vol]
    selected_list.extend(cur_sel)
filelist = [f"{lndir}{s}/{s}.html" for s in selected_list]
corpus = ""
for book in filelist:
    with open(book, "r", encoding="utf-8") as file:
        data = file.read()
    cleaned_data = kana.markup_book_html_rem_furigana(data)
    corpus += cleaned_data
uniq_kanji = kana.get_unique_kanji(corpus)
corpus = kana.reduce_new_lines(corpus)
# Tagging the whole corpus takes a long time, depending on its size, and
# the list comprehensions are slow as well. Splitting the corpus and
# feeding it into the tagger in chunks does not increase speed, and
# fugashi uses a lot of memory on big inputs, but nothing important is
# lost by tagging everything in one pass.
# token_words = [[word.surface, word.feature.lemma] for word in tagger(corpus)]
token_flat = [
    feat
    for word in tagger(corpus)
    for feat in [word.surface, word.feature.lemma]
    if feat
]
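# Sketch of what token_flat holds for a small input (the lemmas shown are
# illustrative; the exact output depends on the installed UniDic):
#
#     >>> [f for w in tagger("猫が好きだった") for f in (w.surface, w.feature.lemma) if f]
#     ['猫', '猫', 'が', 'が', '好き', '好き', 'だっ', 'だ', 'た', 'た']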
def main(no_kanjigrid, user):
    try:
        import fugashi

        tagger = fugashi.Tagger()
        EXTRA = True
    except ModuleNotFoundError:
        EXTRA = False
    __loc__ = os.path.dirname(os.path.realpath(__file__))
    DISREGARD_OLD_KNOWN = False
    ADD_NX_SUP = False
    CREATE_KANJIGRID = not no_kanjigrid
    COUNT_NEW_LEECHES = True
    write_to_file_text = ""
    col = Collection(user=user)
    notes = col.cards.merge_notes()
    path = __loc__ + "\\resources"
    kw_path = path + "\\.known_words.txt"
    if os.path.isfile(path + "\\.previous.txt"):
        with open(path + "\\.previous.txt", "r", encoding="utf-8") as file:
            print("Previous known words:")
            print(file.read())
        print("_" * 50 + "\n" * 2)
    print("Current known words:")
    with open(path + "\\anki_cards.txt", "r", encoding="utf-8") as file:
        card_list = file.read().splitlines()
    words = []
    for cards in card_list:
        card, field = cards.split(":")
        field = int(field)
        selection = notes.query(
            f"nmodel == '{card}' and cqueue == 'due' "
            f"or nmodel == '{card}' and cqueue == 'suspended'"
        )
        sellist = selection["nflds"].tolist()
        if COUNT_NEW_LEECHES:
            mask = notes.ntags.apply(lambda x: "leech" in x)
            leech_sel = notes[mask]
            sel = leech_sel.query(f"nmodel == '{card}' and cqueue == 'new'")
            sellist.extend(sel["nflds"].tolist())
        print(f"card model {card} found:")
        write_to_file_text += f"card model {card} found:\n"
        print(len(sellist))
        write_to_file_text += str(len(sellist)) + "\n"
        for w in sellist:
            if not kana.is_single_kana(w[field - 1]):
                words.append(w[field - 1])
    uniq_w = set(words)
    # For a better representation of what is actually known, it would
    # probably be better to lemmatize right before any processing rather
    # than here, which just inflates the numbers. (21.01: still unsure.)
    if EXTRA:
        extra = set()
        for w in uniq_w:
            w = kana.markup_book_html(w)
            tags = [
                word.feature.lemma if word.feature.lemma else word.surface
                for word in tagger(w)
            ]
            tags = [
                kana.clean_lemma(token)
                for token in tags
                if not kana.is_single_kana(token)
            ]
            tags = kana.get_unique_token_words(tags)
            extra.update(tags)
        uniq_w.update(extra)
    if not DISREGARD_OLD_KNOWN and os.path.isfile(kw_path):
        with open(kw_path, "r", encoding="utf-8") as file:
            previous_known = file.read().splitlines()
        previous_known = [
            word
            for word in previous_known
            if not kana.is_single_kana(word) and word
        ]
        uniq_w.update(previous_known)
    if ADD_NX_SUP:
        nx_sup = []
        for i in range(1, 6):
            if os.path.isfile("n" + str(i) + ".txt"):
                with open("n" + str(i) + ".txt", "r", encoding="utf-8") as file:
                    nx_sup.extend(file.read().split("\n"))
        uniq_w.update(nx_sup)
    muniq = {w for w in kana.markup_known_words("\n".join(uniq_w)) if w != ""}
    muniq = sorted(muniq)
    uniqK = kana.get_unique_kanji(muniq)
    print(f"found a total of {len(muniq)} words")
    print(f"with a total of {len(uniqK)} unique kanji")
    write_to_file_text += f"found a total of {len(muniq)} words\n"
    write_to_file_text += f"with a total of {len(uniqK)} unique kanji\n"
    with open(kw_path, "w", encoding="utf-8") as wr:
        wr.write("\n".join(muniq))
    with open(path + "\\.previous.txt", "w", encoding="utf-8") as wr:
        wr.write(write_to_file_text)
    add_data = [
        {
            "Date": current_date,
            "Time": current_time,
            "Words": len(muniq),
            "Kanji": len(uniqK),
        }
    ]
    if os.path.isfile(path + "\\.progress.csv"):
        prog_df = pd.read_csv(path + "\\.progress.csv", index_col=0)
        # DataFrame.append was removed in pandas 2.0; pd.concat replaces it
        prog_df = pd.concat(
            [prog_df, pd.DataFrame(add_data)], ignore_index=True, sort=False
        )
        prog_df.to_csv(path + "\\.progress.csv", index_label="Index")
    else:
        prog_df = pd.DataFrame(add_data)
        prog_df.to_csv(path + "\\.progress.csv", index_label="Index")
    if CREATE_KANJIGRID:
        kj.main()
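# A minimal usage sketch (assumption: "User 1" stands in for the actual
# Anki profile name):
#
#     main(no_kanjigrid=False, user="User 1")
#     # reads the models/fields listed in resources\anki_cards.txt, rewrites
#     # resources\.known_words.txt, appends a row to resources\.progress.csv
#     # and finally regenerates the kanji grid via kj.main()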