def personal_report(bookdir, subsdir):
    # The tokenizer makes a lot of mistakes; to deal with some of them a
    # .prignore.txt is used, containing words that are the result of wrong
    # lemmatization (e.g. 二郎 sometimes comes out as 次郎 instead of ジロウ).
    # There is no time to think of a proper solution yet, so this is a hotfix.
    # Note: kw_path, get_rdict, kana, Counter and zipfile are expected to be
    # available at module level.
    ignoset = set()
    if os.path.isfile(".prignore.txt"):
        with open(".prignore.txt", "r", encoding="utf-8") as file:
            ignoset = set(file.read().splitlines())
        # only the first whitespace-separated token of each non-empty line counts
        ignoset = {k.split()[0] for k in ignoset if k.strip()}
    bookset = {
        f.name
        for f in os.scandir(bookdir)
        if f.is_dir()
        and f.name[0] != "$"
        and os.path.isfile(f"{bookdir}/{f.name}/read.txt")
    }
    readlist = [
        f.path
        for f in os.scandir(f"{bookdir}/$_report")
        if os.path.splitext(f.name)[0] in bookset
    ]
    sublist = [
        f.path for f in os.scandir(f"{subsdir}/$_report") if f.name[0] != "$"
    ]
    total_counter = Counter()
    reference_dict = dict()
    # subtitle reports are zipped; each archive contains a .txt of the same name
    for subf in sublist:
        if os.path.isfile(subf):
            subfname = f"{os.path.splitext(os.path.basename(subf))[0]}.txt"
            with zipfile.ZipFile(subf) as myzip:
                with myzip.open(subfname) as file:
                    rtxt = file.read().decode("utf-8").splitlines()
            rdict = get_rdict(rtxt)
            total_counter += Counter(rdict)
            for key in rdict.keys():
                if key in reference_dict:
                    reference_dict[key] += f", {subfname}"
                else:
                    reference_dict[key] = subfname
    # book reports are plain text files
    for book in readlist:
        if os.path.isfile(book):
            with open(book, "r", encoding="utf-8") as file:
                rtxt = file.read().splitlines()
            rdict = get_rdict(rtxt)
            total_counter += Counter(rdict)
            for key in rdict.keys():
                if key in reference_dict:
                    reference_dict[key] += f", {os.path.basename(book)}"
                else:
                    reference_dict[key] = f"{os.path.basename(book)}"
    if os.path.isfile(kw_path):
        with open(kw_path, "r", encoding="utf-8") as file:
            known_words = file.read()
        known_words = kana.markup_known_words(known_words)
    else:
        known_words = set()
    counterstr = ""
    for k, v in total_counter.most_common():
        if k not in known_words and k not in ignoset and not kana.is_katakana(k):
            counterstr += f"{k}, {v}, {reference_dict[k]}\n"
    with open("$PersonalReport.csv", "w", encoding="utf-8") as wr:
        wr.write(counterstr)
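# Hedged worked example of the aggregation above, assuming get_rdict returns a
# plain {term: count} mapping per report file (the sample data and file names
# are invented, not taken from the source):
#
#   rdict_a = {"言葉": 3, "研究": 1}   # from book_a.txt
#   rdict_b = {"言葉": 2}              # from film_b.txt
#   Counter(rdict_a) + Counter(rdict_b)
#   -> Counter({"言葉": 5, "研究": 1})
#   reference_dict
#   -> {"言葉": "book_a.txt, film_b.txt", "研究": "book_a.txt"}
#
# $PersonalReport.csv then lists, per unknown term, its total count and the
# files it appeared in.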
nhkdir = __loc__ + "\\nhkeasier_archive"
sorted_path = nhkdir + "\\$_sorted_by_kanji"
story_dir_prefix = "\\Story_"
tagger = fugashi.Tagger()
if os.path.isdir(sorted_path):
    shutil.rmtree(sorted_path)
os.mkdir(sorted_path)
kw_path = path + "\\.known_words.txt"
if os.path.isfile(kw_path):
    with open(kw_path, "r", encoding="utf-8") as file:
        known_words = file.read()
    known_words = list(kana.markup_known_words(known_words))
    known_kanji = kana.get_unique_kanji(known_words)
subfolders = [
    f.name for f in os.scandir(nhkdir) if f.is_dir() and f.name[0] != "$"
]
if IGNORE_CITIES:
    with open(path + "\\citynames.txt", "r", encoding="utf-8") as file:
        cities = file.read()
    cities = kana.markup_known_words(cities)
    cities = kana.get_unique_kanji(cities)
for article in tqdm(subfolders, ascii=True, desc="sorting the articles",
def download_story(story):
    response = requests.get(nhkeasy_prefix + str(story))
    soup = BeautifulSoup(response.text, "lxml")
    soup = hpre.delete_tags(soup, blacklist)
    soup = hpre.strip_tags_and_font(soup, whitelist)
    # drop the furigana markup but keep the base text
    for tag in soup.find_all("ruby"):
        tag.unwrap()
    soup = hpre.pop_img_width(soup)
    for tag in soup.find_all("img"):
        if tag.get("alt") == "Story illustration":
            locsrc = tag.get("src")
            tag.attrs["src"] = "https://nhkeasier.com" + locsrc
        elif tag.get("title") is None:
            pass
        elif "furigana" in tag.get("title"):
            tag.replace_with("")
    for tag in soup.find_all("audio"):
        test = tag.get("src")
        tag.attrs["src"] = "https://nhkeasier.com" + test
        tag.attrs["preload"] = "auto"
    for tag in soup.find_all("p"):
        tag.string = tag.text.replace(" ", "")
    teststr = soup.prettify()
    teststr = teststr.replace("\n", "")
    teststr = teststr.replace(" ", "")
    teststr = teststr.replace(
        '<h1> <a href="/">NHK News WebEasier </a> </h1>',
        '<h2> <a href="https://nhkeasier.com/">NHK News WebEasier </a> </h2>',
    )
    teststr = teststr.replace("<h2> Single Story </h2>", "")
    teststr = teststr.replace("<link/>", "")
    soup = BeautifulSoup(teststr, "lxml")
    with open(path + "\\styling.txt", "r", encoding="utf-8") as file:
        styletag = file.read()
    soup = hpre.add_style(soup, styletag)
    try:
        soup.img.insert_before(soup.audio)
    except AttributeError:
        pass
    # TODO: change this layout; archive/story_xxxx would be better
    os.mkdir(nhkdir + story_dir_prefix + str(story))
    with open(nhkdir + story_dir_prefix + str(story) + "\\story.html",
              "w", encoding="utf-8") as wr:
        wr.write(soup.prettify())
    if DO_KANJI_ANALYZE:
        path_ankipanda = os.path.expanduser("~") + "\\ankipandas_words.txt"
        if os.path.isfile(path_ankipanda):
            with open(path_ankipanda, "r", encoding="utf-8") as file:
                known_words = file.read()
        else:
            with open("known_words.txt", "r", encoding="utf-8") as file:
                known_words = file.read()
            with open("known_supplement.txt", "r", encoding="utf-8") as file:
                known_words2 = file.read()
            known_words = known_words + "\n" + known_words2
        tagger = fugashi.Tagger()
        known_words = kana.markup_known_words(known_words)
        known_kanji = kana.get_unique_kanji(known_words)
        booktml = soup.prettify()
        cleaned_book = kana.markup_book_html(booktml)
        token_words = [word.surface for word in tagger(cleaned_book)]
        uniq_words = kana.get_unique_token_words(token_words)
        booktml, kanjiwords, lemmawords, unknown_words = kana.mark_known_words_sbl(
            booktml, uniq_words, known_words, tagger)
        booktml = kana.mark_kanjiwords(booktml, kanjiwords, known_words)
        booktml = kana.mark_lemmawords(booktml, lemmawords, known_words)
        booktml = kana.mark_known_kanji(booktml, known_kanji)
        uniq_kanji = kana.get_unique_kanji(uniq_words)
        unknown_kanji = uniq_kanji.difference(known_kanji)
        booktml = kana.mark_unknown_kanji(booktml, unknown_kanji)
        with open(
                nhkdir + story_dir_prefix + str(story) + "\\story_marked.html",
                "w",
                encoding="utf-8",
        ) as wr:
            wr.write(booktml)
        freq_list, unknown_freq_list = kana.get_freq_lists(
            token_words, unknown_words)
        with open(
                nhkdir + story_dir_prefix + str(story) + "\\story_freq.txt",
                "w",
                encoding="utf-8",
        ) as wr:
            for w, f in freq_list:
                wr.write(f"{w}, {f}\n")
        with open(
                nhkdir + story_dir_prefix + str(story) + "\\story_unknown_freq.txt",
                "w",
                encoding="utf-8",
        ) as wr:
            for w, f in unknown_freq_list:
                wr.write(f"{w}, {f}\n")
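# Hedged usage sketch, not part of the original flow: download_story fetches a
# single article, so a driver loop roughly like the one below is assumed.
# download_story_range, the skip-if-already-archived check and the polite
# sleep are illustrative choices, not taken from the source.
def download_story_range(first_id, last_id):
    import time  # only needed for the polite delay below
    for story in range(first_id, last_id + 1):
        # skip stories that were already archived in a previous run
        if os.path.isdir(nhkdir + story_dir_prefix + str(story)):
            continue
        try:
            download_story(story)
        except requests.RequestException as err:
            print(f"story {story} failed: {err}")
        time.sleep(1)  # be gentle with the server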
b = sum(k < thresh for k in token_counter.values())
a = sum(k for k in token_counter.values() if k < thresh)
b20 = sum(k < 21 for k in token_counter.values())
b5 = sum(k < 6 for k in token_counter.values())
print(f"Threshold for 20 Occurrences is position {len(token_counter)-b20}")
print(f"Threshold for 10 Occurrences is position {len(token_counter)-b}")
print(f"Threshold for 5 Occurrences is position {len(token_counter)-b5}")
print(f"There are {b} terms which appear {thresh-1} times or less.")
perc = 100 * a / (len(token_flat))
print(f"Together they appear {a} times, making up {perc:.3f}% of the corpus.")
wordcount = 100 / perc
print(f"For every {int(wordcount)} terms there is one of them.")
avg_novel = len(token_flat) / len(filelist)
print(f"On average this results in {int(avg_novel/wordcount)} occurrences per novel.")
print(
    "Hint: due to how the program currently counts, this is a very rough overestimate."
)
# Compare the obtained corpus to the known words and create
# a frequency txt containing just the unknowns.
if os.path.isfile(kw_path):
    with open(kw_path, "r", encoding="utf-8") as file:
        known_words = file.read()
    known_words = kana.markup_known_words(known_words)
    with open(f"{yomidir}{title}_unknown_freq.txt", "w", encoding="utf-8") as wr:
        for w, f in token_counter.most_common():
            if w not in known_words and not kana.contains_lemma(w, known_words, tagger):
                wr.write(f"{w}, {f}\n")
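# Hedged worked example of the coverage numbers above (figures invented):
# if the rare terms make up perc = 2.0% of the corpus, then
# wordcount = 100 / 2.0 = 50, i.e. roughly one such term per 50 tokens, and a
# novel of 100,000 tokens would contain about 100000 / 50 = 2000 of them.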
def main(no_kanjigrid, user):
    try:
        import fugashi

        tagger = fugashi.Tagger()
        EXTRA = True
    except ModuleNotFoundError:
        EXTRA = False
    __loc__ = os.path.dirname(os.path.realpath(__file__))
    DISREGARD_OLD_KNOWN = False
    ADD_NX_SUP = False
    CREATE_KANJIGRID = not no_kanjigrid
    COUNT_NEW_LEECHES = True
    write_to_file_text = ""
    col = Collection(user=user)
    notes = col.cards.merge_notes()
    path = __loc__ + "\\resources"
    kw_path = path + "\\.known_words.txt"
    if os.path.isfile(path + "\\.previous.txt"):
        with open(path + "\\.previous.txt", "r", encoding="utf-8") as file:
            print("Previous known words:")
            print(file.read())
        print("_" * 50 + "\n" * 2)
        print("Current known words:")
    with open(path + "\\anki_cards.txt", "r") as file:
        card_list = file.read().splitlines()
    words = []
    for cards in card_list:
        # each line has the form "<note model>:<field number>"
        card, field = cards.split(":")
        field = int(field)
        selection = notes.query(
            f"nmodel == '{card}' and cqueue == 'due' "
            f"or nmodel == '{card}' and cqueue == 'suspended'"
        )
        sellist = selection["nflds"].tolist()
        if COUNT_NEW_LEECHES:
            mask = notes.ntags.apply(lambda x: "leech" in x)
            leech_sel = notes[mask]
            sel = leech_sel.query(f"nmodel == '{card}' and cqueue == 'new'")
            sellist.extend(sel["nflds"].tolist())
        print(f"card model {card} found:")
        write_to_file_text += f"card model {card} found:" + "\n"
        print(len(sellist))
        write_to_file_text += str(len(sellist)) + "\n"
        for w in sellist:
            if not kana.is_single_kana(w[field - 1]):
                words.append(w[field - 1])
    uniq_w = set(words)
    # For a better representation of what is actually known it would probably
    # be better to do this right before any processing rather than now, which
    # just inflates the numbers. (21.01: still unsure about this.)
    if EXTRA:
        extra = set()
        for w in uniq_w:
            w = kana.markup_book_html(w)
            tags = [
                word.feature.lemma if word.feature.lemma else word.surface
                for word in tagger(w)
            ]
            tags = [
                kana.clean_lemma(token)
                for token in tags
                if not kana.is_single_kana(token)
            ]
            tags = kana.get_unique_token_words(tags)
            extra.update(tags)
        uniq_w.update(extra)
    if not DISREGARD_OLD_KNOWN:
        if os.path.isfile(kw_path):
            with open(kw_path, "r", encoding="utf-8") as file:
                previous_known = file.read().splitlines()
            previous_known = [
                word
                for word in previous_known
                if not kana.is_single_kana(word) and word
            ]
            uniq_w.update(previous_known)
    if ADD_NX_SUP:
        nx_sup = []
        for i in range(1, 6):
            if os.path.isfile("n" + str(i) + ".txt"):
                with open("n" + str(i) + ".txt", "r", encoding="utf-8") as file:
                    nx_sup.extend(list(file.read().split("\n")))
        uniq_w.update(nx_sup)
    muniq = {w for w in kana.markup_known_words("\n".join(uniq_w)) if w != ""}
    muniq = list(muniq)
    muniq.sort()
    uniqK = kana.get_unique_kanji(muniq)
    print(f"found a total of {len(muniq)} words")
    print(f"with a total of {len(uniqK)} unique kanji")
    write_to_file_text += f"found a total of {len(muniq)} words" + "\n"
    write_to_file_text += f"with a total of {len(uniqK)} unique kanji" + "\n"
    with open(kw_path, "w", encoding="utf-8") as wr:
        wr.write("\n".join(muniq))
    with open(path + "\\.previous.txt", "w", encoding="utf-8") as wr:
        wr.write(write_to_file_text)
    # current_date and current_time are expected to be set at module level
    add_data = [
        {
            "Date": current_date,
            "Time": current_time,
            "Words": len(muniq),
            "Kanji": len(uniqK),
        }
    ]
    if os.path.isfile(path + "\\.progress.csv"):
        prog_df = pd.read_csv(path + "\\.progress.csv", index_col=0)
        # DataFrame.append was removed in pandas 2.0; build a frame and concat instead
        prog_df = pd.concat(
            [prog_df, pd.DataFrame(add_data)], ignore_index=True, sort=False
        )
        prog_df.to_csv(path + "\\.progress.csv", index_label="Index")
    else:
        prog_df = pd.DataFrame(add_data)
        prog_df.to_csv(path + "\\.progress.csv", index_label="Index")
    if CREATE_KANJIGRID:
        kj.main()
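# Hedged entry-point sketch: main() takes the two flags used above, so a CLI
# wrapper along these lines is assumed. The argument names mirror the
# parameters; the default profile name "User 1" is only an example, not taken
# from the source.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="update the known-words list from Anki"
    )
    parser.add_argument("--no-kanjigrid", action="store_true",
                        help="skip regenerating the kanji grid")
    parser.add_argument("--user", default="User 1",
                        help="Anki profile to read")
    args = parser.parse_args()
    main(args.no_kanjigrid, args.user)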