def personal_report(bookdir, subsdir):
    # The tokenizer makes a lot of mistakes; to deal with some of them I use a
    # .prignore.txt listing words that come out of wrong lemmatization,
    # e.g. 二郎 sometimes gets lemmatized to 次郎 instead of ジロウ.
    # Since I don't have time to think of a proper solution, this is a hotfix.
    # Only the first whitespace-separated token of each line is used.
    ignoset = set()
    if os.path.isfile(".prignore.txt"):
        with open(".prignore.txt", "r", encoding="utf-8") as file:
            ignoset = {
                line.split()[0] for line in file.read().splitlines() if line.strip()
            }
    bookset = {
        f.name
        for f in os.scandir(bookdir) if f.is_dir() and f.name[0] != "$"
        and os.path.isfile(f"{bookdir}/{f.name}/read.txt")
    }
    readlist = [
        f.path for f in os.scandir(f"{bookdir}/$_report")
        if os.path.splitext(f.name)[0] in bookset
    ]
    sublist = [
        f.path for f in os.scandir(f"{subsdir}/$_report") if f.name[0] != "$"
    ]
    total_counter = Counter()
    reference_dict = dict()
    for subf in sublist:
        if os.path.isfile(subf):
            subfname = f"{os.path.splitext(os.path.basename(subf))[0]}.txt"
            with zipfile.ZipFile(subf) as myzip:
                with myzip.open(subfname) as file:
                    rtxt = file.read().decode("utf-8").splitlines()
            rdict = get_rdict(rtxt)
            total_counter += Counter(rdict)
            for key in rdict.keys():
                if key in reference_dict:
                    reference_dict[key] += f", {subfname}"
                else:
                    reference_dict[key] = subfname
    for book in readlist:
        if os.path.isfile(book):
            with open(book, "r", encoding="utf-8") as file:
                rtxt = file.read().splitlines()
            rdict = get_rdict(rtxt)
            total_counter += Counter(rdict)
            for key in rdict.keys():
                if key in reference_dict:
                    reference_dict[key] += f", {os.path.basename(book)}"
                else:
                    reference_dict[key] = f"{os.path.basename(book)}"
    if os.path.isfile(kw_path):
        with open(kw_path, "r", encoding="utf-8") as file:
            known_words = file.read()
        known_words = kana.markup_known_words(known_words)
    else:
        known_words = set()
    counterstr = ""
    for k, v in total_counter.most_common():
        if (k not in known_words and k not in ignoset
                and not kana.is_katakana(k)):
            counterstr += f"{k}, {v}, {reference_dict[k]}\n"
    with open("$PersonalReport.csv", "w", encoding="utf-8") as wr:
        wr.write(counterstr)
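
get_rdict is called above but not defined in this excerpt. Judging by how the other snippets write their frequency reports (one "word, count" pair per line, see story_freq.txt further down), a minimal sketch could look like the following; the exact format handling is an assumption, not the project's actual implementation.

def get_rdict(lines):
    # Hypothetical helper: parse "word, count" report lines into a dict
    # suitable for feeding into collections.Counter.
    rdict = {}
    for line in lines:
        parts = line.split(",")
        if len(parts) < 2:
            continue
        word = parts[0].strip()
        try:
            count = int(parts[1].strip())
        except ValueError:
            # skip malformed lines instead of aborting the whole report
            continue
        rdict[word] = count
    return rdict
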
Example #2
nhkdir = __loc__ + "\\nhkeasier_archive"
sorted_path = nhkdir + "\\$_sorted_by_kanji"
story_dir_prefix = "\\Story_"
tagger = fugashi.Tagger()

if os.path.isdir(sorted_path):
    shutil.rmtree(sorted_path)
os.mkdir(sorted_path)

kw_path = path + "\\.known_words.txt"
known_words = ""
if os.path.isfile(kw_path):
    with open(kw_path, "r", encoding="utf-8") as file:
        known_words = file.read()

known_words = list(kana.markup_known_words(known_words))
known_kanji = kana.get_unique_kanji(known_words)

subfolders = [
    f.name for f in os.scandir(nhkdir) if f.is_dir() and f.name[0] != "$"
]

if IGNORE_CITIES:
    with open(path + "\\citynames.txt", "r", encoding="utf-8") as file:
        cities = file.read()
    cities = kana.markup_known_words(cities)
    cities = kana.get_unique_kanji(cities)

for article in tqdm(subfolders,
                    ascii=True,
                    desc="sorting the articles",
Example #3
def download_story(story):
    response = requests.get(nhkeasy_prefix + str(story))

    soup = BeautifulSoup(response.text, "lxml")
    soup = hpre.delete_tags(soup, blacklist)
    soup = hpre.strip_tags_and_font(soup, whitelist)
    for tag in soup.find_all("ruby"):
        tag.unwrap()
    soup = hpre.pop_img_width(soup)

    for tag in soup.find_all("img"):
        if tag.get("alt") == "Story illustration":
            locsrc = tag.get("src")
            tag.attrs["src"] = "https://nhkeasier.com" + locsrc
        elif tag.get("title") is None:
            pass
        elif "furigana" in tag.get("title"):
            tag.replace_with("")

    for tag in soup.find_all("audio"):
        test = tag.get("src")
        tag.attrs["src"] = "https://nhkeasier.com" + test
        tag.attrs["preload"] = "auto"

    for tag in soup.find_all("p"):
        tag.string = tag.text.replace(" ", "")

    teststr = soup.prettify()
    teststr = teststr.replace("\n", "")
    teststr = teststr.replace("     ", "")
    teststr = teststr.replace(
        '<h1>    <a href="/">NHK News WebEasier    </a>   </h1>',
        '<h2>    <a href="https://nhkeasier.com/">NHK News WebEasier    </a>   </h2>',
    )
    teststr = teststr.replace("<h2>    Single Story   </h2>", "")
    teststr = teststr.replace("<link/>", "")

    soup = BeautifulSoup(teststr, "lxml")
    with open(path + "\\styling.txt", "r", encoding="utf-8") as file:
        styletag = file.read()
    soup = hpre.add_style(soup, styletag)
    try:
        soup.img.insert_before(soup.audio)
    except AttributeError:
        pass
    # TODO: change this; archive/story_xxxx would be a better layout
    os.mkdir(nhkdir + story_dir_prefix + str(story))
    with open(nhkdir + story_dir_prefix + str(story) + "\\story.html",
              "w",
              encoding="utf-8") as wr:
        wr.write(soup.prettify())

    if DO_KANJI_ANALYZE:
        path_ankipanda = os.path.expanduser("~") + "\\ankipandas_words.txt"
        if os.path.isfile(path_ankipanda):
            with open(path_ankipanda, "r", encoding="utf-8") as file:
                known_words = file.read()
        else:
            with open("known_words.txt", "r", encoding="utf-8") as file:
                known_words = file.read()
        with open("known_supplement.txt", "r", encoding="utf-8") as file:
            known_words2 = file.read()
        known_words = known_words + "\n" + known_words2

        tagger = fugashi.Tagger()

        known_words = kana.markup_known_words(known_words)
        known_kanji = kana.get_unique_kanji(known_words)

        booktml = soup.prettify()
        cleaned_book = kana.markup_book_html(booktml)
        token_words = [word.surface for word in tagger(cleaned_book)]
        uniq_words = kana.get_unique_token_words(token_words)
        booktml, kanjiwords, lemmawords, unknown_words = kana.mark_known_words_sbl(
            booktml, uniq_words, known_words, tagger)
        booktml = kana.mark_kanjiwords(booktml, kanjiwords, known_words)
        booktml = kana.mark_lemmawords(booktml, lemmawords, known_words)
        booktml = kana.mark_known_kanji(booktml, known_kanji)

        uniq_kanji = kana.get_unique_kanji(uniq_words)
        unknown_kanji = uniq_kanji.difference(known_kanji)
        booktml = kana.mark_unknown_kanji(booktml, unknown_kanji)

        with open(
                nhkdir + story_dir_prefix + str(story) + "\\story_marked.html",
                "w",
                encoding="utf-8",
        ) as wr:
            wr.write(booktml)
        freq_list, unknown_freq_list = kana.get_freq_lists(
            token_words, unknown_words)
        with open(
                nhkdir + story_dir_prefix + str(story) + "\\story_freq.txt",
                "w",
                encoding="utf-8",
        ) as wr:
            for w, f in freq_list:
                wr.write(f"{w}, {f}\n")
        with open(
                nhkdir + story_dir_prefix + str(story) +
                "\\story_unknown_freq.txt",
                "w",
                encoding="utf-8",
        ) as wr:
            for w, f in unknown_freq_list:
                wr.write(f"{w}, {f}\n")
# b: number of terms that appear fewer than `thresh` times
# a: total number of occurrences contributed by those rare terms
# b20/b5: the same count for thresholds of 20 and 5 occurrences
b = sum(k < thresh for k in token_counter.values())
a = sum(k for k in token_counter.values() if k < thresh)
b20 = sum(k < 21 for k in token_counter.values())
b5 = sum(k < 6 for k in token_counter.values())

print(f"Threshold for 20 Occurences is position {len(token_counter)-b20}")
print(f"Threshold for 10 Occurences is position {len(token_counter)-b}")
print(f"Threshold for 5 Occurences is position {len(token_counter)-b5}")

print(f"There's {b} terms which appear {thresh-1} times or less.")
perc = 100 * a / (len(token_flat))
print(f"Together they appear {a} times making up {perc:.3f}% of the corpus.")
wordcount = 100 / perc
print(f"For every {int(wordcount)} terms there is one of them.")
avg_novel = len(token_flat) / len(filelist)
print(f"On average this results in {int(avg_novel/wordcount)} occurrences per novel.")
print(
    "Hint: Due to how the program currently counts, this is a very rough overestimate."
)
# compare the obtained corpus to the known words and create
# a frequency txt containing just the unknowns
if os.path.isfile(kw_path):
    with open(kw_path, "r", encoding="utf-8") as file:
        known_words = file.read()
    known_words = kana.markup_known_words(known_words)

    with open(f"{yomidir}{title}_unknown_freq.txt", "w", encoding="utf-8") as wr:
        for w, f in token_counter.most_common():
            if w not in known_words and not kana.contains_lemma(w, known_words, tagger):
                wr.write(f"{w}, {f}\n")
def main(no_kanjigrid, user):
    try:
        import fugashi

        tagger = fugashi.Tagger()
        EXTRA = True
    except ModuleNotFoundError:
        EXTRA = False

    __loc__ = os.path.dirname(os.path.realpath(__file__))
    DISREGARD_OLD_KNOWN = False
    ADD_NX_SUP = False
    CREATE_KANJIGRID = not no_kanjigrid
    COUNT_NEW_LEECHES = True

    write_to_file_text = ""

    col = Collection(user=user)
    notes = col.cards.merge_notes()

    path = __loc__ + "\\resources"
    kw_path = path + "\\.known_words.txt"

    if os.path.isfile(path + "\\.previous.txt"):
        with open(path + "\\.previous.txt", "r", encoding="utf-8") as file:
            print("Previous known words:")
            print(file.read())
            print("_" * 50 + "\n" * 2)
            print("Current known words:")

    with open(path + "\\anki_cards.txt", "r") as file:
        card_list = file.read().splitlines()

    words = []
    for cards in card_list:
        card, field = cards.split(":")
        field = int(field)
        selection = notes.query(
            f"nmodel == '{card}' and cqueue == 'due' "
            f"or nmodel == '{card}' and cqueue == 'suspended'"
        )
        sellist = selection["nflds"].tolist()
        if COUNT_NEW_LEECHES:
            mask = notes.ntags.apply(lambda x: "leech" in x)
            leech_sel = notes[mask]
            sel = leech_sel.query(f"nmodel == '{card}' and cqueue == 'new'")
            sellist.extend(sel["nflds"].tolist())
        print(f"card model {card} found:")
        write_to_file_text = write_to_file_text + f"card model {card} found:" + "\n"
        print(len(sellist))
        write_to_file_text = write_to_file_text + str(len(sellist)) + "\n"
        for w in sellist:
            if not kana.is_single_kana(w[field - 1]):
                words.append(w[field - 1])

    uniq_w = set(words)

    # For a better representation of what I actually know,
    # it would probably be better to do this right before any processing
    # and not now, which just inflates the numbers.
    # 21.01: still unsure about this
    if EXTRA:
        extra = set()
        for w in uniq_w:
            w = kana.markup_book_html(w)
            tags = [
                word.feature.lemma if word.feature.lemma else word.surface
                for word in tagger(w)
            ]
            tags = [
                kana.clean_lemma(token)
                for token in tags
                if not kana.is_single_kana(token)
            ]
            tags = kana.get_unique_token_words(tags)
            extra.update(tags)

        uniq_w.update(extra)

    if not DISREGARD_OLD_KNOWN:
        if os.path.isfile(kw_path):
            with open(kw_path, "r", encoding="utf-8") as file:
                previous_known = file.read().splitlines()
            previous_known = [
                word
                for word in previous_known
                if not kana.is_single_kana(word) and word
            ]
            uniq_w.update(previous_known)

    if ADD_NX_SUP:
        nx_sup = []
        for i in range(1, 6):
            if os.path.isfile("n" + str(i) + ".txt"):
                with open("n" + str(i) + ".txt", "r", encoding="utf-8") as file:
                    nx_sup.extend(file.read().splitlines())
        uniq_w.update(nx_sup)

    muniq = {w for w in kana.markup_known_words("\n".join(uniq_w)) if w != ""}
    muniq = list(muniq)
    muniq.sort()

    uniqK = kana.get_unique_kanji(muniq)

    print(f"found a total of {len(muniq)} words")
    print(f"with a total of {len(uniqK)} unique kanji")
    write_to_file_text = (
        write_to_file_text + f"found a total of {len(muniq)} words" + "\n"
    )
    write_to_file_text = (
        write_to_file_text + f"with a total of {len(uniqK)} unique kanji" + "\n"
    )

    with open(kw_path, "w", encoding="utf-8") as wr:
        wr.write("\n".join(muniq))

    with open(path + "\\.previous.txt", "w", encoding="utf-8") as wr:
        wr.write(write_to_file_text)

    add_data = [
        {
            "Date": current_date,
            "Time": current_time,
            "Words": len(muniq),
            "Kanji": len(uniqK),
        }
    ]
    if os.path.isfile(path + "\\.progress.csv"):
        prog_df = pd.read_csv(path + "\\.progress.csv", index_col=0)
        prog_df = prog_df.append(add_data, ignore_index=True, sort=False)
        prog_df.to_csv(path + "\\.progress.csv", index_label="Index")
    else:
        prog_df = pd.DataFrame(add_data)
        prog_df.to_csv(path + "\\.progress.csv", index_label="Index")

    if CREATE_KANJIGRID:
        kj.main()
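
The wiring that supplies no_kanjigrid and user to main is not part of this excerpt. A typical argparse entry point might look like the sketch below; the flag names and the default profile name are assumptions, not the project's actual CLI.

if __name__ == "__main__":
    import argparse

    # Hypothetical entry point; flag names and defaults are assumptions.
    parser = argparse.ArgumentParser(
        description="Update the known-words list from an Anki collection")
    parser.add_argument("--no-kanjigrid", action="store_true",
                        help="skip regenerating the kanji grid")
    parser.add_argument("--user", default="User 1",
                        help="Anki profile name passed to ankipandas.Collection")
    args = parser.parse_args()
    main(args.no_kanjigrid, args.user)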