Example #1
sorted_path = nhkdir + "\\$_sorted_by_kanji"
story_dir_prefix = "\\Story_"
tagger = fugashi.Tagger()

if os.path.isdir(sorted_path):
    shutil.rmtree(sorted_path)
os.mkdir(sorted_path)

kw_path = path + "\\.known_words.txt"
if os.path.isfile(kw_path):
    with open(kw_path, "r", encoding="utf-8") as file:
        # print("success")
        known_words = file.read()

known_words = list(kana.markup_known_words(known_words))
known_kanji = kana.get_unique_kanji(known_words)

subfolders = [
    f.name for f in os.scandir(nhkdir) if f.is_dir() and f.name[0] != "$"
]

if IGNORE_CITIES:
    with open(path + "\\citynames.txt", "r", encoding="utf-8") as file:
        cities = file.read()
    cities = kana.markup_known_words(cities)
    cities = kana.get_unique_kanji(cities)

for article in tqdm(subfolders,
                    ascii=True,
                    desc="sorting the articles",
                    ncols=100):
    pass  # per-article sorting body omitted in this excerpt
Example #2
def download_story(story):
    response = requests.get(nhkeasy_prefix + str(story))

    soup = BeautifulSoup(response.text, "lxml")
    soup = hpre.delete_tags(soup, blacklist)
    soup = hpre.strip_tags_and_font(soup, whitelist)
    for tag in soup.find_all("ruby"):
        tag.unwrap()
    soup = hpre.pop_img_width(soup)

    for tag in soup.find_all("img"):
        if tag.get("alt") == "Story illustration":
            locsrc = tag.get("src")
            tag.attrs["src"] = "https://nhkeasier.com" + locsrc
        elif tag.get("title") is None:
            pass
        elif "furigana" in tag.get("title"):
            tag.replace_with("")

    for tag in soup.find_all("audio"):
        test = tag.get("src")
        tag.attrs["src"] = "https://nhkeasier.com" + test
        tag.attrs["preload"] = "auto"

    for tag in soup.find_all("p"):
        tag.string = tag.text.replace(" ", "")

    teststr = soup.prettify()
    teststr = teststr.replace("\n", "")
    teststr = teststr.replace("     ", "")
    teststr = teststr.replace(
        '<h1>    <a href="/">NHK News WebEasier    </a>   </h1>',
        '<h2>    <a href="https://nhkeasier.com/">NHK News WebEasier    </a>   </h2>',
    )
    teststr = teststr.replace("<h2>    Single Story   </h2>", "")
    teststr = teststr.replace("<link/>", "")

    soup = BeautifulSoup(teststr, "lxml")
    with open(path + "\\styling.txt", "r", encoding="utf-8") as file:
        styletag = file.read()
    soup = hpre.add_style(soup, styletag)
    try:
        soup.img.insert_before(soup.audio)
    except AttributeError:
        pass
    # change this: archive/story_xxxx is a better layout
    # (see the sketch after this function)
    os.mkdir(nhkdir + story_dir_prefix + str(story))
    with open(nhkdir + story_dir_prefix + str(story) + "\\story.html",
              "w",
              encoding="utf-8") as wr:
        wr.write(soup.prettify())

    if DO_KANJI_ANALYZE:
        path_ankipanda = os.path.expanduser("~") + "\\ankipandas_words.txt"
        if os.path.isfile(path_ankipanda):
            with open(path_ankipanda, "r", encoding="utf-8") as file:
                known_words = file.read()
        else:
            with open("known_words.txt", "r", encoding="utf-8") as file:
                known_words = file.read()
        with open("known_supplement.txt", "r", encoding="utf-8") as file:
            known_words2 = file.read()
        known_words = known_words + "\n" + known_words2

        tagger = fugashi.Tagger()

        known_words = kana.markup_known_words(known_words)
        known_kanji = kana.get_unique_kanji(known_words)

        booktml = soup.prettify()
        cleaned_book = kana.markup_book_html(booktml)
        token_words = [word.surface for word in tagger(cleaned_book)]
        uniq_words = kana.get_unique_token_words(token_words)
        booktml, kanjiwords, lemmawords, unknown_words = kana.mark_known_words_sbl(
            booktml, uniq_words, known_words, tagger)
        booktml = kana.mark_kanjiwords(booktml, kanjiwords, known_words)
        booktml = kana.mark_lemmawords(booktml, lemmawords, known_words)
        booktml = kana.mark_known_kanji(booktml, known_kanji)

        uniq_kanji = kana.get_unique_kanji(uniq_words)
        unknown_kanji = uniq_kanji.difference(known_kanji)
        booktml = kana.mark_unknown_kanji(booktml, unknown_kanji)

        with open(
                nhkdir + story_dir_prefix + str(story) + "\\story_marked.html",
                "w",
                encoding="utf-8",
        ) as wr:
            wr.write(booktml)
        freq_list, unknown_freq_list = kana.get_freq_lists(
            token_words, unknown_words)
        with open(
                nhkdir + story_dir_prefix + str(story) + "\\story_freq.txt",
                "w",
                encoding="utf-8",
        ) as wr:
            for w, f in freq_list:
                wr.write(f"{w}, {f}\n")
        with open(
                nhkdir + story_dir_prefix + str(story) +
                "\\story_unknown_freq.txt",
                "w",
                encoding="utf-8",
        ) as wr:
            for w, f in unknown_freq_list:
                wr.write(f"{w}, {f}\n")
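
# A minimal sketch, not part of the original script, of the layout change noted in the
# comment inside download_story: keeping each story under an "archive" folder. The
# helper name and the folder names here are assumptions.
def story_dir(story):
    archive_dir = os.path.join(nhkdir, "archive", f"story_{story}")
    os.makedirs(archive_dir, exist_ok=True)
    return archive_dir
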
def main(print_unknown_kanji):
    __loc__ = os.path.dirname(os.path.realpath(__file__))
    path = __loc__ + "\\resources"
    kw_path = path + "\\.known_words.txt"
    notopath = (
        os.path.expanduser("~") +
        r"\AppData\Local\Microsoft\Windows\Fonts\NotoSansJP-Regular.otf")

    try:
        fontjp = ImageFont.truetype(notopath, size=KANJISIZE)
    except OSError:
        try:
            # fontjp = ImageFont.truetype('msgothic.ttc', size=KANJISIZE)
            fontjp = ImageFont.truetype("YuGothM.ttc", size=KANJISIZE)
            # fontjp = ImageFont.truetype('msmincho.ttc', size=KANJISIZE)
        except OSError:
            print("Unlucky")
    try:
        headerfont = ImageFont.truetype("cambria.ttc", size=HEADERFONTSIZE)
    except OSError:
        print("Unlucky")
    print("Used Font: " + fontjp.font.family)
    with open(kw_path, "r", encoding="utf-8") as file:
        known_words = file.read()
    kanjistring = kana.remove_non_kanji(known_words)
    kanji_freq = Counter(kanjistring)
    known_kanji = kana.get_unique_kanji(known_words)

    all_kanji_in_grading = set()

    columns = COLUMNS
    tempimg = Image.new("RGB", (fontjp.size * columns, 0))
    for grade in grades:
        with open(path + "/Grade_" + grade + ".txt", "r",
                  encoding="utf-8") as file:
            grade_list = file.read().split("\n")
        all_kanji_in_grading.update(grade_list)
        grid = get_kanji_grid_wcounter(grade_list, kanji_freq, fontjp, columns)
        # now generate header for the grade
        gradestring = f"Grade {grade}"
        head = get_header(gradestring, headerfont, headerfont.size, columns)
        concat = get_vert_cat(head, grid)
        tempimg = get_vert_cat(tempimg, concat)
        known_kanji.difference_update(set(grade_list))

    if known_kanji:
        gradestring = "Known Kanji outside the Grading"
        head = get_header(gradestring, headerfont, headerfont.size, columns)
        kanjilist = list(known_kanji)
        kanjilist.sort()
        grid = get_kanji_grid_wcounter(kanjilist, kanji_freq, fontjp, columns)
        concat = get_vert_cat(head, grid)
        tempimg = get_vert_cat(tempimg, concat)
        known_kanji = kana.get_unique_kanji(known_words)
        unknown_kanji = all_kanji_in_grading.difference(known_kanji)
        total = len(all_kanji_in_grading)
        test = get_stats(kanji_freq, unknown_kanji, total, headerfont,
                         headerfont.size, columns)
        tempimg = get_vert_cat(tempimg, test)

    # add side padding
    pad_sides = Image.new("RGB", (KANJISIZE, tempimg.height), color="#FFFFFF")
    tempimg = get_hori_cat(tempimg, pad_sides)
    tempimg = get_hori_cat(pad_sides, tempimg)

    now = datetime.now()
    current_time = now.strftime("_%H_%M")
    current_date = now.strftime("%Y_%m_%d")
    tempimg.save(path + f"/Kanjigrid_{current_date}{current_time}.png")

    if print_unknown_kanji:
        print(unknown_kanji)
for v in val:
    sel_ser = shorted_subfol[v - 1]
    cur_sel = [vol for vol in subfolders if sel_ser in vol]
    selected_list.extend(cur_sel)

filelist = [f"{lndir}{s}/{s}.html" for s in selected_list]


corpus = ""
for book in filelist:
    with open(book, "r", encoding="utf-8") as file:
        data = file.read()
    cleaned_data = kana.markup_book_html_rem_furigana(data)
    corpus += cleaned_data

uniq_kanji = kana.get_unique_kanji(corpus)

corpus = kana.reduce_new_lines(corpus)

# tagging the whole corpus takes a long time, depending on the corpus,
# and the list comps are also slow
# token_words = [[word.surface, word.feature.lemma] for word in tagger(testcorpus)]
# splitting the corpus and feeding it into the tagger does not increase speed,
# but nothing important gets lost that way, and fugashi takes up a lot of memory
# if the input is big (see the chunked sketch after this snippet)
token_flat = [
    feat
    for word in tagger(corpus)
    for feat in [word.surface, word.feature.lemma]
    if feat
]
# token_flat = [word for word in token_flat if word]
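
# A minimal sketch, not part of the original script, of the memory point in the comment
# above: tag the corpus line by line instead of in one call, so fugashi never holds the
# whole text at once. The helper name is an assumption.
def tokenize_in_chunks(text, tagger):
    """Yield surface forms and lemmas one line at a time to cap memory use."""
    for line in text.splitlines():
        if not line:
            continue
        for word in tagger(line):
            for feat in (word.surface, word.feature.lemma):
                if feat:
                    yield feat

# usage sketch: token_flat = list(tokenize_in_chunks(corpus, tagger))
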
def main(no_kanjigrid, user):
    try:
        import fugashi

        tagger = fugashi.Tagger()
        EXTRA = True
    except ModuleNotFoundError:
        EXTRA = False

    __loc__ = os.path.abspath("")
    __loc__ = os.path.dirname(os.path.realpath(__file__))
    DISREGARD_OLD_KNOWN = False
    ADD_NX_SUP = False
    CREATE_KANJIGRID = not no_kanjigrid
    COUNT_NEW_LEECHES = True

    write_to_file_text = ""

    col = Collection(user=user)
    notes = col.cards.merge_notes()

    path = __loc__ + "\\resources"
    kw_path = path + "\\.known_words.txt"

    if os.path.isfile(path + "\\.previous.txt"):
        with open(path + "\\.previous.txt", "r", encoding="utf-8") as file:
            print("Previous known words:")
            print(file.read())
            print("_" * 50 + "\n" * 2)
            print("Current known words:")

    with open(path + "\\anki_cards.txt", "r", encoding="utf-8") as file:
        card_list = file.read().splitlines()

    words = []
    for cards in card_list:
        card, field = cards.split(":")
        field = int(field)
        selection = notes.query(
            f"nmodel == '{card}' and cqueue == 'due' "
            f"or nmodel == '{card}' and cqueue == 'suspended'"
        )
        sellist = selection["nflds"].tolist()
        if COUNT_NEW_LEECHES:
            mask = notes.ntags.apply(lambda x: "leech" in x)
            leech_sel = notes[mask]
            sel = leech_sel.query(f"nmodel == '{card}' and cqueue == 'new'")
            sellist.extend(sel["nflds"].tolist())
        print(f"card model {card} found:")
        write_to_file_text = write_to_file_text + f"card model {card} found:" + "\n"
        print(len(sellist))
        write_to_file_text = write_to_file_text + str(len(sellist)) + "\n"
        for w in sellist:
            if not kana.is_single_kana(w[field - 1]):
                words.append(w[field - 1])

    uniq_w = set(words)

    # for a better representation of what I actually know
    # it would probably be better to do this right before any processing
    # and not now, which just inflates the numbers (see the sketch after this function)
    # 21.01 still unsure about this
    if EXTRA:
        extra = set()
        for w in uniq_w:
            w = kana.markup_book_html(w)
            tags = [
                word.feature.lemma if word.feature.lemma else word.surface
                for word in tagger(w)
            ]
            tags = [
                kana.clean_lemma(token)
                for token in tags
                if not kana.is_single_kana(token)
            ]
            tags = kana.get_unique_token_words(tags)
            extra.update(tags)

        uniq_w.update(extra)

    if not DISREGARD_OLD_KNOWN:
        if os.path.isfile(kw_path):
            with open(kw_path, "r", encoding="utf-8") as file:
                previous_known = file.read().splitlines()
                previous_known = [
                    word
                    for word in previous_known
                    if not kana.is_single_kana(word) and word
                ]
            uniq_w.update(previous_known)

    if ADD_NX_SUP:
        nx_sup = []
        for i in range(1, 6):
            if os.path.isfile("n" + str(i) + ".txt"):
                # print(i)
                with open("n" + str(i) + ".txt", "r", encoding="utf-8") as file:
                    # print(sum(1 for _ in file))
                    nx_sup.extend(list(file.read().split("\n")))

                uniq_w.update(nx_sup)

    muniq = {w for w in kana.markup_known_words("\n".join(uniq_w)) if w != ""}
    muniq = list(muniq)
    muniq.sort()

    uniqK = kana.get_unique_kanji(muniq)

    print(f"found a total of {len(muniq)} words")
    print(f"with a total of {len(uniqK)} unique kanji")
    write_to_file_text = (
        write_to_file_text + f"found a total of {len(muniq)} words" + "\n"
    )
    write_to_file_text = (
        write_to_file_text + f"with a total of {len(uniqK)} unique kanji" + "\n"
    )

    with open(kw_path, "w", encoding="utf-8") as wr:
        wr.write("\n".join(muniq))

    with open(path + "\\.previous.txt", "w", encoding="utf-8") as wr:
        wr.write(write_to_file_text)

    now = datetime.now()
    current_time = now.strftime("_%H_%M")
    current_date = now.strftime("%Y_%m_%d")
    add_data = [
        {
            "Date": current_date,
            "Time": current_time,
            "Words": len(muniq),
            "Kanji": len(uniqK),
        }
    ]
    if os.path.isfile(path + "\\.progress.csv"):
        prog_df = pd.read_csv(path + "\\.progress.csv", index_col=0)
        prog_df = pd.concat([prog_df, pd.DataFrame(add_data)],
                            ignore_index=True, sort=False)
        prog_df.to_csv(path + "\\.progress.csv", index_label="Index")
    else:
        prog_df = pd.DataFrame(add_data)
        prog_df.to_csv(path + "\\.progress.csv", index_label="Index")

    if CREATE_KANJIGRID:
        kj.main()
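
# A hedged sketch, not part of the original script, of the alternative mentioned in the
# comment inside main above: instead of adding lemma forms to the known-word list up
# front, expand a candidate word to its lemmas only at lookup time, so the word and
# kanji counts are not inflated. The helper name and signature are assumptions.
def is_known(word, known_words, tagger):
    """Return True if the word, or any lemma/surface form fugashi extracts from it, is known."""
    if word in known_words:
        return True
    for token in tagger(word):
        lemma = token.feature.lemma if token.feature.lemma else token.surface
        if lemma in known_words:
            return True
    return False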