# Example 1
def parse_block(hanzi, notes, expand, definition_link_function):
    """Tokenise a block of hanzi text and tally HSK levels.

    Returns (words, chars, hsk_word_count, hsk_char_count), where the two
    count dicts map HSK level (0-6, 0 = no level) to occurrence counts.
    Appends human-readable notes about ignored and unknown characters to
    `notes` (mutated in place).
    """
    hsk_word_count = {level: 0 for level in range(7)}
    hsk_char_count = {level: 0 for level in range(7)}
    words, chars = set(), set()
    ignore_chars, unknown_chars = set(), set()

    for ishanzi, text in mostlikelywordsallchars(hanzi, expand,
                                                 definition_link_function, notes):
        if not ishanzi:
            # Non-hanzi tokens are collected only for the "ignored" note.
            ignore_chars.add(text)
            continue
        words.add(text)
        hsk_word_count[get_hsk_word_level(text)] += 1
        for ch in text:
            chars.add(ch)
            hsk_char_count[get_hsk_char_level(ch)] += 1
            if ch not in char_freq:
                unknown_chars.add(ch)

    if ignore_chars:
        links = ", ".join(definition_link_function(h, "none", expand)
                          for h in ignore_chars)
        notes.append("Ignored characters: " + links)
    if unknown_chars:
        links = ", ".join(definition_link_function(h, "none", expand)
                          for h in unknown_chars)
        notes.append("Unknown characters: " + links)
    return words, chars, hsk_word_count, hsk_char_count
# Example 2
def init_radical_data():
    """Build the radical tables from the global char_freq / cc_radicalof data.

    Populates (mutates) three module-level dicts:
      radical_freq            -- total frequency of all chars under a radical
      hsk_radical_level       -- lowest non-zero HSK level seen for a radical
      radical_frequency_index -- 1-based rank of each radical by frequency
    """
    for char, freq in char_freq.items():
        if char not in cc_radicalof:
            continue
        radical = cc_radicalof[char]
        radical_freq[radical] = radical_freq.get(radical, 0) + freq

        # Candidate level: the radical's own HSK level, lowered to the
        # character's level when the character has one and it is lower.
        level = get_hsk_char_level(radical)
        char_level = get_hsk_char_level(char)
        if char_level != 0 and char_level < level:
            level = char_level

        # First sighting always records the level; afterwards only a
        # strictly lower non-zero level replaces the stored one.
        current = hsk_radical_level.get(radical)
        if current is None or (level != 0 and level < current):
            hsk_radical_level[radical] = level

    # Rank radicals by descending frequency (ties broken by radical,
    # descending, matching the original (freq, radical) tuple sort).
    ranked = sorted(radical_freq.items(),
                    key=lambda item: (item[1], item[0]), reverse=True)
    for rank, (radical, _freq) in enumerate(ranked, start=1):
        radical_frequency_index[radical] = rank
# Example 3
def parse_list(hanzi, notes, allownonhanzi, expand, definition_link_function):
    """Parse a newline-separated vocabulary list and tally HSK levels.

    Only the first whitespace-separated token of each line is treated as
    the word. Returns (words, chars, hsk_word_count, hsk_char_count);
    appends notes about ignored and unknown words/characters to `notes`
    (mutated in place). When `allownonhanzi` is true, every token is
    accepted and the unknown-word/char checks are skipped.
    """
    hsk_word_count = {level: 0 for level in range(7)}
    hsk_char_count = {level: 0 for level in range(7)}
    words, chars = set(), set()
    ignore_words, ignore_chars = set(), set()
    unknown_words, unknown_chars = set(), set()

    for line in hanzi.split("\n"):
        chunks = line.split()
        if not chunks:
            continue
        word = chunks[0]
        if allownonhanzi or all(char_is_ok(c) for c in word):
            words.add(word)
            hsk_word_count[get_hsk_word_level(word)] += 1
            if not allownonhanzi and word not in word_freq:
                unknown_words.add(word)
        else:
            ignore_words.add(word)
        # Characters are tallied individually even when the word as a
        # whole was ignored (matches the original behaviour).
        for char in word:
            if allownonhanzi or char_is_ok(char):
                chars.add(char)
                hsk_char_count[get_hsk_char_level(char)] += 1
                if not allownonhanzi and char not in char_freq:
                    unknown_chars.add(char)
            else:
                ignore_chars.add(char)

    def _note(label, items):
        # One note line listing a definition link for each collected item.
        if items:
            notes.append(label + ": " + ", ".join(
                definition_link_function(h, "none", expand) for h in items))

    _note("Ignored words", ignore_words)
    _note("Ignored characters", ignore_chars)
    _note("Unknown words", unknown_words)
    _note("Unknown characters", unknown_chars)
    return words, chars, hsk_word_count, hsk_char_count
# Example 4
def hanzi_definition_link(results, hanziword, hanzichar, colour, expand):
    r"""Append an HTML <a> link for one hanzi character to `results`.

    Keyword arguments:

    colour -- can be 'char', 'word', 'none', 'auto'
    """

    prefix = "expand=yes&" if expand == "yes" else ""
    # NOTE(review): the word is HTML-escaped *before* URL-quoting; this is
    # a no-op for pure hanzi but worth verifying for words containing &/<.
    url = "/cidian?{}q=".format(prefix) + urllib.parse.quote(html.escape(hanziword))
    title = get_linktext_hanzi_info(hanziword, hanzichar, 30)
    wordlevel = get_hsk_word_level(hanziword)
    charlevel = get_hsk_char_level(hanzichar)
    radicallevel = get_hsk_radical_level(hanzichar)

    # The class string may itself contain a deferred placeholder
    # ({3}/{4}/{5}) that the final .format() call below fills in with the
    # corresponding HSK level number.
    if colour in ("word", "auto") and wordlevel > 0:
        linkclass = "hsk{3}"
    elif colour in ("char", "auto") and charlevel > 0:
        linkclass = "hsk{4}"
    elif colour == "radical":
        linkclass = "hsk{5}"
    else:
        linkclass = "definition"

    template = '<a class="' + linkclass + '" href="{0}" title="{1}">{2}</a>'
    results.append(template.format(url, html.escape(title),
                                   html.escape(hanzichar),
                                   wordlevel, charlevel, radicallevel))