def parse_block(hanzi, notes, expand, definition_link_function):
    """Tokenise a block of text and tally its words and characters per HSK level.

    The text is segmented by ``mostlikelywordsallchars``, which is iterated as
    ``(ishanzi, text)`` pairs. A token flagged as hanzi is recorded as a word
    and each of its characters is recorded as a character; every occurrence
    (not just the first) increments the tally for the level reported by
    ``get_hsk_word_level`` / ``get_hsk_char_level``. Tokens not flagged as
    hanzi are collected and reported as ignored. Characters missing from
    ``char_freq`` are reported as unknown.

    Arguments:
    hanzi -- the text to analyse
    notes -- list of messages; "Ignored characters" / "Unknown characters"
             entries are appended in place when there is something to report
    expand -- forwarded to the tokeniser and to definition_link_function
    definition_link_function -- called as f(text, "none", expand); the
             returned strings are joined with ", " to build each note

    Returns a tuple ``(words, chars, hsk_word_count, hsk_char_count)``: two
    sets plus two dicts keyed 0..6 holding occurrence counts.
    """
    word_tally = dict.fromkeys(range(7), 0)
    char_tally = dict.fromkeys(range(7), 0)
    seen_words = set()
    seen_chars = set()
    skipped = set()
    unrecognised = set()

    def add_note(label, items):
        # An empty set produces no note at all.
        if items:
            links = [definition_link_function(item, "none", expand) for item in items]
            notes.append(label + ", ".join(links))

    for is_hanzi, token in mostlikelywordsallchars(hanzi, expand, definition_link_function, notes):
        if not is_hanzi:
            skipped.add(token)
            continue
        for ch in token:
            seen_chars.add(ch)
            char_tally[get_hsk_char_level(ch)] += 1
            if ch not in char_freq:
                unrecognised.add(ch)
        seen_words.add(token)
        word_tally[get_hsk_word_level(token)] += 1

    add_note("Ignored characters: ", skipped)
    add_note("Unknown characters: ", unrecognised)
    return seen_words, seen_chars, word_tally, char_tally
def init_radical_data():
    """Fill the module-level radical tables from ``char_freq`` and ``cc_radicalof``.

    For every character in ``char_freq`` that has an entry in ``cc_radicalof``:
      * its frequency is added to ``radical_freq`` for that radical;
      * ``hsk_radical_level`` for the radical is set on first sight, and
        afterwards only replaced by a lower, non-zero level.

    Finally ``radical_frequency_index`` maps each radical to its 1-based rank
    when radicals are ordered by descending accumulated frequency.

    Takes no arguments and returns nothing; all results are written into the
    tables named above.
    """
    for hanzi_char, char_count in char_freq.items():
        if hanzi_char not in cc_radicalof:
            continue
        rad = cc_radicalof[hanzi_char]

        radical_freq.setdefault(rad, 0)
        radical_freq[rad] += char_count

        # Candidate level: the radical's own level, unless this character has
        # a non-zero level that is lower.
        # NOTE(review): when the radical's own level is 0, no non-zero
        # character level can be lower than it, so such radicals stay at 0.
        # Confirm this is intended.
        level = get_hsk_char_level(rad)
        own_level = get_hsk_char_level(hanzi_char)
        if own_level != 0 and own_level < level:
            level = own_level

        if rad not in hsk_radical_level:
            hsk_radical_level[rad] = level
        elif level != 0 and level < hsk_radical_level[rad]:
            hsk_radical_level[rad] = level

    # (frequency, radical) pairs are all distinct because radicals are dict
    # keys, so a descending sort gives the same order as sort-then-reverse.
    ranked = sorted(((count, rad) for rad, count in radical_freq.items()), reverse=True)
    for position, (_, rad) in enumerate(ranked, start=1):
        radical_frequency_index[rad] = position
def parse_list(hanzi, notes, allownonhanzi, expand, definition_link_function):
    """Parse a one-entry-per-line vocabulary list and tally it per HSK level.

    Each line is split on whitespace and only its first chunk is used as the
    word; lines with no chunks are skipped. Every accepted occurrence of a
    word or character increments the tally for the level reported by
    ``get_hsk_word_level`` / ``get_hsk_char_level``.

    When ``allownonhanzi`` is false:
      * a word containing any character rejected by ``char_is_ok`` is ignored
        as a word, but its characters are still examined one by one;
      * characters rejected by ``char_is_ok`` are ignored;
      * accepted words / characters missing from ``word_freq`` /
        ``char_freq`` are reported as unknown.
    When ``allownonhanzi`` is true, everything is accepted and nothing is
    reported as ignored or unknown.

    Arguments:
    hanzi -- newline-separated text to analyse
    notes -- list of messages; "Ignored ..." / "Unknown ..." entries are
             appended in place when there is something to report
    allownonhanzi -- see above
    expand -- forwarded to definition_link_function
    definition_link_function -- called as f(text, "none", expand); the
             returned strings are joined with ", " to build each note

    Returns a tuple ``(words, chars, hsk_word_count, hsk_char_count)``: two
    sets plus two dicts keyed 0..6 holding occurrence counts.
    """
    word_tally = dict.fromkeys(range(7), 0)
    char_tally = dict.fromkeys(range(7), 0)
    seen_words, seen_chars = set(), set()
    skipped_words, skipped_chars = set(), set()
    unrecognised_words, unrecognised_chars = set(), set()
    strict = not allownonhanzi

    for row in hanzi.split("\n"):
        fields = row.split()
        if not fields:
            continue
        entry = fields[0]

        # The word is accepted only if none of its characters is rejected.
        if allownonhanzi or not [c for c in entry if not char_is_ok(c)]:
            seen_words.add(entry)
            word_tally[get_hsk_word_level(entry)] += 1
            if strict and entry not in word_freq:
                unrecognised_words.add(entry)
        else:
            skipped_words.add(entry)

        # Characters are processed whether or not the word was accepted.
        for ch in entry:
            if not (allownonhanzi or char_is_ok(ch)):
                skipped_chars.add(ch)
                continue
            seen_chars.add(ch)
            char_tally[get_hsk_char_level(ch)] += 1
            if strict and ch not in char_freq:
                unrecognised_chars.add(ch)

    reports = (
        ("Ignored words: ", skipped_words),
        ("Ignored characters: ", skipped_chars),
        ("Unknown words: ", unrecognised_words),
        ("Unknown characters: ", unrecognised_chars),
    )
    for label, items in reports:
        if items:
            links = [definition_link_function(item, "none", expand) for item in items]
            notes.append(label + ", ".join(links))

    return seen_words, seen_chars, word_tally, char_tally
def hanzi_definition_link(results, hanziword, hanzichar, colour, expand):
    """Append an HTML dictionary link for ``hanzichar`` to ``results``.

    The link points at ``/cidian`` with ``hanziword`` as the ``q`` query
    value, shows ``hanzichar`` as its text, and carries a title built by
    ``get_linktext_hanzi_info``.

    Arguments:
    results -- list that receives the generated ``<a>`` element (appended
               in place; nothing is returned)
    hanziword -- the word to look up; used for the URL and the word level
    hanzichar -- the text displayed in the link; used for the character and
                 radical levels
    colour -- selects the CSS class:
              'word'    -- "hsk<word level>" if the word level is > 0
              'char'    -- "hsk<char level>" if the character level is > 0
              'auto'    -- word level if > 0, otherwise character level if > 0
              'radical' -- "hsk<radical level>" (whatever the level is)
              anything else (e.g. 'none'), or no positive level found,
              yields the class "definition"
    expand -- when equal to "yes", ``expand=yes`` is added to the query string
    """
    query_prefix = "expand=yes&" if expand == "yes" else ""
    # Percent-encode the raw word. HTML-escaping it first would turn e.g. "&"
    # into "&amp;" and then into "%26amp%3B", so the dictionary page would be
    # asked for the literal entity text. The quoted result contains no
    # HTML-special characters, so it is safe inside the href attribute as is.
    url = "/cidian?{}q=".format(query_prefix) + urllib.parse.quote(hanziword)

    title = get_linktext_hanzi_info(hanziword, hanzichar, 30)
    wordlevel = get_hsk_word_level(hanziword)
    charlevel = get_hsk_char_level(hanzichar)
    radicallevel = get_hsk_radical_level(hanzichar)

    if colour in ("word", "auto") and wordlevel > 0:
        linkclass = "hsk{}".format(wordlevel)
    elif colour in ("char", "auto") and charlevel > 0:
        linkclass = "hsk{}".format(charlevel)
    elif colour == "radical":
        linkclass = "hsk{}".format(radicallevel)
    else:
        linkclass = "definition"

    results.append(
        '<a class="{0}" href="{1}" title="{2}">{3}</a>'.format(
            linkclass, url, html.escape(title), html.escape(hanzichar)
        )
    )