def analysewords(results, words, chars, hsk_word_count, hsk_char_count):
    """Append an HTML analysis of the input's words and characters to results.

    words/chars are the sets of unique entries and unique characters;
    hsk_word_count/hsk_char_count map HSK level to occurrence counts
    (their sums give the totals including repeats).
    """
    results.append("<h4>Analysis of Words/Characters in Input</h4>")
    unique_entries = len(words)
    single_char_entries = sum(1 for entry in words if len(entry) == 1)
    multi_char_entries = unique_entries - single_char_entries
    unique_chars = len(chars)
    total_entries = sum(hsk_word_count.values())
    total_chars = sum(hsk_char_count.values())
    # Coverage by the two reference word lists.
    subtlexwords = subtlex_word_set & words
    ccedictwords = cedict_word_set & words
    results.append("""Input contained:<ul> <li>{} unique single-character entries</li> <li>{} unique multi-character entries</li> <li>{} unique entries</li> <li>{} total entries</li> <li>{} unique characters</li> <li>{} total characters</li> <li>{} unique words as recognised by SUBTLEX-CH</li> <li>{} unique words as recognised by CC-CEDICT</li> </ul>""".format(
        single_char_entries, multi_char_entries, unique_entries,
        total_entries, unique_chars, total_chars,
        len(subtlexwords), len(ccedictwords)))
    # One block box per listing; element ids are referenced by page scripts.
    listings = (
        ("Unique Entries", "wordsknown", frequency_order_word(words)),
        ("Unique Characters", "charsknown", frequency_order_char(chars)),
        ("SUBTLEX Words", "subtlexknown", frequency_order_word(subtlexwords)),
        ("CC-CEDICT Words", "cedictknown", frequency_order_word(ccedictwords)),
    )
    for title, elem_id, ordered in listings:
        results.append(blockboxtemplate().format(title, elem_id,
                                                 "\n".join(ordered)))
def suggesthskwords(results, words):
    """Append HTML suggesting HSK words absent from the input.

    Emits one block box per HSK level 1-6 of that level's words not in
    *words*, then up to 1000 of the most frequent non-HSK words not in
    *words*.  Relies on module globals: hsk_words (level -> word set;
    key 16 appears to be the combined HSK 1-6 set -- TODO confirm where
    hsk_words is built), word_frequency_ordered, frequency_order_word,
    blockboxtemplate.
    """
    results.append("""<h4>Suggested HSK Words not in Input</h4>""")
    for i in range(1, 7):
        wordstolearn = "\n".join(frequency_order_word(hsk_words[i] - words))
        results.append(blockboxtemplate().format(
            "HSK " + str(i), "hskwordstolearn" + str(i), wordstolearn))
    # Collect up to 1000 most frequent words that are neither in the input
    # nor in any HSK level.
    foundwords = []
    for freq, word in word_frequency_ordered:
        if word not in words and word not in hsk_words[16]:
            foundwords.append(word)
            if len(foundwords) >= 1000:
                break
    wordstext = "\n".join(frequency_order_word(foundwords))
    # Bug fix: the original built this id as "nonhskwordstolearn" + str(i),
    # silently relying on the loop variable leaked from the for loop above
    # (always 6).  Spell the id out so it no longer depends on loop
    # internals; the emitted HTML is unchanged.
    results.append(blockboxtemplate().format(
        "Non-HSK", "nonhskwordstolearn6", wordstext))
def suggestfreqwords(results, words):
    """Append HTML listing up to 1000 of the highest-frequency words
    that do not appear in the input set *words*."""
    results.append("""<h4>Suggested Words not in Input</h4>""")
    missing = []
    for freq, candidate in word_frequency_ordered:
        if candidate in words:
            continue
        missing.append(candidate)
        if len(missing) >= 1000:
            break
    results.append(textareatemplate.format(
        "highfreqwords", "\n".join(frequency_order_word(missing))))
def suggestfreqwordsre(results, words, chars):
    """Append HTML listing up to 1000 of the highest-frequency words not
    in the input that can be written entirely with characters the input
    already contains (*chars*)."""
    results.append("""<h4>Suggested Words Using Characters in Input</h4>""")
    foundwords = []
    for freq, word in word_frequency_ordered:
        # Skip words already known, or needing a character not in the input.
        # (Idiom fix: replaces the original's manual allcharsmatch flag loop
        # with the equivalent all() expression.)
        if word in words or not all(char in chars for char in word):
            continue
        foundwords.append(word)
        if len(foundwords) >= 1000:
            break
    wordstext = "\n".join(frequency_order_word(foundwords))
    results.append(textareatemplate.format("highfreqwordsreuse", wordstext))
def perform_set_operations(hanzi_a, hanzi_b, results, expand):
    """Parse hanzi lists A and B and append HTML showing set operations
    (intersection, union, both differences, symmetric difference) on
    their words and on their characters, each sorted most-frequent first.

    hanzi_a/hanzi_b: raw user input for the two lists; results: list of
    HTML fragments appended to in place; expand: passed through to the
    parsers unchanged.
    """
    init_resources()

    # Shared warnings boilerplate; the list label is interpolated.
    warnings_html = """<h2><span style="color:red;">Warnings ({})</span> <a class="arrowlink" href="javascript:toggle_visibility('warningshelp');"><small><small>(?)</small></small></a></h2><ul> <div id="warningshelp" class="inlinehelp" style="max-width:600px;"> <p>This section lists words and character that are being treated as Chinese but aren't in the CC-CEDICT that is being used.</p> <p>In addition, when potential word matches are ignored during parsing of a block of text, warnings below will show you the words that are in the dictionary but which were not chosen by the script.</p> </div><span style="color:red;">"""

    def _parse(format_param, hanzi, notes):
        # Dispatch on the user-selected input format for this list.
        fmt = get_parameter(format_param)
        if fmt == "block":
            return parse_block(hanzi, notes, expand,
                               word_char_definition_link)
        if fmt == "commasep":
            return parse_comma_sep(hanzi, notes, True, expand,
                                   word_char_definition_link)
        return parse_list(hanzi, notes, True, expand,
                          word_char_definition_link)

    def _warn(label, notes):
        # NOTE(review): both invocations emit a div with the same id
        # "warningshelp"; duplicate ids are invalid HTML and the toggle
        # link only finds the first occurrence -- preserved as-is from
        # the original, confirm intended.
        if len(notes):
            results.append(warnings_html.format(label))
            for note in notes:
                results.append("<li>{}</li>".format(note))
            results.append("</ul></span>")

    notes = []
    words_a, chars_a, hsk_word_count, hsk_char_count = _parse(
        "formatA", hanzi_a, notes)
    _warn("List A", notes)

    notes = []
    words_b, chars_b, hsk_word_count, hsk_char_count = _parse(
        "formatB", hanzi_b, notes)
    _warn("List B", notes)

    # Separator placed between entries inside each result box.
    joinchar = {"oneperline": "\n", "commasep": ",", "tabsep": "\t"}.get(
        get_parameter("outputformat"), " ")

    results.append(
        """<h4>Set Operations on Words <a class="arrowlink" href="javascript:toggle_visibility('wordoperationshelp');"><small><small>(?)</small></small></a></h4> <div id="wordoperationshelp" class="inlinehelp" style="max-width:600px;"> <p><b>A<sub>w</sub></b> and <b>B<sub>w</sub></b> are the sets of all unique words derived from Word Lists A and B above.</p> <p><b>A<sub>w</sub> \u2229 B<sub>w</sub></b> <i>Intersection, words that appear in both sets.</i><br /> <b>A<sub>w</sub> \u222A B<sub>w</sub></b> <i>Union, both sets of words combined together as a single set.</i><br /> <b>A<sub>w</sub> \u2216 B<sub>w</sub></b> <i>Difference, words that are <b>A<sub>w</sub></b> but not <b>B<sub>w</sub></b>.</i><br /> <b>B<sub>w</sub> \u2216 A<sub>w</sub></b> <i>Difference, words that are <b>B<sub>w</sub></b> but not <b>A<sub>w</sub></b>.</i><br /> <b>A<sub>w</sub> \u2206 B<sub>w</sub></b> <i>Symmetric Difference, words that are in only one of the two sets.</i></p> <p>All sets are sorted with the most frequently used words first.</p> </div>""")

    # One result box per word-set operation.  The "AsymmmetricBwords"
    # spelling (triple m) is kept from the original in case page scripts
    # reference the id.
    word_ops = (
        ("A<sub>w</sub>", "Awords", words_a),
        ("B<sub>w</sub>", "Bwords", words_b),
        ("A<sub>w</sub> \u2229 B<sub>w</sub>", "AintersectBwords",
         words_a & words_b),
        ("A<sub>w</sub> \u222A B<sub>w</sub>", "AunionBwords",
         words_a | words_b),
        ("A<sub>w</sub> \u2216 B<sub>w</sub>", "AdifferenceBwords",
         words_a - words_b),
        ("B<sub>w</sub> \u2216 A<sub>w</sub>", "BdifferenceAwords",
         words_b - words_a),
        ("A<sub>w</sub> \u2206 B<sub>w</sub>", "AsymmmetricBwords",
         words_a ^ words_b),
    )
    for label, elem_id, wordset in word_ops:
        results.append(setresultbox(label, elem_id,
                                    frequency_order_word(wordset),
                                    joinchar, "word"))

    results.append(
        """<h4>Set Operations on Characters <a class="arrowlink" href="javascript:toggle_visibility('charoperationshelp');"><small><small>(?)</small></small></a></h4> <div id="charoperationshelp" class="inlinehelp" style="max-width:600px;"> <p><b>A<sub>c</sub></b> and <b>B<sub>c</sub></b> are the sets of all unique characters derived from Word Lists A and B above.</p> <p><b>A<sub>c</sub> \u2229 B<sub>c</sub></b> <i>Intersection, characters that appear in both sets.</i><br /> <b>A<sub>c</sub> \u222A B<sub>c</sub></b> <i>Union, both sets of characters combined together as a single set.</i><br /> <b>A<sub>c</sub> \u2216 B<sub>c</sub></b> <i>Difference, characters that are <b>A<sub>c</sub></b> but not <b>B<sub>c</sub></b>.</i><br /> <b>B<sub>c</sub> \u2216 A<sub>c</sub></b> <i>Difference, characters that are <b>B<sub>c</sub></b> but not <b>A<sub>c</sub></b>.</i><br /> <b>A<sub>c</sub> \u2206 B<sub>c</sub></b> <i>Symmetric Difference, characters that are in only one of the two sets.</i></p> <p>All sets are sorted with the most frequently used characters first.</p> </div>""")

    # Bug fix: the original sorted the plain Achars/Bchars boxes with
    # frequency_order_word while every other character box used
    # frequency_order_char; all seven now use the character ordering.
    char_ops = (
        ("A<sub>c</sub>", "Achars", chars_a),
        ("B<sub>c</sub>", "Bchars", chars_b),
        ("A<sub>c</sub> \u2229 B<sub>c</sub>", "AintersectBchars",
         chars_a & chars_b),
        ("A<sub>c</sub> \u222A B<sub>c</sub>", "AunionBchars",
         chars_a | chars_b),
        ("A<sub>c</sub> \u2216 B<sub>c</sub>", "AdifferenceBchars",
         chars_a - chars_b),
        ("B<sub>c</sub> \u2216 A<sub>c</sub>", "BdifferenceAchars",
         chars_b - chars_a),
        ("A<sub>c</sub> \u2206 B<sub>c</sub>", "AsymmmetricBchars",
         chars_a ^ chars_b),
    )
    for label, elem_id, charset in char_ops:
        results.append(setresultbox(label, elem_id,
                                    frequency_order_char(charset),
                                    joinchar, "char"))
def analysehskwords(results, words, hsk_word_count):
    """Append an HTML analysis of HSK coverage of the input words.

    words: set of unique input words; hsk_word_count: HSK level ->
    count of input words at that level including repeats (its sum is
    the total word count).  Relies on module globals: hsk_words
    (level -> word set; key 16 is the combined HSK 1-6 set),
    frequency_order_word, blockboxtemplate.
    """
    knownintersect = {}
    results.append("<h4>Analysis of HSK Words in Input</h4>")
    results.append("Input contained:<ul>")
    # Level 0 entries seed the cumulative running totals.
    cumulativeknown = {0: 0}
    cumulativetotal = {0: 0}
    numknown = {}
    numhsk = {}
    for i in range(1, 7):
        knownintersect[i] = words & hsk_words[i]
        numknown[i] = len(knownintersect[i])
        numhsk[i] = len(hsk_words[i])
        percentknown = 100 * float(numknown[i]) / numhsk[i]
        cumulativeknown[i] = cumulativeknown[i - 1] + numknown[i]
        cumulativetotal[i] = cumulativetotal[i - 1] + numhsk[i]
        results.append("""<li>{} ({:.2f}%) of the {} HSK {} words""".format(
            numknown[i], percentknown, numhsk[i], i))
        # Bug fix: original condition was "i > 1 > 0", a chained
        # comparison equal to (i > 1) and (1 > 0); the second leg is
        # always true, so this is simply i > 1.  Behavior unchanged.
        if i > 1:
            cumpercentknown = 100 * float(
                cumulativeknown[i]) / cumulativetotal[i]
            results.append(
                """ <i>(Cumulative: {} ({:.2f}%) of the {} HSK 1-{} words)</i>"""
                .format(cumulativeknown[i], cumpercentknown,
                        cumulativetotal[i], i))
        results.append("</li>")
    results.append("</ul>")
    totalunique = len(words)
    if totalunique > 0:
        # Percentages relative to the number of unique input words.
        numknown_nonhsk = totalunique - cumulativeknown[6]
        results.append(
            "Of the {} <b>unique</b> words in the input:<ul>".format(
                totalunique))
        for i in range(1, 7):
            percentknown = 100 * float(numknown[i]) / totalunique
            results.append("""<li>{} ({:.2f}%) were HSK {} words""".format(
                numknown[i], percentknown, i))
            if i > 1:
                cumpercentknown = 100 * float(cumulativeknown[i]) / totalunique
                results.append(
                    """<i>(Cumulative: {} ({:.2f}%) were HSK 1-{} words)</i>"""
                    .format(cumulativeknown[i], cumpercentknown, i))
            results.append("</li>")
        numknown_nonhsk_percent = 100 * float(numknown_nonhsk) / totalunique
        results.append("""<li>{} ({:.2f}%) were non-HSK words</li>""".format(
            numknown_nonhsk, numknown_nonhsk_percent))
        results.append("</ul>")
    totalwords = sum(hsk_word_count.values())
    if totalwords == totalunique:
        results.append(
            "<p><i>Each word appeared only once in the input.</i></p>")
    else:
        # Percentages relative to the total (with repeats) word count.
        cumknown = 0
        results.append(
            "Of the {} <b>total</b> words that were input:<ul>".format(
                totalwords))
        for i in range(1, 7):
            percentknown = 100 * float(hsk_word_count[i]) / totalwords
            cumknown += hsk_word_count[i]
            results.append("""<li>{} ({:.2f}%) were HSK {} words""".format(
                hsk_word_count[i], percentknown, i))
            if i > 1:
                cumpercentknown = 100 * float(cumknown) / totalwords
                results.append(
                    """<i>(Cumulative: {} ({:.2f}%) were HSK 1-{} words)</i>"""
                    .format(cumknown, cumpercentknown, i))
            results.append("</li>")
        num_nonhsk = totalwords - cumknown
        numknown_nonhsk_percent = 100 * float(num_nonhsk) / totalwords
        results.append("""<li>{} ({:.2f}%) were non-HSK words</li>""".format(
            num_nonhsk, numknown_nonhsk_percent))
        results.append("</ul>")
    # Block boxes listing the actual words at each level, then non-HSK.
    for i in range(1, 7):
        wordsknown = "\n".join(frequency_order_word(knownintersect[i]))
        results.append(blockboxtemplate().format(
            "HSK " + str(i), "hskwordsknown" + str(i), wordsknown))
    nonhskwords = "\n".join(frequency_order_word(words - hsk_words[16]))
    results.append(blockboxtemplate().format("Non-HSK", "nonhskwordsknown",
                                             nonhskwords))