Ejemplo n.º 1
0
def analysewords(results, words, chars, hsk_word_count, hsk_char_count):
    """Append an HTML summary of the input's words and characters to *results*.

    *words* and *chars* are sets of unique entries; *hsk_word_count* and
    *hsk_char_count* map HSK levels to occurrence counts, so their totals
    give the raw (non-unique) word/character counts.
    """
    results.append("<h4>Analysis of Words/Characters in Input</h4>")
    single_count = sum(1 for entry in words if len(entry) == 1)
    unique_words = len(words)
    unique_chars = len(chars)
    total_words = sum(hsk_word_count.values())
    total_chars = sum(hsk_char_count.values())
    subtlex_matches = subtlex_word_set & words
    cedict_matches = cedict_word_set & words
    summary = """Input contained:<ul>
<li>{} unique single-character entries</li>
<li>{} unique multi-character entries</li>
<li>{} unique entries</li>
<li>{} total entries</li>
<li>{} unique characters</li>
<li>{} total characters</li>
<li>{} unique words as recognised by SUBTLEX-CH</li>
<li>{} unique words as recognised by CC-CEDICT</li>
</ul>""".format(single_count, unique_words - single_count, unique_words,
                total_words, unique_chars, total_chars,
                len(subtlex_matches), len(cedict_matches))
    results.append(summary)
    # One frequency-ordered listing box per grouping.  The element ids
    # below (including "cedictknown", not "ccedictknown") reproduce the
    # original output and may be referenced by page JavaScript.
    listings = [
        ("Unique Entries", "wordsknown", frequency_order_word(words)),
        ("Unique Characters", "charsknown", frequency_order_char(chars)),
        ("SUBTLEX Words", "subtlexknown",
         frequency_order_word(subtlex_matches)),
        ("CC-CEDICT Words", "cedictknown",
         frequency_order_word(cedict_matches)),
    ]
    for title, elem_id, ordered in listings:
        results.append(
            blockboxtemplate().format(title, elem_id, "\n".join(ordered)))
Ejemplo n.º 2
0
def suggesthskwords(results, words):
    """Append suggestion boxes of HSK words absent from *words*.

    For each HSK level 1-6 a box lists that level's words missing from the
    input, in frequency order.  A final box lists up to 1000 of the most
    frequent words that are neither in the input nor in any HSK list
    (hsk_words[16] appears to be the combined HSK 1-6 word set — see its
    use in analysehskwords).
    """
    results.append("""<h4>Suggested HSK Words not in Input</h4>""")
    for level in range(1, 7):
        wordstolearn = "\n".join(
            frequency_order_word(hsk_words[level] - words))
        results.append(blockboxtemplate().format("HSK " + str(level),
                                                 "hskwordstolearn" +
                                                 str(level),
                                                 wordstolearn))
    # Collect the first 1000 high-frequency non-HSK words not in the input.
    foundwords = []
    for freq, word in word_frequency_ordered:
        if word not in words and word not in hsk_words[16]:
            foundwords.append(word)
        if len(foundwords) >= 1000:
            break
    wordstext = "\n".join(frequency_order_word(foundwords))
    # BUG FIX: the original built this id as "nonhskwordstolearn" + str(i),
    # silently relying on the loop variable leaking out of the for loop
    # above (always 6 here, but it breaks if that loop changes).  The
    # explicit literal "nonhskwordstolearn6" preserves the emitted HTML
    # exactly; the "6" suffix itself looks accidental (cf. the suffix-free
    # "nonhskwordsknown" id in analysehskwords) — TODO confirm against the
    # page JavaScript before dropping it.
    results.append(blockboxtemplate().format("Non-HSK",
                                             "nonhskwordstolearn6",
                                             wordstext))
Ejemplo n.º 3
0
def suggestfreqwords(results, words):
    """Append up to 1000 of the highest-frequency words absent from *words*."""
    results.append("""<h4>Suggested Words not in Input</h4>""")
    suggestions = []
    for _freq, candidate in word_frequency_ordered:
        if candidate not in words:
            suggestions.append(candidate)
        if len(suggestions) >= 1000:
            break
    wordstext = "\n".join(frequency_order_word(suggestions))
    results.append(textareatemplate.format("highfreqwords", wordstext))
Ejemplo n.º 4
0
def suggestfreqwordsre(results, words, chars):
    """Append up to 1000 high-frequency words that are not in *words* but
    are spelled entirely with characters already present in *chars*.

    Useful for suggesting new vocabulary that reuses known characters.
    """
    results.append("""<h4>Suggested Words Using Characters in Input</h4>""")
    foundwords = []
    for freq, word in word_frequency_ordered:
        if word not in words:
            # Idiom fix: the original used a manual flag-and-break loop to
            # test that every character of the word is known; all() is the
            # direct equivalent.
            if not all(char in chars for char in word):
                continue
            foundwords.append(word)
        if len(foundwords) >= 1000:
            break
    wordstext = "\n".join(frequency_order_word(foundwords))
    results.append(textareatemplate.format("highfreqwordsreuse", wordstext))
Ejemplo n.º 5
0
# Shared template for the per-list warnings section.  The help-div id is a
# parameter because the original emitted the SAME id ("warningshelp") for
# both List A and List B, producing duplicate DOM ids — the toggle link of
# the second section then targeted the first section's div.
_WARNINGS_TEMPLATE = """<h2><span style="color:red;">Warnings ({list_name})</span> <a class="arrowlink" href="javascript:toggle_visibility('{help_id}');"><small><small>(?)</small></small></a></h2><ul>
 <div id="{help_id}" class="inlinehelp" style="max-width:600px;">
    <p>This section lists words and characters that are being treated as Chinese but aren't in the CC-CEDICT that is being used.</p>
    <p>In addition, when potential word matches are ignored during parsing of a block of text, warnings below will show you
    the words that are in the dictionary but which were not chosen by the script.</p>
 </div><span style="color:red;">"""


def _parse_hanzi(hanzi, notes, format_parameter, expand):
    """Parse *hanzi* per the request parameter named *format_parameter*.

    Returns (words, chars, hsk_word_count, hsk_char_count) from the
    matching parser; any parse warnings are appended to *notes*.
    """
    input_format = get_parameter(format_parameter)
    if input_format == "block":
        return parse_block(hanzi, notes, expand, word_char_definition_link)
    if input_format == "commasep":
        return parse_comma_sep(hanzi, notes, True, expand,
                               word_char_definition_link)
    return parse_list(hanzi, notes, True, expand, word_char_definition_link)


def _append_warnings(results, notes, list_name, help_id):
    """Append a red warnings section for *notes*, if any."""
    if not notes:
        return
    results.append(_WARNINGS_TEMPLATE.format(list_name=list_name,
                                             help_id=help_id))
    for note in notes:
        results.append("<li>{}</li>".format(note))
    results.append("</ul></span>")


def perform_set_operations(hanzi_a, hanzi_b, results, expand):
    """Parse two word lists and append HTML set-operation result boxes.

    Emits, for both words and characters: the two input sets plus their
    intersection, union, both differences, and symmetric difference, each
    sorted most-frequent-first and joined per the "outputformat" request
    parameter.
    """
    init_resources()

    # Parse List A and List B, reporting warnings for each separately.
    notes = []
    words_a, chars_a, hsk_word_count, hsk_char_count = _parse_hanzi(
        hanzi_a, notes, "formatA", expand)
    _append_warnings(results, notes, "List A", "warningshelp")

    notes = []
    words_b, chars_b, hsk_word_count, hsk_char_count = _parse_hanzi(
        hanzi_b, notes, "formatB", expand)
    # BUG FIX: distinct help-div id for List B (was "warningshelp" again).
    _append_warnings(results, notes, "List B", "warningshelpB")

    # Separator between entries in each result box; default is a space.
    joinchar = {"oneperline": "\n",
                "commasep": ",",
                "tabsep": "\t"}.get(get_parameter("outputformat"), " ")

    results.append(
        """<h4>Set Operations on Words <a class="arrowlink" href="javascript:toggle_visibility('wordoperationshelp');"><small><small>(?)</small></small></a></h4>
 <div id="wordoperationshelp" class="inlinehelp" style="max-width:600px;">
 <p><b>A<sub>w</sub></b> and <b>B<sub>w</sub></b> are the sets of all unique words derived from Word Lists A and B above.</p>
 <p><b>A<sub>w</sub> \u2229 B<sub>w</sub></b> <i>Intersection, words that appear in both sets.</i><br />
    <b>A<sub>w</sub> \u222A B<sub>w</sub></b> <i>Union, both sets of words combined together as a single set.</i><br />
    <b>A<sub>w</sub> \u2216 B<sub>w</sub></b> <i>Difference, words that are <b>A<sub>w</sub></b> but not <b>B<sub>w</sub></b>.</i><br />
    <b>B<sub>w</sub> \u2216 A<sub>w</sub></b> <i>Difference, words that are <b>B<sub>w</sub></b> but not <b>A<sub>w</sub></b>.</i><br />
    <b>A<sub>w</sub> \u2206 B<sub>w</sub></b> <i>Symmetric Difference, words that are in only one of the two sets.</i></p>
    <p>All sets are sorted with the most frequently used words first.</p>
 </div>""")

    # Element ids (including the historical "AsymmmetricB…" triple-m typo)
    # are preserved byte-for-byte — page JavaScript may reference them.
    word_boxes = [
        ("A<sub>w</sub>", "Awords", words_a),
        ("B<sub>w</sub>", "Bwords", words_b),
        ("A<sub>w</sub> \u2229 B<sub>w</sub>", "AintersectBwords",
         words_a & words_b),
        ("A<sub>w</sub> \u222A B<sub>w</sub>", "AunionBwords",
         words_a | words_b),
        ("A<sub>w</sub> \u2216 B<sub>w</sub>", "AdifferenceBwords",
         words_a - words_b),
        ("B<sub>w</sub> \u2216 A<sub>w</sub>", "BdifferenceAwords",
         words_b - words_a),
        ("A<sub>w</sub> \u2206 B<sub>w</sub>", "AsymmmetricBwords",
         words_a ^ words_b),
    ]
    for label, elem_id, result_set in word_boxes:
        results.append(
            setresultbox(label, elem_id, frequency_order_word(result_set),
                         joinchar, "word"))

    results.append(
        """<h4>Set Operations on Characters <a class="arrowlink" href="javascript:toggle_visibility('charoperationshelp');"><small><small>(?)</small></small></a></h4>
 <div id="charoperationshelp" class="inlinehelp" style="max-width:600px;">
 <p><b>A<sub>c</sub></b> and <b>B<sub>c</sub></b> are the sets of all unique characters derived from Word Lists A and B above.</p>
 <p><b>A<sub>c</sub> \u2229 B<sub>c</sub></b> <i>Intersection, characters that appear in both sets.</i><br />
    <b>A<sub>c</sub> \u222A B<sub>c</sub></b> <i>Union, both sets of characters combined together as a single set.</i><br />
    <b>A<sub>c</sub> \u2216 B<sub>c</sub></b> <i>Difference, characters that are <b>A<sub>c</sub></b> but not <b>B<sub>c</sub></b>.</i><br />
    <b>B<sub>c</sub> \u2216 A<sub>c</sub></b> <i>Difference, characters that are <b>B<sub>c</sub></b> but not <b>A<sub>c</sub></b>.</i><br />
    <b>A<sub>c</sub> \u2206 B<sub>c</sub></b> <i>Symmetric Difference, characters that are in only one of the two sets.</i></p>
    <p>All sets are sorted with the most frequently used characters first.</p>
 </div>""")

    # BUG FIX: the original sorted the plain A/B character boxes with
    # frequency_order_word; every other character box here (and the char
    # listing in analysewords) uses frequency_order_char, so these two now
    # do as well.
    char_boxes = [
        ("A<sub>c</sub>", "Achars", chars_a),
        ("B<sub>c</sub>", "Bchars", chars_b),
        ("A<sub>c</sub> \u2229 B<sub>c</sub>", "AintersectBchars",
         chars_a & chars_b),
        ("A<sub>c</sub> \u222A B<sub>c</sub>", "AunionBchars",
         chars_a | chars_b),
        ("A<sub>c</sub> \u2216 B<sub>c</sub>", "AdifferenceBchars",
         chars_a - chars_b),
        ("B<sub>c</sub> \u2216 A<sub>c</sub>", "BdifferenceAchars",
         chars_b - chars_a),
        ("A<sub>c</sub> \u2206 B<sub>c</sub>", "AsymmmetricBchars",
         chars_a ^ chars_b),
    ]
    for label, elem_id, result_set in char_boxes:
        results.append(
            setresultbox(label, elem_id, frequency_order_char(result_set),
                         joinchar, "char"))
Ejemplo n.º 6
0
def analysehskwords(results, words, hsk_word_count):
    """Append an HTML breakdown of the HSK coverage of the input.

    Produces three views: per-HSK-level coverage (how much of each level's
    word list appears in *words*), the HSK composition of the unique input
    words, and the HSK composition of the total word occurrences (from
    *hsk_word_count*, which maps HSK level -> occurrence count).  Finally
    appends frequency-ordered listing boxes per level plus a non-HSK box.
    hsk_words[16] appears to be the combined HSK 1-6 word set.
    """
    knownintersect = {}
    results.append("<h4>Analysis of HSK Words in Input</h4>")
    results.append("Input contained:<ul>")
    cumulativeknown = {}
    cumulativetotal = {}
    cumulativeknown[0] = 0
    cumulativetotal[0] = 0
    numknown = {}
    numhsk = {}
    for i in range(1, 7):
        knownintersect[i] = words & hsk_words[i]
        numknown[i] = len(knownintersect[i])
        numhsk[i] = len(hsk_words[i])
        # NOTE(review): assumes every HSK level word list is non-empty;
        # an empty hsk_words[i] would raise ZeroDivisionError here.
        percentknown = 100 * float(numknown[i]) / numhsk[i]
        cumulativeknown[i] = cumulativeknown[i - 1] + numknown[i]
        cumulativetotal[i] = cumulativetotal[i - 1] + numhsk[i]
        results.append("""<li>{} ({:.2f}%) of the {} HSK {} words""".format(
            numknown[i], percentknown, numhsk[i], i))
        # BUG FIX: was `if i > 1 > 0:` — a chained comparison whose second
        # half (1 > 0) is always true, so it is equivalent to `i > 1` but
        # needlessly confusing; matches the plain `i > 1` used below.
        if i > 1:
            cumpercentknown = 100 * float(
                cumulativeknown[i]) / cumulativetotal[i]
            results.append(
                """ <i>(Cumulative: {} ({:.2f}%) of the {} HSK 1-{} words)</i>"""
                .format(cumulativeknown[i], cumpercentknown,
                        cumulativetotal[i], i))
        results.append("</li>")
    results.append("</ul>")
    # Breakdown of the unique input words by HSK level.
    totalunique = len(words)
    if totalunique > 0:
        numknown_nonhsk = totalunique - cumulativeknown[6]
        results.append(
            "Of the {} <b>unique</b> words in the input:<ul>".format(
                totalunique))
        for i in range(1, 7):
            percentknown = 100 * float(numknown[i]) / totalunique
            results.append("""<li>{} ({:.2f}%) were HSK {} words""".format(
                numknown[i], percentknown, i))
            if i > 1:
                cumpercentknown = 100 * float(cumulativeknown[i]) / totalunique
                results.append(
                    """<i>(Cumulative: {} ({:.2f}%) were HSK 1-{} words)</i>"""
                    .format(cumulativeknown[i], cumpercentknown, i))
            results.append("</li>")
        numknown_nonhsk_percent = 100 * float(numknown_nonhsk) / totalunique
        results.append("""<li>{} ({:.2f}%) were non-HSK words</li>""".format(
            numknown_nonhsk, numknown_nonhsk_percent))
        results.append("</ul>")
    # Breakdown of the total (non-unique) word occurrences by HSK level.
    totalwords = sum(hsk_word_count.values())
    # NOTE(review): equal totals are taken to mean "no word repeated";
    # presumably hsk_word_count counts every occurrence — verify upstream.
    if totalwords == totalunique:
        results.append(
            "<p><i>Each word appeared only once in the input.</i></p>")
    else:
        cumknown = 0
        results.append(
            "Of the {} <b>total</b> words that were input:<ul>".format(
                totalwords))
        for i in range(1, 7):
            percentknown = 100 * float(hsk_word_count[i]) / totalwords
            cumknown += hsk_word_count[i]
            results.append("""<li>{} ({:.2f}%) were HSK {} words""".format(
                hsk_word_count[i], percentknown, i))
            if i > 1:
                cumpercentknown = 100 * float(cumknown) / totalwords
                results.append(
                    """<i>(Cumulative: {} ({:.2f}%) were HSK 1-{} words)</i>"""
                    .format(cumknown, cumpercentknown, i))
            results.append("</li>")
        num_nonhsk = totalwords - cumknown
        numknown_nonhsk_percent = 100 * float(num_nonhsk) / totalwords
        results.append("""<li>{} ({:.2f}%) were non-HSK words</li>""".format(
            num_nonhsk, numknown_nonhsk_percent))
        results.append("</ul>")
    # Frequency-ordered listing boxes: one per HSK level, then non-HSK.
    for i in range(1, 7):
        wordsknown = "\n".join(frequency_order_word(knownintersect[i]))
        results.append(blockboxtemplate().format("HSK " + str(i),
                                                 "hskwordsknown" + str(i),
                                                 wordsknown))
    nonhskwords = "\n".join(frequency_order_word(words - hsk_words[16]))
    results.append(blockboxtemplate().format("Non-HSK", "nonhskwordsknown",
                                             nonhskwords))