Example #1
def get_mean_exposures_per_pivot():
    "Returns the number of exposures each pivot received."
    cursor = connection.cursor()
    cursor.execute("""
        SELECT pivot, pivot_type, COUNT(*) as n_exposures
        FROM drill_question
        GROUP BY CONCAT(pivot, '|', pivot_type)
    """)
    data = cursor.fetchall()
    word_c = []
    kanji_c = []
    combined_c = []
    kanji_inc_dist = FreqDist()
    for pivot, pivot_type, n_exposures in data:
        combined_c.append(n_exposures)

        if pivot_type == 'k':
            kanji_c.append(n_exposures)
            kanji_inc_dist.inc(pivot, n_exposures)

        elif pivot_type == 'w':
            word_c.append(n_exposures)
            for kanji in scripts.unique_kanji(pivot):
                kanji_inc_dist.inc(kanji, n_exposures)

        else:
            raise ValueError('unknown pivot type: %s' % pivot_type)

    return [
        ('Words', mean(word_c)),
        ('Kanji', mean(kanji_c)),
        ('Combined', mean(combined_c)),
        ('Kanji combined', mean(kanji_inc_dist.values())),
    ]
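A minimal usage sketch, assuming the original project's imports are in place (Django's connection, NLTK's FreqDist with its old inc() API, and a mean helper); the loop below is illustrative only:

# Hypothetical usage sketch (not part of the source project).
for label, value in get_mean_exposures_per_pivot():
    print('%s: %.2f' % (label, value))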
Example #2
def get_mean_exposures_per_pivot():
    "Returns the number of exposures each pivot received."
    cursor = connection.cursor()
    cursor.execute("""
        SELECT pivot, pivot_type, COUNT(*) as n_exposures
        FROM drill_question
        GROUP BY CONCAT(pivot, '|', pivot_type)
    """)
    data = cursor.fetchall()
    word_c = []
    kanji_c = []
    combined_c = []
    kanji_inc_dist = FreqDist()
    for pivot, pivot_type, n_exposures in data:
        combined_c.append(n_exposures)

        if pivot_type == 'k':
            kanji_c.append(n_exposures)
            kanji_inc_dist.inc(pivot, n_exposures)

        elif pivot_type == 'w':
            word_c.append(n_exposures)
            for kanji in scripts.unique_kanji(pivot):
                kanji_inc_dist.inc(kanji, n_exposures)

        else:
            raise ValueError('unknown pivot type: %s' % pivot_type)

    return [
        ('Words', mean(word_c)),
        ('Kanji', mean(kanji_c)),
        ('Combined', mean(combined_c)),
        ('Kanji combined', mean(kanji_inc_dist.values())),
    ]
Example #3
def _embellish(response_data):
    """Adds kanji contained in words as kanji exposed."""
    kanji_script = scripts.Script.Kanji
    for pivot, pivot_type, is_correct, timestamp in response_data:
        yield (pivot, pivot_type, is_correct, timestamp)
        if pivot_type == 'w' and scripts.contains_script(kanji_script, pivot):
            for kanji in scripts.unique_kanji(pivot):
                yield kanji, 'k', is_correct, timestamp
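A short sketch of what _embellish() yields, using hypothetical response rows; the kanji order within a word is whatever scripts.unique_kanji() returns, since it produces a set:

# Hypothetical rows: (pivot, pivot_type, is_correct, timestamp).
rows = [(u'日本', 'w', True, 0), (u'水', 'k', False, 1)]
for row in _embellish(rows):
    print(row)
# The word row is re-yielded first, then one 'k' row per kanji it
# contains, e.g. (u'日', 'k', True, 0) and (u'本', 'k', True, 0).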
Example #4
def validate(cls):
    for syllabus in cls.objects.all():
        kanji_set = set(
            k.kanji for k in lexicon_models.Kanji.objects.filter(
                partialkanji__syllabus=syllabus))
        for partial_lexeme in syllabus.partiallexeme_set.all():
            for lexeme_surface in partial_lexeme.surface_set.all():
                if not scripts.unique_kanji(
                        lexeme_surface.surface).issubset(kanji_set):
                    raise Exception('invalid surface')
Example #5
def validate(cls):
    for syllabus in cls.objects.all():
        kanji_set = set(
            k.kanji for k in lexicon_models.Kanji.objects.filter(
                partialkanji__syllabus=syllabus))
        for partial_lexeme in syllabus.partiallexeme_set.all():
            for lexeme_surface in partial_lexeme.surface_set.all():
                if not scripts.unique_kanji(
                        lexeme_surface.surface).issubset(kanji_set):
                    raise Exception('invalid surface')
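Both validate() variants perform the same check: every kanji used by a syllabus surface must belong to the syllabus's own kanji set. A self-contained sketch of that subset test, assuming the cjktools package these snippets rely on:

from cjktools import scripts

# The syllabus knows 日 and 本, but not 語 (hypothetical data).
kanji_set = set(u'日本')
print(scripts.unique_kanji(u'日本').issubset(kanji_set))   # True
print(scripts.unique_kanji(u'日本語').issubset(kanji_set))  # False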
Example #6
def create_kanji_graph(word_dict=None):
    # Avoid calling get_word_freq_dict() in the default argument: it would
    # run once at import time and the result would be shared across calls.
    if word_dict is None:
        word_dict = get_word_freq_dict()
    kgraph = nx.Graph()
    for rank, word in enumerate(get_common_word_list(), start=1):
        charpairs = permutations(
            scripts.unique_kanji(word_dict[word]['chars']), 2)
        wordweight = get_word_importance_by_rank(rank)
        for char1, char2 in charpairs:
            try:
                kgraph[char1][char2]['words'].append((word, rank))
                word_list = sorted(set(kgraph[char1][char2]['words']))
                weight = kgraph[char1][char2]['weight'] - wordweight
                kgraph.add_edge(char1, char2, words=word_list, weight=weight)
            except KeyError:
                kgraph.add_edge(char1,
                                char2,
                                words=[(word, rank)],
                                weight=-wordweight)

    os.chdir(BASE_PATH)
    with open('output/kanji_graph_data.txt', 'w', encoding='utf-8') as outfile:
        outfile.write("#Kanji Graph Data\n\n## DEGREE:\n\n")
        outfile.write(
            pp.pformat(sorted(kgraph.degree, key=lambda x: x[1],
                              reverse=True)))
        outfile.write("\n\n##CONNECTED COMPONENTS:\n\n")
        outfile.write(
            pp.pformat(
                sorted(nx.connected_components(kgraph), key=len,
                       reverse=True)))
        outfile.write("\n\n##CLUSTERING:\n\n")
        outfile.write(pp.pformat(sorted(nx.clustering(kgraph), reverse=True)))
        outfile.write("\n\n##CENTRALITY:\n\n")
        outfile.write(
            pp.pformat(sorted(nx.betweenness_centrality(kgraph),
                              reverse=True)))
        outfile.write("\n\n##EDGES:\n\n")
        outfile.write(
            pp.pformat(
                sorted(kgraph.edges(data=True),
                       key=lambda x: x[1],
                       reverse=True)))
        outfile.write("\n\n##MINIMUM SPANNING TREE:\n\n")
        outfile.write(
            pp.pformat(
                sorted(nx.minimum_spanning_tree(kgraph).edges(data=True))))
        print('Wrote graph data to ' +
              os.path.join(BASE_PATH, 'output/kanji_graph_data.txt'))

    return kgraph
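A hypothetical usage sketch: because each edge's weight is a negative sum of word-importance scores, the most strongly linked kanji pair is the minimum-weight edge:

# Hypothetical usage (assumes get_word_freq_dict() and friends resolve).
g = create_kanji_graph()
char1, char2, attrs = min(g.edges(data=True), key=lambda e: e[2]['weight'])
print(char1, char2, attrs['weight'], len(attrs['words']))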
Example #7
def update_kanji_db():
    # Rank tiers, each mapped to the MongoDB array field its words go into.
    tiers = [(500, 'top500words'), (1000, 'top1000words'),
             (3000, 'top3000words'), (5000, 'top5000words'),
             (10000, 'top10000words'), (15000, 'top15000words'),
             (20000, 'top20000words')]
    for rank, word in enumerate(get_common_word_list(), start=1):
        if rank > 20000:
            # Words past rank 20000 are not stored; stop the whole scan
            # (the original `break` only exited the inner kanji loop).
            break
        field = next(name for limit, name in tiers if rank <= limit)
        for char in scripts.unique_kanji(word):
            db.kanji.update({'char': char},
                            {'$addToSet': {field: word}},
                            upsert=True)
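A quick hypothetical check of the result using PyMongo's find_one(); the sample kanji is an illustrative assumption:

# Hypothetical verification sketch (not from the source project).
update_kanji_db()
doc = db.kanji.find_one({'char': u'水'})
if doc is not None:
    print(doc.get('top500words', []))

Example #8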
def _store_word_surfaces(syllabus, syllabus_bundle):
    """
    Aligns the word surfaces with JMdict. We also use our known kanji set,
    replacing any unknown kanji with their readings. If this results in a
    surface known to JMdict, then we add that surface to the lexeme's list.
    """
    _log.start('Building lexeme surfaces', nSteps=2)

    _store_reduced_surfaces(syllabus, syllabus_bundle)

    _log.log('Adding non-syllabus surfaces which match')
    for partial_lexeme in syllabus.partiallexeme_set.all():
        # Only add new surfaces if we had no matches
        if partial_lexeme.surface_set.count() > 0:
            continue

        for lexeme_surface in partial_lexeme.lexeme.surface_set.all():
            if scripts.unique_kanji(lexeme_surface.surface).issubset(
                    syllabus_bundle.chars):
                partial_lexeme.surface_set.add(lexeme_surface)

    _log.finish()
Example #9
def _store_reduced_surfaces(syllabus, syllabus_bundle):
    """
    We may have some but not all of the kanji found in a surface available
    as part of the syllabus. In these cases, we see if variants of the surface
    which don't use the missing kanji are also available.
    """
    _log.start('Finding reduced surfaces', nSteps=1)
    n_aligned = 0
    for alignment in syllabus_bundle.alignments:
        if not alignment.has_kanji():
            continue

        # Find the word which this alignment represents.
        try:
            partial_lexeme = syllabus.partiallexeme_set.filter(
                    lexeme__reading_set__reading=alignment.phoneme).get(
                    lexeme__surface_set__surface=alignment.grapheme)
        except Exception:
            # No unique lexeme matches this alignment; skip it.
            continue

        reading = partial_lexeme.reading_set.get(reading=alignment.phoneme)
        surface = partial_lexeme.lexeme.surface_set.get(
            surface=alignment.grapheme)

        if scripts.unique_kanji(surface.surface).issubset(
                    syllabus_bundle.chars):
            partial_lexeme.surface_set.add(surface)
            syllabus.alignment_set.get_or_create(
                    reading=reading,
                    surface=surface,
                    alignment=alignment.short_form(),
                )
            n_aligned += 1

    _log.finish('%d reduced surfaces' % n_aligned)
Example #10
def get_list(list_name):
    "Returns the kanji in the given list."
    f = cjkdata.get_resource('lists/char/%s' % list_name)
    with codecs.open(f, 'r', 'utf8') as istream:
        return scripts.unique_kanji(istream.read())
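A hypothetical call; the list name below is a placeholder for whatever files ship under cjkdata's lists/char/ directory:

# 'jp_jouyou' is a placeholder list name, not a confirmed resource.
chars = get_list('jp_jouyou')
print('%d kanji loaded' % len(chars))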
Example #11
def get_list(list_name):
    """Returns the kanji in the given list."""
    f = cjkdata.get_resource('lists/char/%s' % list_name)
    with codecs.open(f, 'r', 'utf8') as istream:
        return scripts.unique_kanji(istream.read())