def get_mean_exposures_per_pivot():
    "Returns the number of exposures each pivot received."
    cursor = connection.cursor()
    # Group on (pivot, pivot_type) pairs via a concatenated key.
    cursor.execute("""
        SELECT pivot, pivot_type, COUNT(*) AS n_exposures
        FROM drill_question
        GROUP BY CONCAT(pivot, "|", pivot_type)
    """)
    data = cursor.fetchall()
    word_c = []
    kanji_c = []
    combined_c = []
    kanji_inc_dist = FreqDist()
    for pivot, pivot_type, n_exposures in data:
        combined_c.append(n_exposures)
        if pivot_type == 'k':
            kanji_c.append(n_exposures)
            kanji_inc_dist.inc(pivot, n_exposures)
        elif pivot_type == 'w':
            word_c.append(n_exposures)
            # A word exposure also counts as an exposure of each unique
            # kanji the word contains.
            for kanji in scripts.unique_kanji(pivot):
                kanji_inc_dist.inc(kanji, n_exposures)
        else:
            raise ValueError('unknown pivot type: %s' % pivot_type)

    return [
        ('Words', mean(word_c)),
        ('Kanji', mean(kanji_c)),
        ('Combined', mean(combined_c)),
        ('Kanji combined', mean(kanji_inc_dist.values())),
    ]
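# A minimal usage sketch. The helper name is hypothetical; it assumes the
# same Django `connection`, `mean` and NLTK-style `FreqDist` in scope above.
def report_mean_exposures():
    # Each entry is a (label, mean exposures) pair, printed one per line.
    for label, value in get_mean_exposures_per_pivot():
        print('%s: %.2f' % (label, value))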
def _embellish(response_data):
    """Adds kanji contained in words as kanji exposed."""
    kanji_script = scripts.Script.Kanji
    for pivot, pivot_type, is_correct, timestamp in response_data:
        yield (pivot, pivot_type, is_correct, timestamp)
        if pivot_type == 'w' and scripts.contains_script(kanji_script,
                                                         pivot):
            for kanji in scripts.unique_kanji(pivot):
                yield kanji, 'k', is_correct, timestamp
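# Sketch of _embellish in action, assuming the same `scripts` module used
# above (e.g. cjktools.scripts); the response rows are made-up examples.
example_responses = [
    (u'漢字', 'w', True, 1234567890),   # word containing kanji
    (u'かな', 'w', False, 1234567891),  # kana-only word: passed through as-is
]
for row in _embellish(example_responses):
    print(row)
# The first row yields itself plus one synthetic 'k' row per unique kanji
# (in set iteration order), each inheriting the word's correctness and
# timestamp; the kana-only row yields only itself.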
def validate(cls):
    """Checks every lexeme surface only uses kanji from its syllabus."""
    for syllabus in cls.objects.all():
        kanji_set = set(
            k.kanji for k in lexicon_models.Kanji.objects.filter(
                partialkanji__syllabus=syllabus))
        for partial_lexeme in syllabus.partiallexeme_set.all():
            for lexeme_surface in partial_lexeme.surface_set.all():
                if not scripts.unique_kanji(
                        lexeme_surface.surface).issubset(kanji_set):
                    raise Exception('invalid surface: %s' %
                                    lexeme_surface.surface)
def create_kanji_graph(word_dict=get_word_freq_dict()):
    # Note: the default word_dict is built once, when the module is imported,
    # and reused across calls.
    kgraph = nx.Graph()
    for rank, word in enumerate(get_common_word_list(), start=1):
        charpairs = permutations(
            scripts.unique_kanji(word_dict[word]['chars']), 2)
        wordweight = get_word_importance_by_rank(rank)
        for char1, char2 in charpairs:
            try:
                # Edge already exists: record the word and strengthen the
                # link. Weights accumulate as negative importances, so
                # stronger links have lower weight (see the MST below).
                kgraph[char1][char2]['words'].append((word, rank))
                word_list = sorted(set(kgraph[char1][char2]['words']))
                weight = kgraph[char1][char2]['weight'] - wordweight
                kgraph.add_edge(char1, char2, words=word_list, weight=weight)
            except KeyError:
                kgraph.add_edge(char1, char2, words=[(word, rank)],
                                weight=-wordweight)

    os.chdir(BASE_PATH)
    with open('output/kanji_graph_data.txt', 'w',
              encoding='utf-8') as outfile:
        outfile.write("# Kanji Graph Data\n\n## DEGREE:\n\n")
        outfile.write(pp.pformat(
            sorted(kgraph.degree, key=lambda x: x[1], reverse=True)))
        outfile.write("\n\n## CONNECTED COMPONENTS:\n\n")
        outfile.write(pp.pformat(
            sorted(nx.connected_components(kgraph), key=len, reverse=True)))
        outfile.write("\n\n## CLUSTERING:\n\n")
        outfile.write(pp.pformat(sorted(nx.clustering(kgraph), reverse=True)))
        outfile.write("\n\n## CENTRALITY:\n\n")
        outfile.write(pp.pformat(
            sorted(nx.betweenness_centrality(kgraph), reverse=True)))
        outfile.write("\n\n## EDGES:\n\n")
        outfile.write(pp.pformat(
            sorted(kgraph.edges(data=True), key=lambda x: x[1],
                   reverse=True)))
        outfile.write("\n\n## MINIMUM SPANNING TREE:\n\n")
        outfile.write(pp.pformat(
            sorted(nx.minimum_spanning_tree(kgraph).edges(data=True))))
    print('Wrote graph data to ' + BASE_PATH + 'output/kanji_graph_data.txt')
    return kgraph
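# Why the weights above are negative: nx.minimum_spanning_tree() minimises
# total edge weight, so accumulating *negative* word importances makes it
# act as a maximum spanning tree over co-occurrence strength. A toy check
# with made-up weights:
import networkx as nx

toy = nx.Graph()
toy.add_edge('A', 'B', weight=-3.0)  # strongest co-occurrence
toy.add_edge('B', 'C', weight=-1.0)
toy.add_edge('A', 'C', weight=-0.5)  # weakest link; the MST drops it
assert ('A', 'C') not in nx.minimum_spanning_tree(toy).edges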
# Rank thresholds and the MongoDB field each bucket is stored under.
RANK_BUCKETS = [
    (500, 'top500words'),
    (1000, 'top1000words'),
    (3000, 'top3000words'),
    (5000, 'top5000words'),
    (10000, 'top10000words'),
    (15000, 'top15000words'),
    (20000, 'top20000words'),
]


def update_kanji_db():
    for rank, word in enumerate(get_common_word_list(), start=1):
        if rank > 20000:
            # The word list is rank-ordered; nothing beyond here is stored.
            break
        for char in scripts.unique_kanji(word):
            # File the word under the smallest bucket its rank fits into.
            for threshold, field in RANK_BUCKETS:
                if rank <= threshold:
                    db.kanji.update({'char': char},
                                    {'$addToSet': {field: word}},
                                    upsert=True)
                    break
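# Collection.update() was deprecated in PyMongo 3.0 and removed in 4.0; a
# hedged modern equivalent of one bucket update above would be:
#
#   db.kanji.update_one({'char': char},
#                       {'$addToSet': {field: word}},
#                       upsert=True)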
def _store_word_surfaces(syllabus, syllabus_bundle):
    """
    Aligns the word surfaces with JMdict. We also use our known kanji set,
    replacing any unknown kanji with their readings. If this results in a
    surface known to JMdict, then we add that surface to the lexeme's list.
    """
    _log.start('Building lexeme surfaces', nSteps=2)
    _store_reduced_surfaces(syllabus, syllabus_bundle)

    _log.log('Adding non-syllabus surfaces which match')
    for partial_lexeme in syllabus.partiallexeme_set.all():
        # Only add new surfaces if we had no matches.
        if partial_lexeme.surface_set.count() > 0:
            continue

        for lexeme_surface in partial_lexeme.lexeme.surface_set.all():
            if scripts.unique_kanji(lexeme_surface.surface).issubset(
                    syllabus_bundle.chars):
                partial_lexeme.surface_set.add(lexeme_surface)

    _log.finish()
def _store_reduced_surfaces(syllabus, syllabus_bundle):
    """
    We may have some but not all of the kanji found in a surface available
    as part of the syllabus. In these cases, we see if variants of the
    surface which don't use the missing kanji are also available.
    """
    _log.start('Finding reduced surfaces', nSteps=1)
    n_aligned = 0
    for alignment in syllabus_bundle.alignments:
        if not alignment.has_kanji():
            continue

        # Find the word which this alignment represents; skip alignments
        # without a unique match. (Assumes the standard exceptions from
        # django.core.exceptions are imported.)
        try:
            partial_lexeme = syllabus.partiallexeme_set.filter(
                lexeme__reading_set__reading=alignment.phoneme).get(
                    lexeme__surface_set__surface=alignment.grapheme)
        except (ObjectDoesNotExist, MultipleObjectsReturned):
            continue

        reading = partial_lexeme.reading_set.get(reading=alignment.phoneme)
        surface = partial_lexeme.lexeme.surface_set.get(
            surface=alignment.grapheme)

        if scripts.unique_kanji(surface.surface).issubset(
                syllabus_bundle.chars):
            partial_lexeme.surface_set.add(surface)
            syllabus.alignment_set.get_or_create(
                reading=reading,
                surface=surface,
                alignment=alignment.short_form(),
            )
            n_aligned += 1

    _log.finish('%d reduced surfaces' % n_aligned)
def get_list(list_name):
    """Returns the kanji in the given list."""
    f = cjkdata.get_resource('lists/char/%s' % list_name)
    with codecs.open(f, 'r', 'utf8') as istream:
        return scripts.unique_kanji(istream.read())
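# Example use (the list name is hypothetical; pass whatever files exist
# under the cjkdata 'lists/char/' resource directory):
#
#   jouyou_kanji = get_list('jouyou')
#   print('%d kanji in list' % len(jouyou_kanji))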