def _check_syllabus(syllabus):
    """Checks for alignment errors in the syllabus."""
    prior_dist = models.PriorDist.objects.get(syllabus=syllabus,
            tag='reading | kanji')

    # Find every kanji grapheme segment whose aligned reading doesn't have
    # exactly one matching entry in the prior distribution.
    problems = []
    for partial_lexeme in syllabus.partiallexeme_set.all():
        for alignment in partial_lexeme.alignments:
            for g_seg, p_seg in alignment:
                if not scripts.contains_script(scripts.Script.Kanji, g_seg):
                    continue
                if prior_dist.density.filter(condition=g_seg,
                        symbol=p_seg).count() != 1:
                    problems.append((alignment.alignment, g_seg, p_seg))

    # Group the problem cases by alignment for reporting.
    problems_by_alignment = {}
    for alignment, g_seg, p_seg in problems:
        cases = problems_by_alignment.setdefault(alignment, [])
        cases.append((g_seg, p_seg))

    for alignment, cases in sorted(problems_by_alignment.iteritems()):
        _log.start(alignment, nSteps=len(cases))
        for g_seg, p_seg in cases:
            _log.log(u'No match for %s /%s/' % (g_seg, p_seg))
        _log.finish()
def _embellish(response_data):
    """Adds kanji contained in words as kanji exposed."""
    kanji_script = scripts.Script.Kanji
    for pivot, pivot_type, is_correct, timestamp in response_data:
        yield pivot, pivot_type, is_correct, timestamp

        # A response to a whole word also counts as exposure to each
        # distinct kanji the word contains.
        if pivot_type == 'w' and scripts.contains_script(kanji_script, pivot):
            for kanji in scripts.unique_kanji(pivot):
                yield kanji, 'k', is_correct, timestamp
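# Illustrative usage of _embellish() (example data made up here): a correct
# response to the word 勉強 also counts as kanji exposure for 勉 and 強.
# Since scripts.unique_kanji() returns a set, the kanji rows may come out
# in either order.
_rows = list(_embellish([(u'勉強', 'w', True, 1234567890)]))
# _rows now contains (u'勉強', 'w', True, 1234567890) together with
# (u'勉', 'k', True, 1234567890) and (u'強', 'k', True, 1234567890).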
def __init__(self):
    ConditionalFreqDist.__init__(self)
    kanji_script = scripts.Script.Kanji

    # For every grapheme segment which contains kanji, count how often each
    # reading (normalised to hiragana) occurs in the aligned EDICT data.
    i_stream = sopen(_edict_aligned_file, 'r')
    for line in i_stream:
        alignment = Alignment.from_line(line)
        for g, p in alignment:
            if scripts.contains_script(kanji_script, g):
                self[g].inc(scripts.to_hiragana(p))
    i_stream.close()
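# A usage sketch, doubly hedged: the enclosing class name doesn't appear in
# this excerpt (call it ReadingDist here), and NLTK-style
# ConditionalFreqDist/FreqDist semantics are assumed.  Once built, the
# distribution answers reading-frequency queries directly:
#
#   dist = ReadingDist()
#   dist[u'生'].max()          # most frequent reading of 生
#   dist[u'生'].freq(u'せい')  # relative frequency of the /せい/ reading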
def _populate_stacks(lexeme_node, lexeme_id, lexeme_surface_stack,
        lexeme_sense_stack, lexeme_reading_stack):
    surface_set = set(n.find('keb').text.upper()
            for n in lexeme_node.findall('k_ele'))
    reading_list = [n.find('reb').text for n in lexeme_node.findall('r_ele')]
    sense_list = lexeme_node.findall('sense')

    if not (reading_list and sense_list):
        print "Warning: lexeme is missing crucial data"
        return

    # If we have no kanji, the kana becomes the surface form
    if not surface_set:
        surface_set = set(reading_list)

    in_lexicon = True  # all these surfaces are from the original lexicon
    for surface in sorted(surface_set):
        lexeme_surface_stack.append((
                lexeme_id,
                surface.upper(),
                scripts.contains_script(scripts.Script.Kanji, surface),
                in_lexicon,
            ))

    for reading in reading_list:
        lexeme_reading_stack.append((lexeme_id, reading))

    # Keep only English glosses; ElementTree stores xml:lang under a
    # namespaced key, so we look for the attribute key ending in "lang".
    is_first_sense = True
    for sense_node in sense_list:
        for gloss in sense_node.findall('gloss'):
            (lang_key,) = [key for key in gloss.keys()
                    if key.endswith('lang')]
            if gloss.get(lang_key) != 'eng':
                continue
            lexeme_sense_stack.append((lexeme_id, gloss.text,
                    is_first_sense))
            is_first_sense = False
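# A minimal sketch of driving _populate_stacks() with a JMdict-style
# <entry> element (the sample entry is made up for illustration).  Note
# that ElementTree expands xml:lang into a namespaced attribute key ending
# in "lang", which is exactly what the gloss filter above matches on.
from xml.etree import cElementTree as ElementTree

_entry = ElementTree.fromstring(
        (u'<entry>'
         u'<k_ele><keb>水</keb></k_ele>'
         u'<r_ele><reb>みず</reb></r_ele>'
         u'<sense><gloss xml:lang="eng">water</gloss></sense>'
         u'</entry>').encode('utf8'))
_surfaces, _senses, _readings = [], [], []
_populate_stacks(_entry, 1, _surfaces, _senses, _readings)
# _surfaces == [(1, u'水', True, True)]
# _readings == [(1, u'みず')]
# _senses   == [(1, u'water', True)]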
def has_kanji(self):
    return scripts.contains_script(scripts.Script.Kanji, self.surface)
def has_kanji(self):
    return contains_script(Script.Kanji, self.grapheme)