Example #1
def _check_syllabus(syllabus):
    """Checks for alignment errors in the syllabus."""
    prior_dist = models.PriorDist.objects.get(syllabus=syllabus,
                                              tag='reading | kanji')
    problems = []
    for partial_lexeme in syllabus.partiallexeme_set.all():
        for alignment in partial_lexeme.alignments:
            for g_seg, p_seg in alignment:
                # Only kanji-bearing grapheme segments need a prior entry.
                if not scripts.contains_script(scripts.Script.Kanji, g_seg):
                    continue
                # Exactly one density row should match each segment pair.
                if prior_dist.density.filter(condition=g_seg,
                                             symbol=p_seg).count() != 1:
                    problems.append((alignment.alignment, g_seg, p_seg))

    # Group the problem cases by the alignment they came from.
    problems_by_alignment = {}
    for alignment, g_seg, p_seg in problems:
        last_cases = problems_by_alignment.setdefault(alignment, [])
        last_cases.append((g_seg, p_seg))

    for alignment, cases in sorted(problems_by_alignment.iteritems()):
        _log.start(alignment, nSteps=len(cases))
        for g_seg, p_seg in cases:
            _log.log(u'No match for %s /%s/' % (g_seg, p_seg))
        _log.finish()
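The grouping step uses dict.setdefault; collections.defaultdict expresses the same structure more directly (a minimal sketch over the problems list built above):

from collections import defaultdict

problems_by_alignment = defaultdict(list)
for alignment, g_seg, p_seg in problems:
    problems_by_alignment[alignment].append((g_seg, p_seg))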
Example #2
def _embellish(response_data):
    """Adds kanji contained in words as kanji exposed."""
    kanji_script = scripts.Script.Kanji
    for pivot, pivot_type, is_correct, timestamp in response_data:
        yield (pivot, pivot_type, is_correct, timestamp)
        # A word ('w') response also exposes each unique kanji it contains,
        # so emit a matching kanji ('k') row for every one of them.
        if pivot_type == 'w' and scripts.contains_script(kanji_script, pivot):
            for kanji in scripts.unique_kanji(pivot):
                yield kanji, 'k', is_correct, timestamp
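A quick look at what _embellish() yields, with hypothetical sample data (the word row comes first, then one 'k' row per unique kanji; kanji order is unspecified because unique_kanji() returns a set):

responses = [(u'漢字', 'w', True, 1234567890)]
for row in _embellish(responses):
    print row
# (u'漢字', 'w', True, 1234567890)
# (u'漢', 'k', True, 1234567890)
# (u'字', 'k', True, 1234567890)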
Example #3
    def __init__(self):
        ConditionalFreqDist.__init__(self)

        # Build a frequency distribution of hiragana readings for every
        # kanji-bearing grapheme segment in the aligned EDICT file.
        kanji_script = scripts.Script.Kanji
        i_stream = sopen(_edict_aligned_file, 'r')
        for line in i_stream:
            alignment = Alignment.from_line(line)
            for (g, p) in alignment:
                if scripts.contains_script(kanji_script, g):
                    self[g].inc(scripts.to_hiragana(p))
        i_stream.close()
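FreqDist.inc() is the older NLTK counting API; with current NLTK the equivalent update is an index increment (assuming, as here, that self is a ConditionalFreqDist):

self[g][scripts.to_hiragana(p)] += 1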
Example #5
def _populate_stacks(lexeme_node, lexeme_id, lexeme_surface_stack,
                     lexeme_sense_stack, lexeme_reading_stack):
    surface_set = set(n.find('keb').text.upper()
                      for n in lexeme_node.findall('k_ele'))
    reading_list = [n.find('reb').text
                    for n in lexeme_node.findall('r_ele')]
    sense_list = lexeme_node.findall('sense')

    if not (reading_list and sense_list):
        print "Warning: lexeme is missing crucial data"
        return

    # If we have no kanji, the kana becomes the surface form
    if not surface_set:
        surface_set = set(reading_list)

    in_lexicon = True  # All these surfaces are from the original lexicon
    for surface in sorted(surface_set):
        lexeme_surface_stack.append((
            lexeme_id,
            surface.upper(),
            scripts.contains_script(scripts.Script.Kanji, surface),
            in_lexicon,
        ))

    for reading in reading_list:
        lexeme_reading_stack.append((lexeme_id, reading))

    is_first_sense = True
    for sense_node in sense_list:
        for gloss in sense_node.findall('gloss'):
            (lang_key, ) = [
                key for key in gloss.keys() if key.endswith('lang')
            ]
            lang = gloss.get(lang_key)
            if lang != 'eng':
                continue
            lexeme_sense_stack.append((lexeme_id, gloss.text, is_first_sense))
            is_first_sense = False
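A minimal driver for _populate_stacks(), using a hypothetical JMdict-style entry (assumes a UTF-8 source file and the same scripts module as above):

import xml.etree.ElementTree as ElementTree

entry = ElementTree.fromstring(
    '<entry>'
    '<k_ele><keb>漢字</keb></k_ele>'
    '<r_ele><reb>かんじ</reb></r_ele>'
    '<sense><gloss xml:lang="eng">Chinese character</gloss></sense>'
    '</entry>'
)
surfaces, senses, readings = [], [], []
_populate_stacks(entry, 1, surfaces, senses, readings)
# surfaces == [(1, u'漢字', True, True)]
# readings == [(1, u'かんじ')]
# senses   == [(1, 'Chinese character', True)]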
Example #8
    def has_kanji(self):
        return scripts.contains_script(scripts.Script.Kanji, self.surface)
Example #9
    def has_kanji(self):
        return contains_script(Script.Kanji, self.grapheme)
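Both wrappers above delegate to contains_script(); it can equally be called directly (the import path below assumes the cjktools package these snippets appear to use):

from cjktools import scripts

scripts.contains_script(scripts.Script.Kanji, u'日本語')    # True
scripts.contains_script(scripts.Script.Kanji, u'にほんご')  # False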