Exemple #1
0
 def convertToKana(self):
     """Run the lookup field's text through ``romkan`` and refresh results.

     Reads ``self.lookup.text()``. When it is non-empty, the text is passed
     through ``romkan`` (presumably romaji -> kana; confirm against the
     import) and written back with ``setText``. Afterwards, if
     ``scripts.script_boundaries`` yields exactly one run and that run is
     hiragana, ``updateLookupResults`` is called with it. When the field is
     empty the results table is cleared instead.
     """
     
     inputLen = len(self.lookup.text())
     if inputLen > 0:
         
         # Placeholder branch: kanji input is detected but nothing is done yet.
         if scripts.script_type(self.lookup.text()) == scripts.Script.Kanji:
             pass    #TODO: ...
         #if re.search('n{1}', self.lookup.text()[ inputLen - 2: ]) is None:                            #NB: yes, regexp would be better, yet I failed miserably at it
         # Skip conversion while the text ends in 'n' or 'ny' -- presumably
         # because the syllable may still be incomplete (na, nya, ...).
         if self.lookup.text()[ inputLen - 1 ] != u'n' and self.lookup.text()[ inputLen - 2:] != u'ny':
             converted = romkan(self.lookup.text())      #NB: does not convert naninuneno, somehow (purportedly, 'n' normalization is to blame)
             self.lookup.setText(converted)
             #self.testConvert.setText(converted)
         # A trailing 'nn' is normalized with normalize_double_n before conversion.
         # NOTE(review): inputLen was measured before any setText above, while
         # text() is re-read here -- confirm the stale length is intended.
         if self.lookup.text()[ inputLen - 2:] == u'nn':
             converted = romkan(normalize_double_n(self.lookup.text()))
             self.lookup.setText(converted)
             
         #print self.lookup.text()
         #scripts.script_type(cluster) == scripts.Script.Kanji:
         
         # Only query when the whole field is a single hiragana run.
         if len(scripts.script_boundaries(self.lookup.text())) == 1:
             if scripts.script_type(self.lookup.text()) == scripts.Script.Hiragana:
                 self.updateLookupResults(self.lookup.text())
     else:
         # Empty input: drop every row from the results table.
         self.lookupResults.clearContents()
         self.lookupResults.setRowCount(0)
Exemple #2
0
    def _parse_line(self, line):
        """Parse a single line of the kanjidic file, returning an entry.

        The line is split into whitespace-delimited fields, with ``{...}``
        groups kept whole. The first field is the kanji, the second its JIS
        code in hexadecimal. The remaining fields are classified as glosses
        (``{...}``), readings (non-ASCII or starting with ``-``), ignored
        ``T1``/``T2`` markers, or one-letter-prefixed codes that are stored
        under ``remappings.get(code, code)``.

        Returns:
            A ``KanjidicEntry`` built from the collected fields.

        Raises:
            IndexError: if the line has fewer than two fields.
            ValueError: if the JIS code is not valid hexadecimal.
            KeyError: if no ``stroke_count`` or ``skip_code`` field was found.
            Exception: if a reading is neither katakana nor hiragana.
        """
        segment_pattern = re.compile('[^ {]+|{.*?}', re.UNICODE)
        segments = segment_pattern.findall(line.strip())

        info = {
            'kanji': segments[0],
            'gloss': [],
            'on_readings': [],
            'kun_readings': [],
            'jis_code': int(segments[1], 16),
        }

        for s in segments[2:]:
            if s.startswith('{'):
                info['gloss'].append(s[1:-1])

            elif (scripts.script_type(s) != scripts.Script.Ascii
                  or s.startswith('-')):
                # It must be a reading; a leading '-' is skipped when
                # deciding which script the reading is written in.
                char = s[1] if s[0] == '-' else s[0]
                char_script = scripts.script_type(char)

                if char_script == scripts.Script.Katakana:
                    info['on_readings'].append(s)
                elif char_script == scripts.Script.Hiragana:
                    info['kun_readings'].append(s)
                else:
                    raise Exception("Unknown segment %s" % s)

            elif s in ('T1', 'T2'):
                continue

            else:
                # Handle various codes: one-letter prefix plus payload.
                code, remainder = s[0], s[1:]
                try:
                    remainder = int(remainder)
                except ValueError:
                    # Non-numeric payload is kept as a string.
                    pass

                info.setdefault(remappings.get(code, code),
                                []).append(remainder)

        info['stroke_count'] = info['stroke_count'][0]
        if 'frequency' in info:
            info['frequency'] = info['frequency'][0]

        info['skip_code'] = tuple(
            int(i) for i in info['skip_code'][0].split('-'))

        return KanjidicEntry(**info)
Exemple #3
0
    def _parse_line(self, line):
        """Parse a single line of the kanjidic file, returning an entry.

        Fields are separated by spaces, except that ``{...}`` groups are
        kept as one field. Field 0 is the kanji and field 1 its hexadecimal
        JIS code. Every later field is either a gloss (``{...}``), a
        reading (non-ASCII, or prefixed with ``-``), an ignored ``T1``/``T2``
        marker, or a code whose first character selects the key (after
        ``remappings``) and whose remainder is the value.

        Returns:
            A ``KanjidicEntry`` built from the collected fields.

        Raises:
            IndexError: if the line has fewer than two fields.
            ValueError: if the JIS code is not valid hexadecimal.
            KeyError: if no ``stroke_count`` or ``skip_code`` field was found.
            Exception: if a reading is neither katakana nor hiragana.
        """
        segment_pattern = re.compile('[^ {]+|{.*?}', re.UNICODE)
        fields = segment_pattern.findall(line.strip())

        kanji = fields[0]
        jis_code = int(fields[1], 16)
        info = {
            'kanji':        kanji,
            'gloss':        [],
            'on_readings':  [],
            'kun_readings': [],
            'jis_code':     jis_code,
        }

        for s in fields[2:]:
            if s.startswith('{'):
                info['gloss'].append(s[1:-1])
                continue

            if (scripts.script_type(s) != scripts.Script.Ascii
                    or s.startswith('-')):
                # It must be a reading; look past a leading '-' to find
                # the first real character.
                char = s[1] if s[0] == '-' else s[0]
                char_script = scripts.script_type(char)

                if char_script == scripts.Script.Katakana:
                    info['on_readings'].append(s)
                elif char_script == scripts.Script.Hiragana:
                    info['kun_readings'].append(s)
                else:
                    raise Exception("Unknown segment %s" % s)
                continue

            if s in ('T1', 'T2'):
                continue

            # Handle various codes: one-letter prefix plus payload.
            code = s[0]
            remainder = s[1:]
            try:
                remainder = int(remainder)
            except ValueError:
                # Only a failed int() is expected here; the payload stays
                # a string in that case.
                pass

            info.setdefault(remappings.get(code, code), []).append(
                remainder)

        info['stroke_count'] = info['stroke_count'][0]
        if 'frequency' in info:
            info['frequency'] = info['frequency'][0]

        info['skip_code'] = tuple(int(i)
                                  for i in info['skip_code'][0].split('-'))

        return KanjidicEntry(**info)
Exemple #4
0
    def sample_seq_n_uniform(self, condition_segments, n, exclude_set=None):
        """Sample exactly ``n`` distinct segment sequences.

        For every kanji segment up to ``n`` symbols are drawn in random
        order from ``self.density`` rows whose ``condition`` is that
        segment; non-kanji segments are repeated verbatim. The per-segment
        columns are zipped into candidate sequences, and candidates whose
        joined string is already in ``exclude_set`` (or was already
        returned) are skipped.

        Args:
            condition_segments: iterable of string segments.
            n: number of sequences wanted.
            exclude_set: optional iterable of joined strings to avoid; it
                is copied, so the caller's collection is not modified.

        Returns:
            A list of exactly ``n`` tuples of segments.

        XXX Note: the loop does not terminate if fewer than ``n`` distinct
        candidates are available. Much less likely to hit this case than
        with non-uniform sampling.
        """
        exclude_set = set(exclude_set or [])
        results = []
        while len(results) < n:
            result_seg_sets = []
            for segment in condition_segments:
                if scripts.script_type(segment) == scripts.Script.Kanji:
                    rows = self.density.filter(
                        condition=segment).order_by('?').values('symbol')[:n]
                    result_seg_sets.append([o['symbol'] for o in rows])
                else:
                    result_seg_sets.append([segment] * n)

            for result_segs in zip(*result_seg_sets):
                flat_result = ''.join(result_segs)
                if flat_result in exclude_set:
                    continue
                results.append(result_segs)
                exclude_set.add(flat_result)
                if len(results) == n:
                    # A later round can yield more rows than are still
                    # needed; stop so that no more than n are returned.
                    break

        return results
Exemple #5
0
def canonical_segment_forms(segment, left_context=True, right_context=True):
    """
    Return the set of possible canonical forms of a single kana segment.

    The segment itself is always included. With right context, a segment
    of more than one character that ends in the sokuon for its script also
    yields one form per entry of the script's onbin list, each replacing
    the final character. With left context, if the first character is
    voiced, every form collected so far is additionally added with its
    first character mapped through ``from_voiced``.

    Raises ValueError when the segment's script has no sokuon mapping.
    """
    table = kana_table.KanaTable.get_cached()
    forms = {segment}
    script = scripts.script_type(segment)
    geminate_mark = _sokuon_map.get(script, None)
    euphonic_endings = _onbin_map.get(script, None)

    if geminate_mark is None:
        raise ValueError('Unsupported script type. '
                         'Segments must be hiragana or katakana')

    ends_geminated = (right_context and len(segment) > 1
                      and segment.endswith(geminate_mark))
    if ends_geminated:
        # Sound euphony may have replaced the final kana: restore each
        # candidate ending.
        stem = segment[:-1]
        for ending in euphonic_endings:
            forms.add(stem + ending)

    if left_context and table.is_voiced(segment[0]):
        # Sequential voicing may have applied: add the devoiced forms.
        devoiced = [from_voiced[form[0]] + form[1:] for form in forms]
        forms.update(devoiced)

    return forms
Exemple #6
0
def canonical_segment_forms(segment, left_context=True, right_context=True):
    """
    Enumerate the canonical forms a kana segment may stand for.

    The result always contains ``segment``. If ``right_context`` is set
    and the segment (longer than one character) ends with its script's
    sokuon, the final character is swapped for each onbin candidate. If
    ``left_context`` is set and the first character is voiced, a devoiced
    copy of every form gathered so far is added as well.

    Raises ValueError for segments whose script has no sokuon entry.
    """
    kana = kana_table.KanaTable.get_cached()
    candidates = set([segment])
    script_kind = scripts.script_type(segment)
    small_tsu = _sokuon_map.get(script_kind, None)
    onbin_kana = _onbin_map.get(script_kind, None)

    if small_tsu is None:
        raise ValueError('Unsupported script type. '
                         'Segments must be hiragana or katakana')

    if right_context:
        if len(segment) > 1 and segment.endswith(small_tsu):
            # Undo sound euphony: try every kana the sokuon may replace.
            head = segment[:-1]
            candidates.update(head + kana_char for kana_char in onbin_kana)

    if left_context:
        if kana.is_voiced(segment[0]):
            # Undo sequential voicing on each candidate.
            plain = [from_voiced[c[0]] + c[1:] for c in candidates]
            candidates.update(plain)

    return candidates
Exemple #7
0
    def parseWordToKanji(self):
        """Build an HTML summary of each distinct kanji in the selected word.

        The word is read from ``self.itemsMenu.wordInfo.text()`` and split
        into script runs. For every kanji not seen before, the entry from
        ``self.kdict`` contributes its kun readings, on readings and
        glosses. When the lookup fails, the bare kanji is emitted instead.

        Returns:
            The accumulated markup with trailing ``<br/>`` tags removed.
        """
        line_break = '<br/>'
        clusters = scripts.script_boundaries(self.itemsMenu.wordInfo.text())
        components = u''
        seen = set()

        for cluster in clusters:
            if scripts.script_type(cluster) != scripts.Script.Kanji:
                continue

            for kanji in cluster:
                if kanji in seen:
                    continue
                seen.add(kanji)

                # Only the lookup itself is guarded, so a failure can never
                # leave a half-written entry in the output.
                try:
                    lookup = self.kdict[kanji]
                    kun = lookup.kun_readings
                    on = lookup.on_readings
                    gloss = lookup.gloss
                except Exception:
                    components += kanji + '<br/>'
                    continue

                components += '<b>(' + kanji + ')</b>\t'

                if len(kun) > 0:
                    components += '<b>kun:</b>' + ', '.join(kun) + '\t'
                if len(on) > 0:
                    components += '<b>on:</b>' + ', '.join(on) + '<br/>'
                if len(gloss) > 0:
                    components += "<font style='font-family: Calibri; font-size: 11pt'>" + ", ".join(gloss) + "</font><br/>"

        # str.rstrip('<br/>') removes any trailing run of the characters
        # '<', 'b', 'r', '/', '>' and would also eat the '>' of a preceding
        # '</font>'. Strip the exact suffix instead.
        while components.endswith(line_break):
            components = components[:-len(line_break)]

        return components
Exemple #8
0
    def sample_seq_n_uniform(self, condition_segments, n, exclude_set=None):
        """Sample exactly ``n`` distinct segment sequences.

        Each kanji segment is replaced by up to ``n`` randomly ordered
        symbols taken from ``self.density`` rows filtered on that segment;
        any other segment is kept as is. Columns are zipped into candidate
        sequences, and a candidate is accepted only if its joined string
        is neither in ``exclude_set`` nor already accepted.

        Args:
            condition_segments: iterable of string segments.
            n: number of sequences wanted.
            exclude_set: optional iterable of joined strings to avoid; a
                private copy is used, the argument is left untouched.

        Returns:
            A list of exactly ``n`` tuples of segments.

        XXX Note: the loop does not terminate if fewer than ``n`` distinct
        candidates are available. Much less likely to hit this case than
        with non-uniform sampling.
        """
        exclude_set = set(exclude_set or [])
        results = []
        while len(results) < n:
            result_seg_sets = []
            for segment in condition_segments:
                if scripts.script_type(segment) == scripts.Script.Kanji:
                    query = self.density.filter(condition=segment)
                    sampled = query.order_by('?').values('symbol')[:n]
                    result_seg_sets.append([o['symbol'] for o in sampled])
                else:
                    result_seg_sets.append([segment] * n)

            for result_segs in zip(*result_seg_sets):
                if len(results) >= n:
                    # Earlier rounds may already have filled part of the
                    # quota; never return more than n sequences.
                    break
                flat_result = ''.join(result_segs)
                if flat_result not in exclude_set:
                    results.append(result_segs)
                    exclude_set.add(flat_result)

        return results
Exemple #9
0
    def lookup(self, query):
        """Look ``query`` up in ``self.edict``.

        Returns the dictionary entry, or ``None`` when:

        * ``config.ignore_kana()`` is on and the query consists of a single
          script that is hiragana or katakana;
        * the query is missing from ``self.edict`` (it is then recorded
          once in ``self.missed``);
        * ``config.ignore_duplicates()`` is on and the entry's word is
          already in ``self.stats`` (otherwise the word is recorded there).

        Only ``KeyError`` from the dictionary lookup is treated as "not
        found"; any other exception propagates to the caller instead of
        being silently discarded by a ``return`` inside ``finally``.
        """
        if self.config.ignore_kana():
            # Compare by value: ``is 1`` only works through int caching.
            if len(scripts.script_types(query)) == 1:
                kana_scripts = (scripts.Script.Hiragana,
                                scripts.Script.Katakana)
                if scripts.script_type(query) in kana_scripts:
                    return None

        try:
            found = self.edict[query]
        except KeyError:
            if query not in self.missed:
                self.missed.append(query)
            return None

        if self.config.ignore_duplicates():
            if found.word in self.stats:
                return None
            self.stats.append(found.word)

        return found
Exemple #10
0
def onbin_variants(kana_segment):
    """
    Return the sound euphony variants of a kana segment as a set.

    The segment itself is always a member. Segments longer than one
    character also contribute the form whose last character is replaced
    by the sokuon of the segment's script.
    """
    if len(kana_segment) <= 1:
        return {kana_segment}

    script = scripts.script_type(kana_segment)
    geminated = kana_segment[:-1] + _sokuon_map[script]
    return {kana_segment, geminated}
Exemple #11
0
def onbin_variants(kana_segment):
    """
    Collect the sound euphony variants of a kana segment.

    Returns a set holding the segment and, when it has more than one
    character, the same segment with its final character swapped for the
    sokuon that ``_sokuon_map`` lists for the segment's script.
    """
    forms = {kana_segment}
    if len(kana_segment) < 2:
        return forms

    small_tsu = _sokuon_map[scripts.script_type(kana_segment)]
    stem = kana_segment[:-1]
    forms.add(stem + small_tsu)
    return forms
Exemple #12
0
    def lookup(self, query):
        """Return the ``self.edict`` entry for ``query``, or ``None``.

        ``None`` is returned in three situations:

        * kana are ignored (``config.ignore_kana()``) and the query is
          written in exactly one script, hiragana or katakana;
        * ``self.edict`` has no such key, in which case the query is added
          to ``self.missed`` unless already present;
        * duplicates are ignored (``config.ignore_duplicates()``) and the
          entry's word was returned before; first-time words are appended
          to ``self.stats``.

        Exceptions other than the ``KeyError`` of a missing key are no
        longer swallowed by a ``return`` in ``finally``; they reach the
        caller.
        """
        if self.config.ignore_kana():
            single_script = len(scripts.script_types(query)) == 1
            if single_script and scripts.script_type(query) in (
                    scripts.Script.Hiragana, scripts.Script.Katakana):
                return None

        try:
            found = self.edict[query]
        except KeyError:
            if query not in self.missed:
                self.missed.append(query)
            return None

        if not self.config.ignore_duplicates():
            return found

        if found.word in self.stats:
            return None
        self.stats.append(found.word)
        return found
Exemple #13
0
 def sample_seq_n(self, condition_segments, n, exclude_set=None):
     """Sample ``n`` sequences from a ``SeqDist`` built over the segments.

     Every kanji segment becomes a ``ProbDist`` created from the
     ``self.density`` rows whose condition is that segment; every other
     segment is passed through unchanged. ``n`` and ``exclude_set`` are
     forwarded to ``SeqDist.sample_n``.
     """
     kanji_script = scripts.Script.Kanji

     def as_component(segment):
         # Non-kanji segments are handed to SeqDist as plain strings.
         if scripts.script_type(segment) != kanji_script:
             return segment
         rows = self.density.filter(condition=segment)
         return ProbDist.from_query_set(rows)

     components = [as_component(segment) for segment in condition_segments]
     return SeqDist(*components).sample_n(n, exclude_set)
Exemple #14
0
    def sample_seq_n(self, condition_segments, n, exclude_set=None):
        """Draw ``n`` samples from the sequence distribution of the segments.

        Kanji segments are turned into a ``ProbDist`` over the matching
        ``self.density`` rows; all other segments are used literally. The
        resulting components are wrapped in a ``SeqDist``, whose
        ``sample_n(n, exclude_set)`` result is returned.
        """
        kanji_script = scripts.Script.Kanji
        parts = []
        for segment in condition_segments:
            is_kanji = scripts.script_type(segment) == kanji_script
            if not is_kanji:
                parts.append(segment)
                continue
            matching = self.density.filter(condition=segment)
            parts.append(ProbDist.from_query_set(matching))

        sequence = SeqDist(*parts)
        return sequence.sample_n(n, exclude_set)
Exemple #15
0
def _get_kanji():
    """Fetches our canonical list of kanji to work with.

    The set is read once from ``settings.STROKE_SOURCE`` (UTF-8, two
    whitespace-separated fields per line, the first being the kanji) and
    cached on the function object as ``_get_kanji._cached``.

    Returns:
        The cached set of single-character strings.

    Raises:
        AssertionError: if a first field is not exactly one character of
            script Kanji or Unknown.
        ValueError: if a line does not split into exactly two fields.
    """
    if hasattr(_get_kanji, '_cached'):
        return _get_kanji._cached

    # Check for a kanji or hanzi; our Chinese data extends into the
    # E000-F8FF private use block, so an "Unknown" script is ok too.
    accepted_scripts = (scripts.Script.Kanji, scripts.Script.Unknown)

    kanji_set = set()
    with codecs.open(settings.STROKE_SOURCE, 'r', 'utf8') as istream:
        for line in istream:
            kanji, _rest = line.split()

            # Raised explicitly (not via ``assert``) so the check still
            # runs under ``python -O``; the exception type is unchanged.
            if len(kanji) != 1 or \
                    scripts.script_type(kanji) not in accepted_scripts:
                raise AssertionError(
                    'unexpected entry in stroke source: %r' % (kanji,))

            kanji_set.add(kanji)

    _get_kanji._cached = kanji_set
    return _get_kanji._cached
Exemple #16
0
    def process_response(self, request, response):
        """Wrap Japanese text runs of an HTML response in ``lang="ja"`` spans.

        The response is returned untouched unless its status code is 200,
        its Content-Type starts with ``text/html``, and its UTF-8 decoded
        content contains at least one script from ``self.japanese_scripts``.
        Otherwise each script run belonging to those scripts is wrapped in
        a span and the content is re-encoded as UTF-8.
        """
        if response.status_code != 200:
            return response

        content_type = response.get('Content-Type', '')
        if not content_type.startswith('text/html'):
            return response

        html = response.content.decode('utf8')
        present = scripts.script_types(html).intersection(
            self.japanese_scripts)
        if not present:
            return response

        wrapped = [
            '<span lang="ja" xml:lang="ja">%s</span>' % chunk
            if scripts.script_type(chunk) in self.japanese_scripts
            else chunk
            for chunk in scripts.script_boundaries(html)
        ]
        response.content = u''.join(wrapped).encode('utf8')

        return response
Exemple #17
0
def expand_long_vowels(kana_string):
    """
    Replace long-vowel marks with the vowel kana they lengthen.

    The string is processed one script run at a time. Runs that are
    neither hiragana nor katakana are copied unchanged. Kana runs are
    converted to hiragana, every ``_long_finder`` match is replaced by the
    vowel-line kana of the character before it, and the run is converted
    back to its original script.

        >>> a = expand_long_vowels(u'すー')
        >>> b = u'すう'
        >>> a == b
        True
    """
    restore_script = {
        scripts.Script.Hiragana: lambda x: x,
        scripts.Script.Katakana: scripts.to_katakana
    }

    table = kana_table.KanaTable.get_cached()

    pieces = []
    for segment in scripts.script_boundaries(kana_string):
        if not len(segment):
            continue

        char_type = scripts.script_type(segment)
        if char_type not in restore_script:
            pieces.append(segment)
            continue

        restore = restore_script[char_type]
        hira = scripts.to_hiragana(segment)

        # Matches are located on the run as it was before any replacement;
        # each replacement rewrites one position of the run.
        for match in _long_finder.finditer(hira):
            pos = match.start()
            vowel = table.to_vowel_line(hira[pos - 1])
            hira = hira[:pos] + vowel + hira[pos + 1:]

        pieces.append(restore(hira))

    return ''.join(pieces)
Exemple #18
0
def expand_long_vowels(kana_string):
    """
    Expand every long-vowel mark that can be expanded.

    Works run by run over ``scripts.script_boundaries``: non-kana runs
    pass through as they are; hiragana and katakana runs are normalized
    to hiragana, each position matched by ``_long_finder`` is overwritten
    with ``to_vowel_line`` of the preceding character, and the run is
    mapped back to the script it came from.

        >>> a = expand_long_vowels(u'すー')
        >>> b = u'すう'
        >>> a == b
        True
    """
    def keep_hiragana(text):
        return text

    back_converters = {scripts.Script.Hiragana: keep_hiragana,
                       scripts.Script.Katakana: scripts.to_katakana}

    table = kana_table.KanaTable.get_cached()

    expanded = []
    for run in scripts.script_boundaries(kana_string):
        if len(run) == 0:
            continue

        run_script = scripts.script_type(run)
        if run_script not in back_converters:
            expanded.append(run)
            continue

        to_original_script = back_converters[run_script]
        working = scripts.to_hiragana(run)

        # The iterator scans the pre-replacement text; ``working`` is
        # rebound after each substitution.
        for found in _long_finder.finditer(working):
            at = found.start()
            vowel = table.to_vowel_line(working[at-1])
            working = working[:at] + vowel + working[at+1:]

        expanded.append(to_original_script(working))

    return ''.join(expanded)
Exemple #19
0
def sift_nonj_characters(data, plain):
    """Remove from ``data`` every ASCII script run found in ``plain``.

    ``plain`` is split into script runs; each run classified as ASCII is
    deleted from ``data`` wherever it occurs, in the order the runs appear.
    Returns the resulting string.
    """
    ascii_runs = [run for run in scripts.script_boundaries(plain)
                  if scripts.script_type(run) is scripts.Script.Ascii]
    for run in ascii_runs:
        data = data.replace(run, '')
    return data
Exemple #20
0
def _is_kanji(kanji):
    """Return True if ``kanji`` is a single text character of script Kanji.

    On Python 2 only ``unicode`` objects qualify, as before. On Python 3,
    where the ``unicode`` name does not exist, ``str`` is used instead of
    failing with ``NameError``.
    """
    try:
        text_type = unicode  # noqa: F821 -- Python 2 text type
    except NameError:
        text_type = str

    return isinstance(kanji, text_type) and len(kanji) == 1 \
            and scripts.script_type(kanji) == scripts.Script.Kanji
Exemple #21
0
    def eventFilter(self, object, event):
        """Handle hover and mouse-press events for a word label.

        ``object`` is the watched widget; all popups live on
        ``object.parent()``.

        * HoverLeave: reset the label colour, hide every popup and reset
          the geometry of the ``info`` popup.
        * HoverEnter: highlight the label and show the ``info`` popup with
          the word, its reading (only when it differs from the word) and
          the ``rdk`` components of each kanji.
        * Middle button: show ``kanjiGroups`` with the kanji similar to
          each kanji of the word.
        * Right button: show ``kanjiInfo`` with readings and glosses of
          each kanji; a kanji whose lookup fails is listed on its own.
        * Left button: show ``allInfo`` with stroke-order animations (for
          kanji listed in the manifest file) and a translation.

        Always returns False (in Qt, the event is therefore not consumed).
        """
        line_break = '<br/>'
        owner = object.parent()

        if event.type() == QEvent.HoverLeave:
            object.setStyleSheet("QLabel { color: rgb(0, 0, 0); }")

            owner.info.hide()
            owner.allInfo.hide()
            owner.kanjiInfo.hide()
            owner.kanjiGroups.hide()

            desktop = QApplication.desktop().screenGeometry()
            owner.info.setGeometry(QRect(
                desktop.width() - H_INDENT - I_WIDTH - I_INDENT,
                desktop.height() - V_INDENT, I_WIDTH, I_HEIGHT))

        if event.type() == QEvent.HoverEnter:
            object.setStyleSheet("QLabel { color: rgb(0, 5, 255); }")

            owner.info.item.setText(object.text())

            reading = owner.srs.getWordPronunciationFromExample(object.text())
            if reading != object.text():
                owner.info.reading.setText(reading)
            else:
                owner.info.reading.setText(u'')

            # Parse the word: one line of components per kanji.
            components = []
            for cluster in scripts.script_boundaries(object.text()):
                if scripts.script_type(cluster) == scripts.Script.Kanji:
                    for kanji in cluster:
                        components.extend(owner.rdk[kanji])
                        components.append('\n')

            if len(components) > 0:
                components.pop()    # remove last '\n'
            owner.info.components.setText(' '.join(components))
            owner.info.show()

        if event.type() == QEvent.MouseButtonPress:
            # item context menu #
            if event.button() == Qt.MiddleButton:

                owner.info.hide()
                owner.allInfo.hide()
                owner.kanjiInfo.hide()

                resulting_info = u''
                # Ordered so groups are listed in the word's kanji order.
                kanji_groups = OrderedDict()

                for cluster in scripts.script_boundaries(object.text()):
                    if scripts.script_type(cluster) == scripts.Script.Kanji:
                        for kanji in cluster:
                            similar = owner.groups.findSimilarKanji(kanji)
                            try:
                                # Drop the kanji itself from its own group.
                                position = similar.index(kanji)
                                kanji_groups[kanji] = similar[:position] + similar[position + 1:]
                            except Exception:
                                kanji_groups[kanji] = owner.groups.findSimilarKanji(kanji)
                                log.debug(u'Not in group: ' + kanji)

                for kanji in kanji_groups:
                    resulting_info += kanji + u' ~\t'
                    for item in kanji_groups[kanji]:
                        lookup = owner.kjd[item]
                        resulting_info += " " + item + " <font style='font-family: Calibri; font-size: 12pt'>(" + lookup.gloss[0] + ")</font> "
                    resulting_info += '<br/>'

                if resulting_info == u'':
                    resulting_info = u'No such groups in Kanji.Odyssey!'
                owner.kanjiGroups.info.setText(resulting_info)

                owner.kanjiGroups.show()

            # kanji info #
            if event.button() == Qt.RightButton:

                owner.info.hide()
                owner.allInfo.hide()
                owner.kanjiGroups.hide()

                owner.kanjiInfo.info.setText(u'')

                resulting_info = u''

                for cluster in scripts.script_boundaries(object.text()):
                    if scripts.script_type(cluster) == scripts.Script.Kanji:
                        for kanji in cluster:
                            try:
                                lookup = owner.kjd[kanji]
                                kun = lookup.kun_readings
                                on = lookup.on_readings
                                gloss = lookup.gloss
                            except Exception:
                                # The fallback used to append to
                                # ``components``, which is unbound on this
                                # path and raised UnboundLocalError.
                                resulting_info += kanji + '<br/>'
                                continue

                            resulting_info += "<font style='font-family: " + Fonts.HiragiNoMyoutyouProW3 + "; font-size: 16.5pt'>(" + kanji + ")</font>\t"

                            if len(kun) > 0:
                                resulting_info += '<b>kun: </b>' + ', '.join(kun) + '\t'
                            if len(on) > 0:
                                resulting_info += '<b>on:</b>' + ', '.join(on) + '<br/>'
                            if len(gloss) > 0:
                                resulting_info += "<font style='font-family: Calibri; font-size: 12pt'>" + ", ".join(gloss) + "</font><br/>"

                if resulting_info != '':
                    if resulting_info.count('<br/>') > 7:
                        owner.kanjiInfo.setStyleSheet('QLabel { font-size: 13pt }')
                    # rstrip('<br/>') strips a character set and would
                    # also eat the '>' of '</font>'; remove the exact
                    # suffix instead.
                    while resulting_info.endswith(line_break):
                        resulting_info = resulting_info[:-len(line_break)]
                    owner.kanjiInfo.info.setText(resulting_info)
                else:
                    owner.kanjiInfo.info.setText(u'No such kanji in kanjidic!')
                owner.kanjiInfo.show()

            # translation and strokes info #
            if event.button() == Qt.LeftButton:

                owner.kanjiInfo.hide()
                owner.info.hide()
                owner.kanjiGroups.hide()

                unfillLayout(owner.allInfo.layout)
                owner.allInfo.layout.setMargin(1)

                kanjiList = []
                for cluster in scripts.script_boundaries(object.text()):
                    if scripts.script_type(cluster) == scripts.Script.Kanji:
                        for kanji in cluster:
                            kanjiList.append(kanji)

                # i: next free layout row, j: next free column in row 0.
                i = 0
                j = 0
                # kanji strokes
                if len(kanjiList) > 0:

                    # ``with`` closes the manifest even if read() fails.
                    with open(PATH_TO_RES + STROKES + KANJI_MANIFEST, 'r') as infile:
                        text = infile.read()

                    for kanji in kanjiList:
                        # Animations are named after the hex of the UTF-8 bytes.
                        hex_name = kanji.encode('utf-8').encode('hex')

                        if text.find(hex_name) != -1:

                            gif = QLabel()
                            gif.setSizePolicy(QSizePolicy.Expanding, QSizePolicy.Expanding)
                            gif.setAlignment(Qt.AlignCenter)

                            movie = QMovie(PATH_TO_RES + STROKES + hex_name + '.gif', QByteArray(), self)
                            movie.setCacheMode(QMovie.CacheAll)
                            movie.setSpeed(150)

                            gif.setMovie(movie)
                            owner.allInfo.layout.addWidget(gif, i, j)
                            j = j + 1
                            movie.start()

                    i = i + 1

                # words translation
                translations = QLabel(u'')
                translations.setFont(QFont('Calibri', 11))
                translations.setWordWrap(True)
                translations.setAlignment(Qt.AlignCenter)
                try:
                    search = owner.edict[owner.srs.getWordNonInflectedForm(object.text())]

                    translationText = u''

                    variants = search.senses_by_reading()[owner.srs.getWordPronounciation(owner.srs.getWordNonInflectedForm(object.text()))][:3]
                    variants = [sense for sense in variants if sense != '(P)']

                    translationText += '<b>' + owner.srs.getWordPronunciationFromExample(object.text()) + '</b>:\t' + ', '.join(variants)
                    translations.setText(translationText.rstrip('\n'))

                except Exception:
                    # edict failed: fall back to jmdict.
                    word_reading = owner.srs.getWordPronounciation(owner.srs.getWordNonInflectedForm(object.text()))
                    ### by reading
                    search = owner.jmdict.lookupTranslationByReadingJoin(word_reading, owner.options.getLookupLang())
                    if len(search) > 0:
                        if len(search) > 5:
                            search = search[:5]
                        translations.setText('<b>' + owner.srs.getWordPronunciationFromExample(object.text()) + '</b>:\t' + ', '.join(search))
                    ### by kanji
                    else:
                        search = owner.jmdict.lookupItemByReading(word_reading)
                        if len(search) > 0:
                            lookup = owner.jmdict.lookupItemTranslationJoin(search[0], owner.options.getLookupLang())
                            if len(lookup) > 5:
                                lookup = lookup[:5]
                            translations.setText('<b>' + owner.srs.getWordPronunciationFromExample(object.text()) + '</b>:\t' + ', '.join(lookup))
                    ### nothing found
                    if len(search) == 0:
                        translations.setText(u'Alas, no translation in edict or jmdict!')

                if i > 0:
                    separator = QFrame()
                    separator.setFrameShape(QFrame.HLine)
                    separator.setFrameShadow(QFrame.Sunken)
                    owner.allInfo.layout.addWidget(separator, i, 0, 1, j)
                    i = i + 1

                owner.allInfo.layout.addWidget(translations, i, 0, 1, j)

                owner.allInfo.update()
                owner.allInfo.show()

            elif owner.allInfo.isVisible():

                owner.allInfo.hide()
                owner.info.show()

        return False
0
 def addKanjiToStudy(self):
     """Add every kanji of the selected word to the database.

     The word is read from ``self.itemsMenu.wordInfo.text()``; each
     character of each kanji script run is passed to
     ``self.db.addKanjiToDb`` (repeated kanji are passed again).
     """
     clusters = scripts.script_boundaries(self.itemsMenu.wordInfo.text())
     for cluster in clusters:
         if scripts.script_type(cluster) != scripts.Script.Kanji:
             continue
         for kanji in cluster:
             self.db.addKanjiToDb(kanji)
Exemple #23
0
def sift_nonj_characters(data, plain):
    """Return ``data`` with the ASCII runs of ``plain`` removed.

    Each script run of ``plain`` that is classified as ASCII is replaced
    by the empty string throughout ``data``; other runs are ignored.
    """
    cleaned = data
    for chunk in scripts.script_boundaries(plain):
        if scripts.script_type(chunk) is not scripts.Script.Ascii:
            continue
        cleaned = cleaned.replace(chunk, '')
    return cleaned
Exemple #24
0
 def test_script_type_empty(self):
     """script_type('') is expected to equal Script.Unknown."""
     detected = scripts.script_type('')
     self.assertEqual(detected, scripts.Script.Unknown)