Beispiel #1
0
 def deinflect(self, corpus, explain=False):
     '''Attempt to find the plain form of an inflected word or phrase.

     For every part of speech listed in Nihongo.STEMS, the table of the
     same name on Nihongo maps a form name to a list of endings.  Each
     ending is converted to kana and compared against the end of corpus.
     A match yields a candidate plain form: corpus with the matched
     ending replaced by the kana of Nihongo.STEMS[pos].  The first
     candidate for which self.db.find() returns a non-empty result whose
     first entry has the same part of speech is returned.

     corpus  -- the text to de-inflect.
     explain -- when true, return a (word, form) tuple instead of just
                the word; form is the name of the matching form, or None
                when nothing was confirmed.

     Returns the plain form, or corpus unchanged when no candidate is
     confirmed by the dictionary.
     '''
     candidates = [ ]
     for pos in Nihongo.STEMS:
         # Guard first: reading a missing table would raise
         # AttributeError before the diagnostic could be printed.
         if not hasattr(Nihongo, pos):
             print('no Nihongo attrib for pos %s' % pos)
             continue
         forms = getattr(Nihongo, pos)
         stem = romaji.kana(Nihongo.STEMS[pos])
         for form in forms:
             for variant in forms[form]:
                 kana = romaji.kana(variant)
                 if corpus.endswith(kana):
                     candidates.append( (pos, len(kana), stem, form) )
     # Stable sort, longest ending first; ties keep discovery order.
     candidates.sort(key=lambda c: c[1], reverse=True)
     # Candidates are tried from the back of the list, i.e. shortest
     # ending first.
     # NOTE(review): confirm shortest-first is intended rather than
     # longest-first.
     for pos, suffix_len, stem, form in reversed(candidates):
         # Slice by explicit end index: corpus[:-0] would be '' and throw
         # away the whole input when the matched ending is empty.
         word = corpus[:len(corpus) - suffix_len] + stem
         found = self.db.find(word)
         if found and found[0].is_pos(pos):
             if explain: return (word, form)
             return word
     if explain: return (corpus, None)
     return corpus
Beispiel #2
0
def katakanaize(hiragana):
    """
    Return the katakana spelling of a hiragana string.

    The conversion takes a detour through the Latin alphabet: the input
    is romanized with ``romaji.roma``, the romanization is upper-cased,
    and the upper-case text is handed back to ``romaji.kana``.

    NOTE(review): this relies on ``romaji.kana`` producing katakana for
    upper-case input -- confirm against the romaji module.

    """
    romanized = romaji.roma(hiragana)
    shouted = romanized.upper()
    return romaji.kana(shouted)
Beispiel #3
0
 def find(self, word, roma=True, kana=True, kanji=True):
     '''Look up entries equal to the given word (in kana or kanji).

     word  -- the text to search for; an empty word yields None.
     roma  -- when true and is_other() accepts the first character, the
              word is converted with romaji.kana() and the kana search
              is forced on.
     kana  -- search the self._reb0 index.
     kanji -- search the self._keb0 index.

     Both indexes are consulted by the first character of the word; the
     entries stored under it are kept when they compare equal to the
     word.  Returns a list of matches (kanji hits first), possibly empty.
     '''
     if not word:
         return None
     if roma and is_other(word[0]):
         word = romaji.kana(word)
         kana = True
     head = word[0]
     hits = [ ]
     if kanji and head in self._keb0:
         for entry in self._keb0[head]:
             if entry == word:
                 hits.append(entry)
     if kana and head in self._reb0:
         for entry in self._reb0[head]:
             if entry == word:
                 hits.append(entry)
     # search by english words in meanings is not yet supported
     return hits
Beispiel #4
0
    try:
        import psyco
        psyco.full()
    except:
        print '(no psyco acceleration available)'
        pass
    
    import os
    sys.stdout = codecs.lookup('utf-8')[-1](sys.stdout)

    # load the dictionary
    db = JmDict('JMdict_e')

    # self-test: look up a common word
    if False:
        term = romaji.kana('konnichiha')
        entries = db.find(term)
        print term, '=>'
        for e in entries:
            print u' %s' % e
        for k in e.keb: print u' kanji: %s' % k
        for r in e.reb: print u'  kana: %s' % r
        if e.is_uk(): print 'usually kana'
        print

    # prepare for dictionary work
    corpus = []
    source = 'corpus.txt'
    xl = Nihongo(db)

    # self-test: de-inflect some simple words via dictionary