def test_can_get_reading_word_by_word(self):
    """Each word produced by wordByWord should carry its expected reading."""
    # Particles and katakana words are expected to come back with an
    # empty reading; kanji words get their hiragana reading.
    expected = {
        u'来週': u'らいしゅう',
        u'から': u'',
        u'テスト': u'',
        u'が': u'',
        u'始まる': u'はじまる',
    }
    for word, reading in MeCab().wordByWord(u'来週からテストが始まる。'):
        # Words outside the table (e.g. punctuation) are not checked,
        # matching the original if/elif chain which skipped them.
        if word in expected:
            assert reading == expected[word]
def lookup_item(key):
    """Looks up item definition, translations, readings, examples and so on"""
    # Incoming key arrives as UTF-8 bytes; work in unicode internally.
    key = unicode(key, "utf-8")
    # TODO: profile & move to controller|processor, etc
    # TODO: supported language list (dict?)
    # Detector language codes accepted for each supported language.
    supported = {
        # Single kanji characters may be recognised as Chinese
        "Japanese": set(["ja", "zh"]),
        "English": set(["en"]),
        # Some words are nearly identical in Ukrainian
        "Russian": set(["ru", "uk"]),
    }
    # TODO: 'Processor' module to use in scheduler
    # TODO: language detector (in 'Processor'?)
    # The detector may return several candidate languages.
    detected = set(Language().detect(key))
    # Guard clause: bail out unless at least one candidate is Japanese.
    if not detected.intersection(supported["Japanese"]):
        return {"result": "error", "reason": "Unsupported language"}
    # Fetch up to 10 usage examples, then annotate each with readings.
    examples = Weblio().examples(key, 10)
    mecab = MeCab()
    # TODO: double check, that everything is in unicode
    # TODO: stopped working, check why
    results = [
        {
            "example": example,
            "readings": mecab.wordByWord(example),
            "translation": translation,
        }
        for example, translation in examples
    ]
    # TODO: add another (optional) key to route -> response type, json|html
    return render("lookup", term=key, examples=results)
def addItem(self, key):
    """Add a pending item to the DB.

    Returns the saved Key on success, or None when the key already
    exists or its language is not supported.
    """
    key = unicode(key, 'utf-8')
    # Check if key already exists.  The previous test (`len(...) == 1`)
    # only caught *exactly one* existing copy, so once duplicates crept
    # in, further duplicates would keep being inserted.  Any positive
    # count means the key is already stored.
    if Key.objects(value=key).count() > 0:
        return None
    # Detect language and keep only the ones we support.
    detected = set(Language().detect(key))
    supported = [
        lang for lang in languages
        if detected.intersection(languages.get(lang))
    ]
    # Unsupported language -> nothing to add.
    if not supported:
        return None
    # NB: should deinflect to baseform (not so simple, actually!)
    item = Key(value=key, lang=supported.pop())
    if item.lang == 'Japanese':
        # Set tags
        item.tags = ['testing']
        # Detect part of speech
        # NB: should probably do this in POST-PROCESSING
        item.pos = MeCab().partOfSpeech(item.value)
        # TODO: get type (somehow, based on pos)
        # TODO: if noun & 1 symbol == kanji, if two and more = word...
        if len(item.value) == 1:
            item.category = 'kanji'
        elif item.pos == '':
            item.category = 'compound'
        else:
            item.category = 'word'
    # Unprocessed item
    item.status = 'new'
    item.save()
    return item
def process(self, category='kanji', limit=100):
    """Process all new & unprocessed kanji keys.

    For each unprocessed Key of the given category, builds a Gloss with
    the key's reading, collects usage Facts from WordNet (falling back
    to Jisho when fewer than 2 usages were found), and links everything
    together before marking the key processed.
    """
    wn = Wordnet()
    mc = MeCab()
    ji = Jisho()
    # 0. Find unprocessed kanji key
    try:
        # timeout(False) keeps the Mongo cursor alive for long batches.
        for key in Key.objects(
            category=category, status='new'
        ).timeout(False).limit(limit):
            print 'Processing ', key.value
            # 0a. Get reading for kanji itself
            key_reading = mc.reading(key.value)
            key_gloss = Gloss()
            key_gloss.readings.update({'default': key_reading})
            key_gloss.save()
            # 0b. Initialize corresponding Fact
            key_fact = Fact(key=key, gloss=key_gloss)
            # 1. Get usages from WordNet
            words = wn.complete(key.value)
            if words:
                # Cap at 7 candidate words per key.
                for word in words[:7]:
                    # 2. Check, if reading is found
                    reading = mc.reading(word)
                    if(not reading):
                        continue
                    # 3. Check, if definition is found
                    definitions = wn.lookup(word)
                    if(not definitions):
                        continue
                    # 4. Create new Key and corresponding Fact entities
                    try:
                        # Check if such item already exists; reuse its Fact.
                        existing_key = Key.objects.get(value=word)
                        fact = existing_key.fact
                    except (DoesNotExist, MultipleObjectsReturned):
                        # 5a. Create Gloss entity for most common definitions
                        gloss = Gloss()
                        # No more than 2-4 definitions!
                        for definition in definitions[:3]:
                            gloss.translations.append(definition['gloss'])
                        gloss.readings.update({'default': reading})
                        gloss.save()
                        # 5b. Create corresponding key & fact
                        new_key = Key(
                            value=word, category='word', tags=['minor']
                        ).save()
                        fact = Fact(key=new_key, gloss=gloss).save()
                        new_key.fact = fact
                        new_key.status = 'processed'
                        new_key.save()
                    # TODO: add synonyms based on 'words'?
                    # TODO: parse components?
                    # TODO: find advanced examples?
                    # 6. Link fact to key-fact as usages
                    key_fact.usages.append(fact)
            # 1a. If still no usages found (or not enough), fall back to Jisho
            if len(key_fact.usages) < 2:
                words = ji.define(key.value, 7)
                for word, info in words:
                    # 4. Create new Key and corresponding Fact entities
                    try:
                        # Check if such item already exists; reuse its Fact.
                        existing_key = Key.objects.get(value=word)
                        fact = existing_key.fact
                    except (DoesNotExist, MultipleObjectsReturned):
                        # 5a. Create Gloss entity for most common definitions
                        gloss = Gloss()
                        gloss.translations.append(info['meaning'])
                        gloss.readings.update({'default': info['kana']})
                        gloss.save()
                        # 5b. Create corresponding key & fact
                        new_key = Key(
                            value=word, category='word', tags=['minor']
                        ).save()
                        fact = Fact(key=new_key, gloss=gloss).save()
                        new_key.fact = fact
                        new_key.status = 'processed'
                        new_key.save()
                    # 6. Link fact to key-fact as usages
                    key_fact.usages.append(fact)
            # 7. Save key fact and corresponding key (bi-directional link)
            key_fact.save()
            key.fact = key_fact
            if len(key_fact.usages) > 0:
                # todo: if still nothing found -> lookup in names
                # dictionary (jisho)
                key.status = 'processed'
                # NOTE(review): key.save() appears to run only when usages
                # were found — keys with zero usages keep status 'new' and
                # their fact link is not persisted; confirm this is intended.
                key.save()
            print 'Total usages: ', len(key.usages())
            print '----------------'
    except OperationFailure as e:
        print 'There was an error querying mongo db: %s' % e
def test_can_get_sentence_reading_in_hiragana(self):
    """A full sentence reading should come back entirely in hiragana."""
    sentence = u'来週からテストが始まる。'
    expected = u'らいしゅうからてすとがはじまる。'
    assert MeCab().reading(sentence) == expected
def test_can_get_single_kanji_reading(self):
    """A lone kanji should resolve to its reading."""
    reading = MeCab().reading(u'音')
    assert reading == u'おと'
def test_can_get_sentence_reading_in_katakana(self):
    """With hiragana conversion disabled the raw katakana reading is kept."""
    sentence = u'来週からテストが始まる。'
    expected = u'ライシュウカラテストガハジマル。'
    assert MeCab().reading(sentence, hiragana=False) == expected