def __solve_kana(self):
    """Advance each current branch across a kana character of the word.

    For every branch in the current set, the reading character the branch
    would consume next is compared with the current word character. Hiragana
    and katakana are treated as equivalent: the reading character is
    converted to the script of the word character before comparing. On a
    match, a new branch holding a one-character segment is appended to the
    branch list for the next word position and the usable-branch counter is
    incremented.
    """
    for parent in self.__current_branches:
        # Offset into the reading at which this branch resumes.
        pos = parent.next_reading
        if pos >= len(self.reading):
            # This branch has already consumed the entire reading.
            continue

        candidate = self.reading[pos]

        # Bring the reading character into the same script as the word
        # character so that the equality test below ignores kana type.
        if is_kata(self.__w_char) and is_hira(candidate):
            candidate = hira_to_kata(candidate)
        if is_hira(self.__w_char) and is_kata(candidate):
            candidate = kata_to_hira(candidate)

        if self.__w_char != candidate:
            continue

        # The segment keeps the reading character exactly as written in the
        # reading (not the script-normalised copy used for comparison).
        kana_segment = Segment(None, self.__w_char, self.__w_char, 0,
                               self.reading[pos], self.__w_char)
        extended = Branch(parent, kana_segment)
        self.__branches_at[self.__w_index + 1].append(extended)
        self.__usable_branches += 1
def get_id(char, reading):
    """Return the database id of the reading `reading` of character `char`.

    `reading` may be written in either hiragana or katakana; both forms are
    looked up, so on and kun readings can be found with either script.

    Returns None when `reading` is empty (or None) or when no matching row
    exists in the `reading` table. When several rows match, the id of the
    first row returned by the database is used.
    """
    if not reading:
        return None

    # NOTE: this reconfigures the shared module-level connection on every
    # call; kept because other code may rely on rows being sqlite3.Row.
    conn.row_factory = sqlite3.Row

    # Get both the hiragana and katakana form of reading
    if tools.is_kata(reading[0]):
        k_reading = reading
        h_reading = tools.kata_to_hira(reading)
    else:
        h_reading = reading
        k_reading = tools.hira_to_kata(reading)

    # Only the id of the first match is needed, so select just that column
    # and fetch a single row instead of materialising every match.
    s = "SELECT id FROM reading WHERE character=? and (reading=? or reading=?)"
    row = conn.execute(s, [char, h_reading, k_reading]).fetchone()
    if row is None:
        return None
    return row["id"]
def __solve_character(self):
    """Extend the current branches with dictionary readings of the current character.

    The readings of the current word character are loaded from the `reading`
    table and expanded with get_variants(). For every current branch and
    every variant, the upcoming part of the word's reading is compared with
    the variant (ignoring the hiragana/katakana distinction). When the whole
    variant matches, a new branch covering just this character is appended
    to the branch list for the next word position. If kana that directly
    follow the character in the word also occur in the matched reading, a
    second branch is appended whose segment covers the character plus that
    trailing kana. The usable-branch counter is incremented once per branch
    added.
    """
    # 々 repeats the preceding character, so look up that character's
    # readings instead.
    if self.__w_char == u'々' and self.__w_index > 0:
        q_char = self.word[self.__w_index-1]
    else:
        q_char = self.__w_char

    s = "select id, reading from reading where character=?"
    # Bind the parameter as a one-element tuple. Passing the bare string only
    # works while it happens to be a sequence of exactly one element.
    char_readings = c.execute(s, (q_char,)).fetchall()

    # These do not depend on the dictionary reading being examined.
    word = self.word[self.__w_index:]
    word_len = len(word)  # so we don't check ahead of it
    reading_len = len(self.reading)

    for cr in char_readings:
        dic_r = cr['reading']
        variants = get_variants(dic_r)

        for b in self.__current_branches:
            r_index = b.next_reading
            if r_index >= reading_len:
                # This branch has already consumed the entire reading.
                continue
            reading = self.reading[r_index:]

            for var in variants:
                r = var.reading
                tags = var.tags
                rl = var.length
                known_r = reading[:rl]
                kr_is_kata = is_kata(known_r[0])

                # If they're not both katakana, convert the katakana one
                # to hiragana so we can compare them.
                if kr_is_kata and not is_kata(r[0]):
                    known_r = kata_to_hira(known_r)
                elif not kr_is_kata and is_kata(r[0]):
                    r = kata_to_hira(r)

                # We want a complete match of reading to kanji reading
                # variant.
                match_length = 0
                # Trailing kana (after the kanji) in the word that is part
                # of the reading.
                w_trail = 0
                for (i, j) in zip(known_r, r):
                    if i == j:
                        match_length += 1
                        # Also check ahead for kana in the word that could
                        # belong to this reading.
                        if w_trail+1 < word_len and word[w_trail + 1] == i:
                            w_trail += 1

                if match_length != len(r):
                    continue

                # Branch whose segment covers only the current character.
                seg = Segment(tags, self.__w_char, dic_r, cr['id'],
                              reading[:rl], word[:1])
                n_branch = Branch(b, seg)
                self.__branches_at[self.__w_index+1].append(n_branch)
                self.__usable_branches += 1

                if w_trail > 0:
                    # Alternative branch whose segment also covers the
                    # trailing kana found in the word.
                    seg = Segment(tags, self.__w_char, dic_r, cr['id'],
                                  reading[:rl], word[:1+w_trail])
                    n_branch = Branch(b, seg)
                    self.__branches_at[self.__w_index+1+w_trail].append(n_branch)
                    self.__usable_branches += 1