def forward_matching(text):
    """Yield every prefix length of ``text`` that is a candidate word.

    Length 1 is always yielded. A longer length is yielded only when the
    naive dictionary's ``query`` returns a truthy value for the UTF-8
    encoding of ``text[:length]``. Lengths never exceed
    ``min(len(text), <dictionary 'maxTermSize'>)``, so an empty ``text``
    yields nothing.

    Args:
        text: the text to scan; it is sliced by character and each slice is
            encoded with ``.encode('utf-8')`` before the dictionary lookup.

    Yields:
        int: candidate prefix lengths in increasing order.
    """
    # Fetch the dictionary once instead of on every iteration; the other
    # matchers in this file reuse a single dictionary object the same way.
    dictionary = DACTORY.getNaiveDictionary()
    max_length = min(len(text), dictionary.query('maxTermSize'))
    for length in range(1, max_length + 1):
        # A single character is always emitted as a result, because the
        # dictionary does not contain single-character entries.
        if length == 1 or dictionary.query(text[:length].encode('utf-8')):
            yield length
def ln_frequency_for_sigle_chars(unicode_text, triplets):
    """Sum the natural log of the frequency of each single-character word.

    ``triplets`` is walked as a sequence of end offsets into
    ``unicode_text``: each word spans from the previous offset (0 for the
    first word) up to the current one. Words of length exactly 1 are looked
    up in the character dictionary; characters for which ``query`` returns a
    falsy value are ignored.

    Args:
        unicode_text: the text the offsets refer to; slices of it are
            encoded with ``.encode('utf-8')`` before the lookup.
        triplets: iterable of increasing end offsets of consecutive words.

    Returns:
        float: the sum of ``math.log(int(freq))`` over the single-character
        words that have a frequency, or ``0.0`` when there are none (the
        previous ``reduce`` call raised ``TypeError`` on an empty list).
    """
    single_chars = []
    begin = 0
    for end in triplets:
        if end - begin == 1:
            single_chars.append(unicode_text[begin:end])
        begin = end
    if not single_chars:
        # Nothing to score, so the dictionary is not needed at all.
        return 0.0

    # Fetch the character dictionary once rather than once per character.
    char_dict = DACTORY.get_char_dictionary()
    ln_freq = []
    for char in single_chars:
        freq = char_dict.query(char.encode('utf-8'))
        if freq:
            ln_freq.append(math.log(int(freq)))
    # The 0.0 start value keeps the result a float and makes the empty case
    # well defined without changing any non-empty sum.
    return sum(ln_freq, 0.0)
def maximumMatching(text):
    """Segment ``text`` greedily from left to right.

    At each position the longest candidate (capped by the dictionary's
    'maxTermSize' entry and by the remaining text) is tried first and is
    shortened from the right until the dictionary's ``query`` returns a
    truthy value or only one character is left.

    Args:
        text: UTF-8 encoded input; it is decoded before being sliced.

    Returns:
        The terms in text order, each encoded back to UTF-8.
    """
    chars = text.decode('utf-8')
    total = len(chars)
    dictionary = DACTORY.getNaiveDictionary()
    longest = dictionary.query('maxTermSize')

    pieces = []
    head = 0
    while head < total:
        tail = head + min(total - head, longest)
        # Shrink the window from the right until it is a known term or a
        # single character.
        while tail - head > 1:
            if dictionary.query(chars[head:tail].encode('utf-8')):
                break
            tail -= 1
        pieces.append(chars[head:tail])
        head = tail
    return [piece.encode('utf-8') for piece in pieces]
def reverseMaximumMatching(text):
    """Segment ``text`` greedily from right to left.

    Starting at the end of the text, the longest candidate (capped by the
    dictionary's 'maxTermSize' entry and by the text still unprocessed) is
    tried first and is shortened from the left until the dictionary's
    ``query`` returns a truthy value or only one character is left.

    Args:
        text: UTF-8 encoded input; it is decoded before being sliced.

    Returns:
        The terms in text order, each encoded back to UTF-8.
    """
    chars = text.decode('utf-8')
    dictionary = DACTORY.getNaiveDictionary()
    longest = dictionary.query('maxTermSize')

    pieces = []
    tail = len(chars)
    while tail > 0:
        head = tail - min(tail, longest)
        # Shrink the window from the left until it is a known term or a
        # single character.
        while tail - head > 1:
            if dictionary.query(chars[head:tail].encode('utf-8')):
                break
            head += 1
        pieces.append(chars[head:tail])
        tail = head
    # Terms were collected back to front; emit them in reading order.
    return [piece.encode('utf-8') for piece in reversed(pieces)]
# NOTE(review): the statements down to the ``return`` look like the tail of
# ln_frequency_for_sigle_chars; the enclosing ``def`` headers are not part of
# this span, so the original nesting could not be recovered -- confirm
# against the full file.
i = l
sigle_chars = [x for x in sigle_char(unicode_text, triplets)]
# print 'sigle chars:', '|'.join(sigle_chars)
ln_freq = []
for char in sigle_chars:
    # Look up the character's frequency; falsy results are skipped.
    freq = DACTORY.get_char_dictionary().query(char.encode('utf-8'))
    if freq:
        # print freq
        ln_freq.append(math.log(int(freq)))
# print ln_freq
# Adds up the collected log-frequencies.
return reduce(lambda x,y:x+y,ln_freq)

# Ad-hoc manual checks, run only when this file is executed as a script.
if __name__ == '__main__':
    print DACTORY.getNaiveDictionary().query('长江大桥')
    print DACTORY.getNaiveDictionary().query('科学')
    # Surprisingly the dictionary even contains a term like "和服务"; to test
    # rule four it has to be removed by hand.
    print DACTORY.getNaiveDictionary().query('和服务')
    DACTORY.getNaiveDictionary().dic_.pop('和服务')
    # Query again after the removal.
    print DACTORY.getNaiveDictionary().query('和服务')
    print DACTORY.getNaiveDictionary().query('施和')
    # print DACTORY.getNaiveDictionary().query('色')
    # print len('武汉市'.decode('utf-8'))
    # print '|'.join(siple_mmseg('武汉市长江大桥'))
    # print get_longest_triplets_for_1st_char('武汉市长江大桥'.decode('utf-8'))
    # print get_longest_triplets_for_1st_char('武汉市'.decode('utf-8'))
    # print get_longest_triplets_for_1st_char('研究生命科学'.decode('utf-8'))
    # print get_longest_triplets_for_1st_char('科学'.decode('utf-8'))
    # terms = siple_mmseg('研究生命科学')
    # Print the terms returned by complex_mmseg, separated by '|'.
    print '|'.join(complex_mmseg('研究生命科学'))
import sys
import traceback

# Make the dictionary package importable before importing from it.
sys.path.append('../0_Dictionary')
from DictionaryFactory import DACTORY


def maximumMatching(text):
    """Segment ``text`` greedily from left to right.

    At each position the longest candidate (capped by the dictionary's
    'maxTermSize' entry and by the remaining text) is tried first and is
    shortened from the right until the dictionary's ``query`` returns a
    truthy value or only one character is left.

    Args:
        text: UTF-8 encoded input; it is decoded before being sliced.

    Returns:
        The terms in text order, each encoded back to UTF-8.
    """
    chars = text.decode('utf-8')
    total = len(chars)
    dictionary = DACTORY.getNaiveDictionary()
    longest = dictionary.query('maxTermSize')

    pieces = []
    head = 0
    while head < total:
        tail = head + min(total - head, longest)
        # Shrink the window from the right until it is a known term or a
        # single character.
        while tail - head > 1:
            if dictionary.query(chars[head:tail].encode('utf-8')):
                break
            tail -= 1
        pieces.append(chars[head:tail])
        head = tail
    return [piece.encode('utf-8') for piece in pieces]


if __name__ == '__main__':
    # Manual smoke checks: two dictionary lookups, a character count, and
    # one segmentation printed with '|' between terms.
    print(DACTORY.getNaiveDictionary().query('中国人'))
    print(DACTORY.getNaiveDictionary().query('武汉市'))
    print(len('武汉市'.decode('utf-8')))
    segments = maximumMatching('武汉市长江大桥')
    print('|'.join(segments))
# Make the dictionary package importable before importing from it.
sys.path.append('../0_Dictionary')
from DictionaryFactory import DACTORY


def reverseMaximumMatching(text):
    """Segment ``text`` greedily from right to left.

    Starting at the end of the text, the longest candidate (capped by the
    dictionary's 'maxTermSize' entry and by the text still unprocessed) is
    tried first and is shortened from the left until the dictionary's
    ``query`` returns a truthy value or only one character is left.

    Args:
        text: UTF-8 encoded input; it is decoded before being sliced.

    Returns:
        The terms in text order, each encoded back to UTF-8.
    """
    chars = text.decode('utf-8')
    dictionary = DACTORY.getNaiveDictionary()
    longest = dictionary.query('maxTermSize')

    pieces = []
    tail = len(chars)
    while tail > 0:
        head = tail - min(tail, longest)
        # Shrink the window from the left until it is a known term or a
        # single character.
        while tail - head > 1:
            if dictionary.query(chars[head:tail].encode('utf-8')):
                break
            head += 1
        pieces.append(chars[head:tail])
        tail = head
    # Terms were collected back to front; emit them in reading order.
    return [piece.encode('utf-8') for piece in reversed(pieces)]


if __name__ == '__main__':
    # Manual smoke check: one dictionary lookup and one segmentation printed
    # with '|' between terms.
    print(DACTORY.getNaiveDictionary().query('人'))
    segments = reverseMaximumMatching('武汉市长江大桥')
    print('|'.join(segments))