Ejemplo n.º 1
0
 def forward_matching(text):
   """Yield every prefix length of ``text`` that counts as a candidate word.

   Prefix lengths are tried in increasing order, from 1 up to the smaller of
   ``len(text)`` and the naive dictionary's 'maxTermSize' entry.

   Args:
     text: string whose prefixes are UTF-8 encoded before being looked up.

   Yields:
     Each accepted prefix length, in increasing order.
   """
   limit = min(len(text), DACTORY.getNaiveDictionary().query('maxTermSize'))
   size = 1
   while size <= limit:
     if size == 1:
       # A lone character is always accepted without a lookup, because the
       # dictionary does not contain single-character entries.
       accepted = True
     else:
       accepted = DACTORY.getNaiveDictionary().query(text[:size].encode('utf-8'))
     if accepted:
       yield size
     size += 1
Ejemplo n.º 2
0
def ln_frequency_for_sigle_chars(unicode_text, triplets):
  """Sum the natural logs of the frequencies of all single-character words.

  ``triplets`` is read as a sequence of cumulative end offsets into
  ``unicode_text``: each word spans from the previous offset (0 for the first
  one) up to the current offset. Only words exactly one character long are
  considered. Each is UTF-8 encoded and looked up in the character
  dictionary; characters without a (truthy) frequency are ignored.

  Args:
    unicode_text: text that the offsets in ``triplets`` refer to.
    triplets: iterable of increasing integer end offsets.

  Returns:
    The sum of ``math.log(int(freq))`` over the single-character words that
    have a frequency, as a float. 0.0 when there is nothing to sum (the
    previous implementation raised TypeError from ``reduce`` in that case).
  """
  single_chars = []
  previous = 0
  for boundary in triplets:
    if boundary - previous == 1:
      single_chars.append(unicode_text[previous:boundary])
    previous = boundary

  # Nothing to look up: skip the dictionary entirely.
  if not single_chars:
    return 0.0

  char_dic = DACTORY.get_char_dictionary()
  total = 0.0
  for char in single_chars:
    freq = char_dic.query(char.encode('utf-8'))
    if freq:
      total += math.log(int(freq))
  return total
Ejemplo n.º 3
0
def maximumMatching(text):
    """Segment a UTF-8 byte string with forward maximum matching.

    Scanning from the left, the widest candidate (capped by the dictionary's
    'maxTermSize' entry) is tried first and narrowed one character at a time
    until the dictionary knows it or a single character remains. That piece
    is emitted and the scan resumes right after it.

    Args:
        text: UTF-8 encoded input string.

    Returns:
        The pieces in text order, each re-encoded as UTF-8 (result of map()).
    """
    chars = text.decode('utf-8')
    total = len(chars)
    lexicon = DACTORY.getNaiveDictionary()
    longest = lexicon.query('maxTermSize')
    pieces = []
    head = 0
    while head < total:
        # Widest window allowed here: the remaining text or the longest term.
        width = min(total - head, longest)
        while width > 1:
            if lexicon.query(chars[head:head + width].encode('utf-8')):
                break
            width -= 1
        pieces.append(chars[head:head + width])
        head += width
    return map(lambda piece: piece.encode('utf-8'), pieces)
def reverseMaximumMatching(text):
    """Segment a UTF-8 byte string with reverse (right-to-left) maximum matching.

    Scanning from the right edge, the widest candidate ending at the current
    position (capped by the dictionary's 'maxTermSize' entry) is tried first
    and narrowed from its left side one character at a time until the
    dictionary knows it or a single character remains.

    Args:
        text: UTF-8 encoded input string.

    Returns:
        The pieces in text order, each re-encoded as UTF-8 (result of map()).
    """
    chars = text.decode('utf-8')
    tail = len(chars)
    lexicon = DACTORY.getNaiveDictionary()
    longest = lexicon.query('maxTermSize')
    pieces = []
    while tail > 0:
        # Widest window allowed here: everything left of tail or the longest term.
        width = min(tail, longest)
        while width > 1:
            if lexicon.query(chars[tail - width:tail].encode('utf-8')):
                break
            width -= 1
        pieces.append(chars[tail - width:tail])
        tail -= width
    # Pieces were collected right-to-left; restore reading order.
    pieces.reverse()
    return map(lambda piece: piece.encode('utf-8'), pieces)
Ejemplo n.º 5
0
      i = l
  sigle_chars = [x for x in sigle_char(unicode_text, triplets)]
  # print 'sigle chars:', '|'.join(sigle_chars)
  ln_freq = []
  for char in sigle_chars:
    freq = DACTORY.get_char_dictionary().query(char.encode('utf-8'))
    if freq:
      # print freq
      ln_freq.append(math.log(int(freq)))
  # print ln_freq
  return reduce(lambda x,y:x+y,ln_freq)
  


# Manual smoke test: a few naive-dictionary lookups followed by one
# complex_mmseg run on a sample sentence.
if __name__ == '__main__':
  print DACTORY.getNaiveDictionary().query('长江大桥')
  print DACTORY.getNaiveDictionary().query('科学')
  # Surprisingly the dictionary contains the entry "和服务"; it has to be
  # removed by hand below so that rule four can be tested.
  print DACTORY.getNaiveDictionary().query('和服务')
  DACTORY.getNaiveDictionary().dic_.pop('和服务')
  # Same lookup again, now after the entry has been popped.
  print DACTORY.getNaiveDictionary().query('和服务')
  print DACTORY.getNaiveDictionary().query('施和')
  # print DACTORY.getNaiveDictionary().query('色')
  # print len('武汉市'.decode('utf-8'))
  # print '|'.join(siple_mmseg('武汉市长江大桥'))
  # print get_longest_triplets_for_1st_char('武汉市长江大桥'.decode('utf-8'))
  # print get_longest_triplets_for_1st_char('武汉市'.decode('utf-8'))
  # print get_longest_triplets_for_1st_char('研究生命科学'.decode('utf-8'))
  # print get_longest_triplets_for_1st_char('科学'.decode('utf-8'))
  # terms = siple_mmseg('研究生命科学')
  print '|'.join(complex_mmseg('研究生命科学'))
Ejemplo n.º 6
0
import sys, traceback
sys.path.append('../0_Dictionary')
from DictionaryFactory import DACTORY


def maximumMatching(text):
    """Segment a UTF-8 byte string with forward maximum matching.

    Scanning from the left, the widest candidate (capped by the dictionary's
    'maxTermSize' entry) is tried first and narrowed one character at a time
    until the dictionary knows it or a single character remains. That piece
    is emitted and the scan resumes right after it.

    Args:
        text: UTF-8 encoded input string.

    Returns:
        The pieces in text order, each re-encoded as UTF-8 (result of map()).
    """
    chars = text.decode('utf-8')
    total = len(chars)
    lexicon = DACTORY.getNaiveDictionary()
    longest = lexicon.query('maxTermSize')
    pieces = []
    head = 0
    while head < total:
        # Widest window allowed here: the remaining text or the longest term.
        width = min(total - head, longest)
        while width > 1:
            if lexicon.query(chars[head:head + width].encode('utf-8')):
                break
            width -= 1
        pieces.append(chars[head:head + width])
        head += width
    return map(lambda piece: piece.encode('utf-8'), pieces)


# Manual check: look up two terms, print a string length, then segment a
# sample sentence with maximumMatching.
if __name__ == '__main__':
    print DACTORY.getNaiveDictionary().query('中国人')
    print DACTORY.getNaiveDictionary().query('武汉市')
    # Length of the decoded unicode string rather than of the UTF-8 bytes.
    print len('武汉市'.decode('utf-8'))
    terms = maximumMatching('武汉市长江大桥')
    # Show the segmentation with '|' between the pieces.
    print '|'.join(terms)
sys.path.append('../0_Dictionary')
from DictionaryFactory import DACTORY


def reverseMaximumMatching(text):
    """Segment a UTF-8 byte string with reverse (right-to-left) maximum matching.

    Scanning from the right edge, the widest candidate ending at the current
    position (capped by the dictionary's 'maxTermSize' entry) is tried first
    and narrowed from its left side one character at a time until the
    dictionary knows it or a single character remains.

    Args:
        text: UTF-8 encoded input string.

    Returns:
        The pieces in text order, each re-encoded as UTF-8 (result of map()).
    """
    chars = text.decode('utf-8')
    tail = len(chars)
    lexicon = DACTORY.getNaiveDictionary()
    longest = lexicon.query('maxTermSize')
    pieces = []
    while tail > 0:
        # Widest window allowed here: everything left of tail or the longest term.
        width = min(tail, longest)
        while width > 1:
            if lexicon.query(chars[tail - width:tail].encode('utf-8')):
                break
            width -= 1
        pieces.append(chars[tail - width:tail])
        tail -= width
    # Pieces were collected right-to-left; restore reading order.
    pieces.reverse()
    return map(lambda piece: piece.encode('utf-8'), pieces)


# Manual check: look up one single-character term, then segment a sample
# sentence with reverseMaximumMatching.
if __name__ == '__main__':
    # print DACTORY.getNaiveDictionary().query('中国人')
    print DACTORY.getNaiveDictionary().query('人')
    # print len('武汉市'.decode('utf-8'))
    terms = reverseMaximumMatching('武汉市长江大桥')
    # Show the segmentation with '|' between the pieces.
    print '|'.join(terms)