def seg(id, text, out, type):
    """Segment ``text`` and write one formatted line to ``out``.

    The text is first passed through ``filter.filter`` and then split by
    ``gezi.cut(text, type)``.  The module-level ``counter`` (and, when
    tagged output is produced, ``counter2``) are updated as a side effect;
    ``START_WORD`` and ``END_WORD`` are added to ``counter`` on every call.

    Args:
        id: Record identifier, printed as the first tab-separated field
            unless ``FLAGS.for_pretrain`` is set.
        text: Raw input text.
        out: File-like object handed to ``print(..., file=out)``.
        type: Segmentation mode forwarded to ``gezi.cut``.  For any value
            other than ``'word'`` the result is unpacked as 2-item pairs
            -- NOTE(review): confirm the pair semantics against gezi.
    """
    cleaned = filter.filter(text)
    counter.add(START_WORD)
    counter.add(END_WORD)
    pieces = gezi.cut(cleaned, type)

    if type == 'word':
        if FLAGS.seg_method == 'char':
            # Explode every segment into its characters, tagging each one
            # with the index of the segment it came from.
            words = []
            for index, piece in enumerate(pieces):
                for char in piece:
                    counter.add(char)
                    counter2.add(str(index))
                    words.append('%s|%d' % (char, index))
        else:
            # Plain word mode: the segments are emitted untouched.
            words = pieces
            for word in words:
                counter.add(word)
    else:
        # Two passes on purpose: count first, then format.
        for first, second in pieces:
            counter.add(first)
            counter2.add(second)
        words = ['%s|%s' % (first, second) for first, second in pieces]

    if FLAGS.for_pretrain:
        # Pretraining output keeps only the part before the first '|'.
        heads = [item.split('|')[0] for item in words]
        print(' '.join(heads), file=out)
    else:
        print(id, '\x09'.join(words), sep='\t', file=out)
def seg(id, text, out, counter):
    """Segment ``text`` into ``token|index`` items and write them to ``out``.

    The text is passed through ``filter.filter`` and split by ``gezi.cut``.
    A segment that ``vocab.has`` reports as known and that is not all digits
    is emitted whole; every other segment is emitted character by character.
    Each emitted item carries the index of the segment it came from.

    Args:
        id: Record identifier, printed as the first tab-separated field
            unless ``FLAGS.for_pretrain`` is set.
        text: Raw input text.
        out: File-like object handed to ``print(..., file=out)``.
        counter: Object with an ``add`` method; receives ``str(index)`` once
            per segment.
    """
    cleaned = filter.filter(text)
    tokens = []
    for position, piece in enumerate(gezi.cut(cleaned)):
        counter.add(str(position))

        keep_whole = vocab.has(piece) and not piece.isdigit()
        if keep_whole:
            tokens.append('%s|%d' % (piece, position))
            continue

        if six.PY2:
            # On Python 2 the segment is decoded so iteration yields
            # characters rather than bytes, then each one is re-encoded.
            chars = [c.encode('utf8') for c in piece.decode('utf8')]
        else:
            chars = piece
        tokens.extend('%s|%d' % (c, position) for c in chars)

    if FLAGS.for_pretrain:
        # Pretraining output keeps only the part before the first '|'.
        heads = [item.split('|')[0] for item in tokens]
        print(' '.join(heads), file=out)
    else:
        print(id, '\x09'.join(tokens), sep='\t', file=out)
# Example #3
# 0
def seg_(text):
    """Segment ``text`` with ``gezi.cut`` and return the resulting items.

    Updates the outer-scope ``counter`` (and, for tagged output,
    ``counter2``) as a side effect.

    NOTE(review): ``type`` is not a parameter of this function; it is looked
    up in the enclosing/global scope and falls back to the builtin ``type``
    if nothing else defines it -- confirm where it is meant to come from.

    Args:
        text: Text to segment.

    Returns:
        A list of ``'a|b'`` strings when ``type`` is not ``'word'``; a list
        of ``'char|index'`` strings when ``FLAGS.seg_method`` is ``'char'``;
        otherwise the object returned by ``gezi.cut`` itself.
    """
    pieces = gezi.cut(text, type)

    if type != 'word':
        # Two passes on purpose: count first, then format.
        for first, second in pieces:
            counter.add(first)
            counter2.add(second)
        return ['%s|%s' % (first, second) for first, second in pieces]

    if FLAGS.seg_method != 'char':
        # Plain word mode: hand back the segments untouched.
        for word in pieces:
            counter.add(word)
        return pieces

    # Character mode: tag every character with the index of its segment.
    tagged = []
    for index, piece in enumerate(pieces):
        for char in piece:
            counter.add(char)
            counter2.add(str(index))
            tagged.append('%s|%d' % (char, index))
    return tagged