def seg(id, text, out, type):
    """Segment ``text``, update the vocabulary counters and write one line to ``out``.

    The text is first passed through ``filter.filter`` and then cut with
    ``gezi.cut(text, type)``. Three output shapes are produced:

    * ``type != 'word'``: every item returned by the cutter is unpacked as a
      two-element ``(token, tag)`` pair; the token goes to ``counter``, the
      tag to ``counter2`` and the output word is ``"token|tag"``.
    * ``type == 'word'`` and ``FLAGS.seg_method == 'char'``: every piece is
      split into characters; each character goes to ``counter``, the index
      of the piece it came from goes to ``counter2`` (as a string) and the
      output word is ``"char|index"``.
    * otherwise: the pieces are used unchanged and each is added to
      ``counter``.

    Args:
        id: Record identifier, written as the first tab-separated field when
            not in pretrain mode.
        text: Raw text to segment.
        out: File-like object the result line is printed to.
        type: Segmentation type forwarded to ``gezi.cut``; ``'word'`` selects
            the untagged / per-character branches.

    Side effects:
        Adds ``START_WORD`` and ``END_WORD`` to the module-level ``counter``
        on every call, plus the per-token updates described above.
    """
    text = filter.filter(text)
    counter.add(START_WORD)
    counter.add(END_WORD)
    pieces = gezi.cut(text, type)

    # ``tokens`` holds the raw token for every entry of ``words`` so that the
    # pretrain output never has to be recovered by splitting on '|' (which
    # truncated or blanked tokens that themselves contain '|').
    if type != 'word':
        tokens = []
        words = []
        for token, tag in pieces:
            counter.add(token)
            counter2.add(tag)
            tokens.append(token)
            words.append('%s|%s' % (token, tag))
    elif FLAGS.seg_method == 'char':
        tokens = []
        words = []
        for i, piece in enumerate(pieces):
            for ch in piece:
                counter.add(ch)
                counter2.add(str(i))
                tokens.append(ch)
                words.append('%s|%d' % (ch, i))
    else:
        # NOTE(review): the original formatting was lost; this counting loop
        # is assumed to belong to the plain-word branch only, since the other
        # branches already count their tokens - confirm.
        words = pieces
        tokens = pieces
        for w in words:
            counter.add(w)

    if not FLAGS.for_pretrain:
        # id <TAB> word <TAB> word ...
        print(id, '\x09'.join(words), sep='\t', file=out)
    else:
        # Pretrain corpus: space-separated raw tokens, no id and no tags.
        print(' '.join(tokens), file=out)
def seg(id, text, out, counter):
    """Segment ``text`` with vocabulary back-off to characters and write one line.

    The text is passed through ``filter.filter`` and cut with ``gezi.cut``.
    A piece is kept whole when ``vocab.has(piece)`` is true and the piece is
    not all digits; any other piece is broken into single characters. Every
    emitted unit is tagged with the index of the piece it came from, giving
    output words of the form ``"unit|index"``.

    Args:
        id: Record identifier, written as the first tab-separated field when
            not in pretrain mode.
        text: Raw text to segment.
        out: File-like object the result line is printed to.
        counter: Object with an ``add`` method; receives ``str(index)`` once
            for every piece returned by the cutter.
    """
    text = filter.filter(text)

    # ``tokens`` mirrors ``words`` without the "|index" suffix so the pretrain
    # output does not have to split on '|' (which truncated or blanked units
    # that themselves contain '|').
    tokens = []
    words = []
    for i, word in enumerate(gezi.cut(text)):
        counter.add(str(i))
        if vocab.has(word) and not word.isdigit():
            units = [word]
        elif six.PY2:
            # Python 2 byte strings must be decoded to iterate by character,
            # then re-encoded so the output stays UTF-8 bytes.
            units = [ch.encode('utf8') for ch in word.decode('utf8')]
        else:
            units = list(word)
        for unit in units:
            tokens.append(unit)
            words.append('%s|%d' % (unit, i))

    if not FLAGS.for_pretrain:
        # id <TAB> word <TAB> word ...
        print(id, '\x09'.join(words), sep='\t', file=out)
    else:
        # Pretrain corpus: space-separated raw units, no id and no tags.
        print(' '.join(tokens), file=out)
def seg_(text):
    """Cut ``text`` and return its output words, updating the shared counters.

    ``type``, ``counter`` and ``counter2`` are not parameters; they are read
    from the enclosing scope.

    NOTE(review): if no variable named ``type`` is defined in the enclosing
    scope, Python resolves it to the builtin ``type``; in that case
    ``type != 'word'`` is always true and only the first branch can run -
    confirm that a module-level ``type`` exists.

    Returns:
        * ``type != 'word'``: a list of ``"first|second"`` strings built from
          the two-element items returned by ``gezi.cut``.
        * ``FLAGS.seg_method == 'char'``: a list of ``"char|index"`` strings,
          where ``index`` is the position of the piece the character came
          from.
        * otherwise: the object returned by ``gezi.cut`` itself, unchanged.
    """
    pieces = gezi.cut(text, type)

    if type != 'word':
        # First element feeds ``counter``, second feeds ``counter2``.
        for head, tail in pieces:
            counter.add(head)
            counter2.add(tail)
        return ['%s|%s' % (head, tail) for head, tail in pieces]

    if FLAGS.seg_method == 'char':
        tagged = []
        for index, piece in enumerate(pieces):
            for char in piece:
                counter.add(char)
                counter2.add(str(index))
                tagged.append('%s|%d' % (char, index))
        return tagged

    # NOTE(review): the original formatting was lost; this counting loop is
    # assumed to belong to the plain-word branch only - confirm.
    for piece in pieces:
        counter.add(piece)
    return pieces