#lazy one-time setup of the shared vocabulary and segmentor
def init(vocab_path=None, append=None):
  global vocab, Segmentor
  if vocab is None:
    vocabulary.init(vocab_path, append=append)
    print('ENCODE_UNK', ENCODE_UNK, file=sys.stderr)
    vocab = vocabulary.get_vocab()
    Segmentor = gezi.Segmentor()
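# A minimal sketch of how the lazily-initialized globals above would be
# consumed: segment first, then map tokens to ids. text2ids is a hypothetical
# helper name and vocab.id() is an assumed lookup method, not confirmed here.
def text2ids(text, seg_method='default'):
  init()  #idempotent: loads vocab and Segmentor only on the first call
  words = Segmentor.Segment(text, seg_method)
  return [vocab.id(word) for word in words]  #vocab.id() is an assumed API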
import sys, os

import tensorflow as tf

#flags/FLAGS assumed to be tensorflow's (tf.app.flags), matching the
#tensorflow import used elsewhere in this repo
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_string('seg_method', 'default', '')

import nowarning
from libsegment import *
import conf
from conf import WORDS_SEP

#need ./data ./conf
#Segmentor.Init()
print('seg_method:', FLAGS.seg_method, file=sys.stderr)

sys.path.append('../')
import gezi
Segmentor = gezi.Segmentor()

for line in open(sys.argv[1]):
  l = line.rstrip().split('\t')
  img = l[0]
  img = img[img.rindex('/') + 1:]  #keep only the file name of the image path
  if len(l) < 3:
    continue
  index = 0
  for comment in l[2:]:
    if len(comment) == 0:
      continue
    #words = Segmentor.Segment(comment, ' ')
    words = WORDS_SEP.join(Segmentor.Segment(comment, FLAGS.seg_method))
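# A hedged illustration of the input line the script above expects, inferred
# from the indexing: field 0 is an image path, field 1 is skipped, and
# fields 2+ are the comments to segment. The sample values are made up:
#   '/data/imgs/123.jpg\t0\t美女一定要支持\tOh q the same thing to me'
# yields img == '123.jpg' and two comments segmented with FLAGS.seg_method.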
def init():
  global vocab, Segmentor
  if vocab is None:
    print('ENCODE_UNK', ENCODE_UNK, file=sys.stderr)
    vocab = vocabulary.get_vocab()
    Segmentor = gezi.Segmentor()
# \author chenghuige
# \date 2016-09-05 11:48:05.006754
# \Description
# ==============================================================================

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

import gezi
import libsegment

seg = gezi.Segmentor()

#'美女一定要支持' roughly: 'the beauty definitely deserves support'
print('\t'.join(seg.Segment('美女一定要支持')))
#default vs. phrase-level segmentation of the same English sentence
print('\x01'.join(seg.Segment('Oh q the same thing to me')))
print('\x01'.join(seg.Segment('Oh q the same thing to me', 'phrase_single')))
print('\x01'.join(seg.Segment('Oh q the same thing to me', 'phrase')))
#'绿鹭' is 'green heron', a rare word that exercises new-word segmentation
print('\t'.join(seg.Segment('绿鹭')))
print('\t'.join(seg.segment('绿鹭')))
print('\t'.join(seg.segment_phrase('绿鹭')))
print('\t'.join(gezi.seg.Segment('绿鹭', libsegment.SEG_NEWWORD)))
print('\t'.join(gezi.seg.Segment('绿鹭')))
#mixed Chinese/English input for character-level segmentation
print('|'.join(gezi.segment_char('a baby is looking at 我的小伙伴oh 我不no no没关系 是不是 tian, that not ')))

from libword_counter import Vocabulary
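# The trailing import suggests a vocabulary lookup step follows. A hedged
# sketch of that step, assuming Vocabulary is constructed from a vocab file
# path and exposes an id() lookup; the path and method name are assumptions.
vocab = Vocabulary('./data/vocab.txt')  #hypothetical vocab file path
words = seg.Segment('绿鹭')
ids = [vocab.id(w) for w in words]  #assumed lookup API
print(ids)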