def test4(): corpus = [u'SCANV网址安全中心(http://scanv.com)是一个综合性的网址安全服务平台。通过网址安全中心,用户可以方便的查询到要访问的网址是否存在恶意行为,同时可以在SCANV中在线举报曝光违法恶意网站。', u'工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作',] tagger = Tagger() for str in corpus: print '\nCPPJIEBA:' for s in tagger.pos(list(cppjiebapy.cut(str))): print s[0],'/',s[1], " ", print '\nCRF:' for s in tagger.cut_pos(str): print s[0],'/',s[1], " ", print '\n'
def test4(): corpus = [ u'SCANV网址安全中心(http://scanv.com)是一个综合性的网址安全服务平台。通过网址安全中心,用户可以方便的查询到要访问的网址是否存在恶意行为,同时可以在SCANV中在线举报曝光违法恶意网站。', u'工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作', ] tagger = Tagger() for str in corpus: print '\nCPPJIEBA:' for s in tagger.pos(list(cppjiebapy.cut(str))): print s[0], '/', s[1], " ", print '\nCRF:' for s in tagger.cut_pos(str): print s[0], '/', s[1], " ", print '\n'
def mksogou_(num): copus_sample = '/opt/projects/packages/sogou_corpus/Sample' print 'processing ', num tagger = Tagger() fout = codecs.open('sogou_out%d'%num, 'w', 'utf-8') with open('sogou_%d'%num) as fw: for f in fw: with open(f.replace('\n', '')) as file: for line in file: content = line.decode('gb2312', 'ignore').encode('utf-8').decode('utf-8', 'replace') line = [] for word_pos in tagger.cut_pos(content): m = en_word_re.match(word_pos[0]) if m: thew = m.group()+word_pos[1].strip() line.append(thew) if len(line) > 3: fout.write(u' '.join(line)+u'\n')
def sogou_files(corpus_root='/opt/projects/packages/sogou_corpus/Sample'):
    """Yield the path of every regular file exactly one directory below *corpus_root*.

    The walk is two levels deep and not recursive: each sub-directory of
    *corpus_root* is listed once and only its regular files are yielded.
    Files directly inside *corpus_root*, and anything nested deeper, are
    skipped.  Order follows ``os.listdir`` and is therefore unspecified.

    Args:
        corpus_root: Directory containing the per-category sub-directories.
            Defaults to the original hard-coded corpus location.

    Yields:
        ``os.path.join(corpus_root, subdir, filename)`` for each regular file.

    Raises:
        OSError: If *corpus_root* or one of its sub-directories cannot be listed.
    """
    # No tagger is needed to enumerate files, so none is constructed here.
    for category in os.listdir(corpus_root):
        category_dir = os.path.join(corpus_root, category)
        if not os.path.isdir(category_dir):
            continue
        for name in os.listdir(category_dir):
            path = os.path.join(category_dir, name)
            if os.path.isfile(path):
                yield path
class Tokenizer:
    """Thin adapter exposing ``Tagger.cut`` through a ``tokenize`` method."""

    # One Tagger is created when the class body runs and is shared by all
    # instances via the class attribute.
    crf_tagger = Tagger()

    def tokenize(self, text):
        """Lazily yield each item produced by ``crf_tagger.cut(text)``.

        Being a generator, nothing is segmented until iteration starts.
        """
        segments = self.crf_tagger.cut(text)
        for segment in segments:
            yield segment
class CrfServer(object):
    """Wrapper around a shared ``Tagger`` that returns UTF-8 encoded results."""

    # Built once at class-definition time and shared by every instance.
    tagger = Tagger()

    def cut(self, line):
        """Return the items of ``tagger.cut(line)``, each encoded as UTF-8."""
        encoded = []
        for piece in self.tagger.cut(line):
            encoded.append(piece.encode('utf-8'))
        return encoded

    def cut_pos(self, line):
        """Return ``first/second`` strings for ``tagger.cut_pos(line)``, UTF-8 encoded.

        Each result item contributes one string built from its elements
        ``[0]`` and ``[1]`` joined with a slash.
        """
        encoded = []
        for pair in self.tagger.cut_pos(line):
            joined = pair[0] + '/' + pair[1]
            encoded.append(joined.encode('utf-8'))
        return encoded