def test4(): corpus = [u'SCANV网址安全中心(http://scanv.com)是一个综合性的网址安全服务平台。通过网址安全中心,用户可以方便的查询到要访问的网址是否存在恶意行为,同时可以在SCANV中在线举报曝光违法恶意网站。', u'工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作',] tagger = Tagger() for str in corpus: print '\nCPPJIEBA:' for s in tagger.pos(list(cppjiebapy.cut(str))): print s[0],'/',s[1], " ", print '\nCRF:' for s in tagger.cut_pos(str): print s[0],'/',s[1], " ", print '\n'
def test4(): corpus = [ u'SCANV网址安全中心(http://scanv.com)是一个综合性的网址安全服务平台。通过网址安全中心,用户可以方便的查询到要访问的网址是否存在恶意行为,同时可以在SCANV中在线举报曝光违法恶意网站。', u'工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作', ] tagger = Tagger() for str in corpus: print '\nCPPJIEBA:' for s in tagger.pos(list(cppjiebapy.cut(str))): print s[0], '/', s[1], " ", print '\nCRF:' for s in tagger.cut_pos(str): print s[0], '/', s[1], " ", print '\n'
def mksogou_(num): copus_sample = '/opt/projects/packages/sogou_corpus/Sample' print 'processing ', num tagger = Tagger() fout = codecs.open('sogou_out%d'%num, 'w', 'utf-8') with open('sogou_%d'%num) as fw: for f in fw: with open(f.replace('\n', '')) as file: for line in file: content = line.decode('gb2312', 'ignore').encode('utf-8').decode('utf-8', 'replace') line = [] for word_pos in tagger.cut_pos(content): m = en_word_re.match(word_pos[0]) if m: thew = m.group()+word_pos[1].strip() line.append(thew) if len(line) > 3: fout.write(u' '.join(line)+u'\n')