Beispiel #1
0
def test4():
    corpus = [u'SCANV网址安全中心(http://scanv.com)是一个综合性的网址安全服务平台。通过网址安全中心,用户可以方便的查询到要访问的网址是否存在恶意行为,同时可以在SCANV中在线举报曝光违法恶意网站。',
            u'工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作',]

    tagger = Tagger()
    for str in corpus:
        print '\nCPPJIEBA:'
        for s in tagger.pos(list(cppjiebapy.cut(str))):
            print s[0],'/',s[1], " ",

        print '\nCRF:'
        for s in tagger.cut_pos(str):
            print s[0],'/',s[1], " ",
        print '\n'
Beispiel #2
0
def test4():
    corpus = [
        u'SCANV网址安全中心(http://scanv.com)是一个综合性的网址安全服务平台。通过网址安全中心,用户可以方便的查询到要访问的网址是否存在恶意行为,同时可以在SCANV中在线举报曝光违法恶意网站。',
        u'工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作',
    ]

    tagger = Tagger()
    for str in corpus:
        print '\nCPPJIEBA:'
        for s in tagger.pos(list(cppjiebapy.cut(str))):
            print s[0], '/', s[1], " ",

        print '\nCRF:'
        for s in tagger.cut_pos(str):
            print s[0], '/', s[1], " ",
        print '\n'
Beispiel #3
0
def mksogou_(num):
    copus_sample = '/opt/projects/packages/sogou_corpus/Sample'
    print 'processing ', num
    tagger = Tagger()
    fout = codecs.open('sogou_out%d'%num, 'w', 'utf-8')
    with open('sogou_%d'%num) as fw:
        for f in fw:
            with open(f.replace('\n', '')) as file:
                for line in file:
                    content = line.decode('gb2312', 'ignore').encode('utf-8').decode('utf-8', 'replace')
                    line = []
                    for word_pos in tagger.cut_pos(content):
                        m = en_word_re.match(word_pos[0])
                        if m:
                            thew = m.group()+word_pos[1].strip()
                            line.append(thew)
                    if len(line) > 3:
                        fout.write(u' '.join(line)+u'\n')
Beispiel #4
0
def sogou_files(corpus_dir='/opt/projects/packages/sogou_corpus/Sample'):
    """Yield the path of every regular file exactly one level below ``corpus_dir``.

    Only immediate sub-directories of ``corpus_dir`` are scanned; files that
    sit directly in ``corpus_dir`` and anything nested deeper are skipped.
    Paths are produced in ``os.listdir`` order, which is not sorted.

    :param corpus_dir: root directory to scan; defaults to the path the
        function used when it took no arguments.
    :raises OSError: if ``corpus_dir`` cannot be listed.
    """
    for entry in os.listdir(corpus_dir):
        subdir = os.path.join(corpus_dir, entry)
        if not os.path.isdir(subdir):
            continue
        for name in os.listdir(subdir):
            path = os.path.join(subdir, name)
            if os.path.isfile(path):
                yield path
Beispiel #5
0
class Tokenizer:
    """Thin tokenizer wrapper around a class-level ``Tagger`` instance."""

    # Built once when the class body executes; all instances share it.
    crf_tagger = Tagger()

    def tokenize(self, text):
        """Lazily yield each item produced by ``crf_tagger.cut(text)``.

        Being a generator, nothing is computed until the caller starts
        iterating.
        """
        pieces = self.crf_tagger.cut(text)
        for piece in pieces:
            yield piece
Beispiel #6
0
class CrfServer(object):
    """Expose ``Tagger`` results as lists of UTF-8 encoded byte strings."""

    # Built once when the class body executes; all instances share it.
    tagger = Tagger()

    def cut(self, line):
        """Return every item of ``tagger.cut(line)`` encoded as UTF-8."""
        encoded = []
        for token in self.tagger.cut(line):
            encoded.append(token.encode('utf-8'))
        return encoded

    def cut_pos(self, line):
        """Return ``tagger.cut_pos(line)`` items as UTF-8 ``first/second`` strings.

        The first two elements of each item are joined with ``/`` before
        encoding.
        """
        encoded = []
        for pair in self.tagger.cut_pos(line):
            joined = pair[0] + '/' + pair[1]
            encoded.append(joined.encode('utf-8'))
        return encoded