Example #1
import time
from multiprocessing import Pool

import gkseg

# `seg` (per-file worker) and `listdir` (yields full paths under a directory)
# are helpers defined elsewhere in the test script.
def main():
    gkseg.init('data/model.txt')              # load the segmentation model
    p = Pool(5)                               # 5 worker processes
    start = time.time()
    p.map(seg, listdir('tests/text'))         # segment every test file in parallel
    print '---------------------------------------------------------------'
    print time.time() - start                 # elapsed seconds
    print '---------------------------------------------------------------'
    gkseg.destroy()                           # release the model
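The `seg` worker and the `listdir` helper are not shown in this example. A minimal sketch of what they could look like, with bodies inferred from the per-file example further below that writes each segmented file to `tests/temp`; the bodies and the output directory are an assumption, not part of gkseg:

import codecs
import os

import gkseg

def listdir(dirname):
    # Hypothetical helper: return the full path of every file in `dirname`.
    return [os.path.join(dirname, name) for name in os.listdir(dirname)]

def seg(rawfile):
    # Hypothetical per-file worker: read a UTF-8 file, segment it, and write
    # the space-joined words under tests/temp with the same file name.
    text = '\n'.join(codecs.open(rawfile, 'r', 'utf-8').readlines())
    wds = gkseg.seg(text)
    o = codecs.open(os.path.join('tests/temp', os.path.basename(rawfile)), 'w', 'utf-8')
    o.write(' '.join(wds))
    o.close()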
Example #2
import time
import codecs

import gkseg

# `listdir` is a helper (defined elsewhere) that yields full paths under a directory.
def main():
    gkseg.init('data/model.txt')
    count = 0                                 # total labelling errors
    length = 0                                # total words segmented
    start = time.time()
    for rawfile in listdir('tests/text'):
        text = '\n'.join(codecs.open(rawfile, 'r', 'utf-8').readlines())
        (wds, terms, labeled, err) = gkseg.process(text, True)
        count = count + len(err)              # accumulate reported errors
        length = length + len(wds)            # accumulate word count
    print '---------------------------------------------------------------'
    print float(count) / length               # error rate over the whole test set
    print '---------------------------------------------------------------'
    gkseg.destroy()
Example #3
import time
import codecs
import os

import gkseg

# `listdir` is a helper (defined elsewhere) that yields full paths under a directory.
def main():
    gkseg.init('data/model.txt')
    count = 0                                 # number of files processed
    start = time.time()
    for rawfile in listdir('tests/text'):
        text = '\n'.join(codecs.open(rawfile, 'r', 'utf-8').readlines())
        wds = gkseg.seg(text)                 # segment the whole file
        o = codecs.open(os.path.join('tests/temp', os.path.basename(rawfile)), 'w', 'utf-8')
        o.write(' '.join(wds))                # one space-separated line of words
        o.close()
        count = count + 1
    print '---------------------------------------------------------------'
    print time.time() - start                 # elapsed seconds
    print count                               # files processed
    print '---------------------------------------------------------------'
    gkseg.destroy()
Example #4
# `text` (the input sentence, a unicode string) and `tagger` (an entity tagger
# exposing get_entities) are assumed to have been set up earlier in the script.

# segment the sentence into a list of words
seg = gkseg.seg(text)
# for s in seg:
#     print s.encode('utf-8')

# extract the important words from the sentence
# terms = gkseg.term(text)
# for t in terms:
#     print t.encode('utf-8')

# label the sentence
# labels = gkseg.label(text)
# for l in labels:
#     print l.encode('utf-8')

# prepare the Chinese segmentation for NER: one space-separated UTF-8 string
seg_str = ""
for s in seg:
    seg_str += s.encode('utf-8') + " "
# print seg_str

# get all entities
tags = tagger.get_entities(seg_str)
for t in tags:
    print t.encode('utf-8')

# tags_json = tagger.json_entities(seg_str)
# print tags_json

gkseg.destroy()
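This snippet leaves the creation of `text` and `tagger` to earlier code. A hedged setup sketch, reusing the file-reading pattern from the examples above; the NER client shown (pyner's SocketNER talking to a Stanford NER server), the file path, the host, and the port are all assumptions, not something this example confirms:

# -*- coding: utf-8 -*-
# Hypothetical setup for the snippet above; path, host, and port are placeholders.
import codecs

import gkseg
import ner   # pyner: a thin client for a running Stanford NER server (an assumed choice)

gkseg.init('data/model.txt')    # load the segmentation model before calling gkseg.seg

# Read one UTF-8 test file as the input sentence (same pattern as the other examples).
text = '\n'.join(codecs.open('tests/text/sample.txt', 'r', 'utf-8').readlines())

# Any object with a get_entities(str) method works as `tagger`; pyner assumes a
# Stanford NER server is already listening on the given host and port.
tagger = ner.SocketNER(host='localhost', port=8080)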