def main():
    """Run ``seg`` over every entry of ``tests/text`` in 5 worker processes.

    Loads the gkseg model from ``data/model.txt``, maps ``seg`` (defined
    elsewhere in this file) over ``listdir('tests/text')`` with a
    ``multiprocessing`` pool, and prints the wall-clock seconds the map took
    between two separator lines.

    The pool workers and the gkseg model are released even when a worker
    raises, so a failed run does not leave stray processes behind.
    """
    separator = '---------------------------------------------------------------'

    # The model is loaded before the pool is created, as in the original.
    # NOTE(review): presumably so forked workers inherit it -- confirm.
    gkseg.init('data/model.txt')
    try:
        p = Pool(5)
        try:
            start = time.time()
            p.map(seg, listdir('tests/text'))
            elapsed = time.time() - start
        finally:
            # map() has already returned or raised, so no results are pending
            # and the workers can be stopped immediately.
            p.terminate()
            p.join()

        print(separator)
        print(elapsed)
        print(separator)
    finally:
        gkseg.destroy()
def main():
    """Print the error ratio of ``gkseg.process`` over the ``tests/text`` files.

    For every path returned by ``listdir('tests/text')`` the file is read as
    UTF-8 and passed to ``gkseg.process(text, True)``. The number of entries
    in the returned ``err`` sequence is summed over all files and divided by
    the summed number of entries in ``wds``; the quotient is printed between
    two separator lines.

    Prints ``0.0`` when no words were produced at all (empty directory or
    empty files) instead of raising ``ZeroDivisionError``.
    """
    separator = '---------------------------------------------------------------'

    gkseg.init('data/model.txt')
    try:
        count = 0   # total entries reported in ``err``
        length = 0  # total entries returned in ``wds``
        for rawfile in listdir('tests/text'):
            # Close the handle deterministically instead of leaking it.
            with codecs.open(rawfile, 'r', 'utf-8') as source:
                # NOTE(review): readlines() keeps line endings, so joining
                # with '\n' doubles every newline. Kept as-is because it
                # changes the text handed to the segmenter -- confirm intent.
                text = '\n'.join(source.readlines())
            (wds, terms, labeled, err) = gkseg.process(text, True)
            count += len(err)
            length += len(wds)

        rate = float(count) / length if length else 0.0
        print(separator)
        print(rate)
        print(separator)
    finally:
        # Release the model even if reading or processing a file failed.
        gkseg.destroy()
def main():
    """Segment every ``tests/text`` file into ``tests/temp`` and print timing.

    For every path returned by ``listdir('tests/text')`` the file is read as
    UTF-8, segmented with ``gkseg.seg``, and the resulting words are written,
    separated by single spaces, to a UTF-8 file of the same base name under
    ``tests/temp``. Afterwards the elapsed wall-clock seconds and the number
    of files processed are printed between two separator lines.

    Input and output handles are closed, and the gkseg model released, even
    when reading, segmenting or writing fails part-way through.
    """
    separator = '---------------------------------------------------------------'

    gkseg.init('data/model.txt')
    try:
        count = 0
        start = time.time()
        for rawfile in listdir('tests/text'):
            with codecs.open(rawfile, 'r', 'utf-8') as source:
                # NOTE(review): readlines() keeps line endings, so joining
                # with '\n' doubles every newline. Kept as-is because it
                # changes the text handed to the segmenter -- confirm intent.
                text = '\n'.join(source.readlines())
            wds = gkseg.seg(text)

            target = os.path.join('tests/temp', os.path.basename(rawfile))
            with codecs.open(target, 'w', 'utf-8') as o:
                o.write(' '.join(wds))
            count += 1

        print(separator)
        print(time.time() - start)
        print(count)
        print(separator)
    finally:
        gkseg.destroy()
# Split the sentence held in `text` into a list of words.
seg = gkseg.seg(text)

# Disabled alternatives kept from the original notes:
#   gkseg.term(text)   -- extract the important words from the sentence
#   gkseg.label(text)  -- label the sentence
# Either result, like `seg` itself, can be dumped item by item with
# item.encode('utf-8').

# Prepare the Chinese segmentation for entity recognition: one UTF-8 encoded
# string in which every word, including the last, is followed by one space.
seg_str = "".join(word.encode('utf-8') + " " for word in seg)

# Fetch all entities for the segmented sentence and print each one.
tags = tagger.get_entities(seg_str)
for entity in tags:
    print(entity.encode('utf-8'))

# Disabled: a JSON form of the same result via tagger.json_entities(...).
# NOTE(review): the disabled call passed `seg_text`, which is not defined in
# this fragment -- probably meant `seg_str`; confirm before re-enabling.

gkseg.destroy()
# Word-segment the sentence stored in `text`.
seg = gkseg.seg(text)

# Other gkseg calls that were tried here and are currently switched off:
#   terms = gkseg.term(text)    # extract the important words
#   labels = gkseg.label(text)  # label the sentence
# Their items were printed UTF-8 encoded, one per line.

# Assemble the tagger input from the Chinese segmentation: each word is
# UTF-8 encoded and followed by a single space (so the string ends with a
# space whenever `seg` is non-empty).
pieces = []
for word in seg:
    pieces.append(word.encode('utf-8'))
    pieces.append(" ")
seg_str = "".join(pieces)

# Ask the tagger for all entities and print them, UTF-8 encoded.
tags = tagger.get_entities(seg_str)
for tag in tags:
    print(tag.encode('utf-8'))

# Switched off: tagger.json_entities(...) for JSON output.
# NOTE(review): the disabled call referenced `seg_text`, which this fragment
# never defines -- likely a typo for `seg_str`; verify before re-enabling.

gkseg.destroy()