def main():
    gkseg.init('data/model.txt')
    p = Pool(5)
    start = time.time()
    p.map(seg, listdir('tests/text'))
    print '---------------------------------------------------------------'
    print time.time() - start
    print '---------------------------------------------------------------'
    gkseg.destroy()
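# The Pool-based timing test above relies on a module-level worker `seg` and a
# `listdir` helper that the snippet does not show; a minimal sketch of what
# they might look like (illustrative guesses, not the project's actual code):
import os
import codecs
import gkseg

def listdir(path):
    # return full paths so the workers can open the files directly
    return [os.path.join(path, name) for name in os.listdir(path)]

def seg(rawfile):
    # each worker segments one file; gkseg.init() is assumed to have been
    # called in the parent process before the Pool was created
    text = '\n'.join(codecs.open(rawfile, 'r', 'utf-8').readlines())
    return gkseg.seg(text)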
def main(): usage = "usage: %prog [options] text" parser = OptionParser(usage) parser.add_option("-m", "--model", dest="model", help="the path of the model file") (options, args) = parser.parse_args() if len(args) >= 1: gkseg.init(options.model) print ' '.join(gkseg.seg(codecs.decode(args[0], 'utf-8'))) else: print 'error: input text should not be empty.'
def main():
    gkseg.init('data/model.txt')
    count = 0
    length = 0
    start = time.time()
    for rawfile in listdir('tests/text'):
        text = '\n'.join(codecs.open(rawfile, 'r', 'utf-8').readlines())
        (wds, terms, labeled, err) = gkseg.process(text, True)
        count = count + len(err)
        length = length + len(wds)
    print '---------------------------------------------------------------'
    # errors reported by gkseg.process divided by total words across the test set
    print float(count)/length
    print '---------------------------------------------------------------'
    gkseg.destroy()
def main():
    gkseg.init('data/model.txt')
    count = 0
    start = time.time()
    for rawfile in listdir('tests/text'):
        text = '\n'.join(codecs.open(rawfile, 'r', 'utf-8').readlines())
        wds = gkseg.seg(text)
        o = codecs.open(os.path.join('tests/temp', os.path.basename(rawfile)), 'w', 'utf-8')
        o.write(' '.join(wds))
        o.close()
        count = count + 1
    print '---------------------------------------------------------------'
    print time.time() - start
    print count
    print '---------------------------------------------------------------'
    gkseg.destroy()
def run(model='data/model.txt'):
    gkseg.init(model)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import gkseg
import ner

# sample text: a Weibo post analysing public opinion about the Harbin smog
# event (PM2.5 off the charts, discussion volume and negative sentiment rising
# over Oct 20-21); source: http://weibo.com/2392261910/Afjg5e6bQ
text = '【哈尔滨雾霾舆论数据分析】哈尔滨PM2.5爆表,微博讨论声量在10月20日~21日持续上升。21日负面情绪指数大幅蔓延,微博成为当地人民表达负面情绪的一大渠道。疾病指数也在21日上午10:00~11:00达到第一个高峰,社交媒体数据与实际医疗数据的强相关性在此事件中得到体现。更多数据,请参见微博长图。'.decode('utf-8')

# init
gkseg.init('../miner/gkseg/data/model.txt')

# for the tagger to work, the Stanford NER Java socket server must be running
tagger = ner.SocketNER(host='localhost', port=1234)

# segment the sentence into a list of words
seg = gkseg.seg(text)
# for s in seg:
#     print s.encode('utf-8')

# extract the important words from the sentence
# terms = gkseg.term(text)
# for t in terms:
#     print t.encode('utf-8')

# label the sentence
# labels = gkseg.label(text)
# for l in labels:
#     print l.encode('utf-8')

# prepare Chinese seg for NER
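# A sketch of the step the trailing comment announces: the Stanford NER socket
# server works on space-separated tokens, so one plausible way to "prepare"
# the Chinese segmentation is to join the segmented words with spaces before
# handing them to the tagger. Whether the original script did exactly this,
# and used pyner's get_entities() call, is an assumption.
prepared = ' '.join(seg)
entities = tagger.get_entities(prepared.encode('utf-8'))
print entities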
def run(model="data/model.txt"): gkseg.init(model)
    tweets.append(tweet)  # (end of the tweet-parsing loop)

# showTweets()
print str(len(tweets)) + " tweets processed"
print "mentions:" + str(len(mentions)) + " urls:" + str(len(urls)) + " hashtags:" + str(len(hashtags))
print "total entities found: " + str(len(mentions) + len(urls) + len(hashtags))
print tweets[1].to_JSON()

########################
# NLP + NER
# NER server should be up: see ner-server/
########################
print 'start NLP'
gkseg.init('../miner/gkseg/data/model.txt')
tagger = ner.SocketNER(host='localhost', port=1234)

for t in tweets:
    txt = t.txt.decode('utf-8')
    # segment the sentence into a list of words
    seg = gkseg.seg(txt)
    # extract the important words from the sentence
    terms = gkseg.term(txt)
    t.keywords = terms
import gkseg

text = '话说天下大势,分久必合,合久必分'.decode('utf-8')

gkseg.init()
print gkseg.seg(text)    # segment the sentence into a list of words
print gkseg.term(text)   # extract the important words from the sentence
print gkseg.label(text)  # label the sentence
gkseg.destroy()
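# gkseg.seg() returns a list of unicode strings, so printing the list object
# (as above) shows escaped code points under Python 2; a small variant that
# prints the words readably, following the encode('utf-8') pattern used in
# the other snippets:
import gkseg

text = '话说天下大势,分久必合,合久必分'.decode('utf-8')
gkseg.init()
for word in gkseg.seg(text):
    print word.encode('utf-8')
gkseg.destroy()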