Example #1
import time
from multiprocessing import Pool

import gkseg

# `seg` and `listdir` are helpers defined elsewhere in the test suite (see the
# sketches after this example and after Example #3): `seg` segments one file,
# `listdir` yields the paths under a directory.
def main():
    gkseg.init('data/model.txt')           # load the segmentation model
    p = Pool(5)                            # segment five files in parallel
    start = time.time()
    p.map(seg, listdir('tests/text'))
    print '---------------------------------------------------------------'
    print time.time() - start              # wall-clock time for the whole run
    print '---------------------------------------------------------------'
    gkseg.destroy()                        # release the model
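The pool example above assumes a module-level `seg` worker, so that multiprocessing can pickle it. A minimal sketch of such a worker (its body is an assumption, not part of the gkseg API):

import codecs
import gkseg

def seg(rawfile):
    # read one UTF-8 test file and return its segmentation
    text = codecs.open(rawfile, 'r', 'utf-8').read()
    return gkseg.seg(text)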
Example #2
import codecs
from optparse import OptionParser

import gkseg

def main():
    usage = "usage: %prog [options] text"
    parser = OptionParser(usage)
    parser.add_option("-m", "--model", dest="model",
                      help="the path of the model file")

    (options, args) = parser.parse_args()
    if len(args) >= 1:
        # options.model is None when -m is omitted, so the flag is effectively required
        gkseg.init(options.model)
        # decode the raw command-line argument before segmenting it
        print ' '.join(gkseg.seg(codecs.decode(args[0], 'utf-8')))
    else:
        print 'error: input text should not be empty.'
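A typical invocation, assuming the snippet lives in a script such as segcli.py (the script name is an assumption):

$ python segcli.py -m data/model.txt 话说天下大势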
Example #3
import time
import codecs

import gkseg

# `listdir` is assumed to yield full file paths under a directory
# (see the sketch after this example)
def main():
    gkseg.init('data/model.txt')
    count = 0
    length = 0
    start = time.time()
    for rawfile in listdir('tests/text'):
        text = '\n'.join(codecs.open(rawfile, 'r', 'utf-8').readlines())
        # unpack words, terms, labelled output and errors (names as in the snippet)
        (wds, terms, labeled, err) = gkseg.process(text, True)
        count = count + len(err)
        length = length + len(wds)
    print '---------------------------------------------------------------'
    print float(count)/length            # error rate over all segmented words
    print '---------------------------------------------------------------'
    gkseg.destroy()
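Several examples here call a `listdir` helper that, unlike os.listdir, must return full paths (Example #4 below applies os.path.basename to its results). A minimal sketch of such a helper (an assumption, not part of gkseg):

import os

def listdir(d):
    # return the full path of every entry under directory `d`
    return [os.path.join(d, name) for name in os.listdir(d)]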
Example #4
import os
import time
import codecs

import gkseg

def main():
    gkseg.init('data/model.txt')
    count = 0
    start = time.time()
    for rawfile in listdir('tests/text'):
        text = '\n'.join(codecs.open(rawfile, 'r', 'utf-8').readlines())
        wds = gkseg.seg(text)
        # write the space-joined segmentation to tests/temp under the same file name
        o = codecs.open(os.path.join('tests/temp', os.path.basename(rawfile)),
                        'w', 'utf-8')
        o.write(' '.join(wds))
        o.close()
        count = count + 1
    print '---------------------------------------------------------------'
    print time.time() - start            # wall-clock time for the whole run
    print count                          # number of files segmented
    print '---------------------------------------------------------------'
    gkseg.destroy()
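codecs.open fails if the output directory is missing, so a guard like this before the loop may be needed (a sketch, not in the original):

import os

if not os.path.isdir('tests/temp'):
    os.makedirs('tests/temp')   # make sure the output directory exists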
Example #5
import gkseg

def run(model='data/model.txt'):
    gkseg.init(model)
Example #6
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import gkseg
import ner

# A Weibo post about the Harbin smog event; Chinese input is what the
# segmenter needs. Rough translation: "[Harbin smog opinion data analysis]
# Harbin's PM2.5 went off the scale, and Weibo discussion volume kept rising
# over October 20-21. On the 21st the negative-sentiment index spread sharply,
# with Weibo becoming a major channel for locals to vent. The illness index
# also hit its first peak between 10:00 and 11:00 on the morning of the 21st,
# showing the strong correlation between social-media data and real medical
# data in this event. See the Weibo infographic for more."
text = '【哈尔滨雾霾舆论数据分析】哈尔滨PM2.5爆表,微博讨论声量在10月20日~21日持续上升。21日负面情绪指数大幅蔓延,微博成为当地人民表达负面情绪的一大渠道。疾病指数也在21日上午10:00~11:00达到第一个高峰,社交媒体数据与实际医疗数据的强相关性在此事件中得到体现。更多数据,请参见微博长图。'.decode(
    'utf-8')
# http://weibo.com/2392261910/Afjg5e6bQ

# load the segmentation model
gkseg.init('../miner/gkseg/data/model.txt')
# the Stanford NER Java socket server must already be running for the tagger to work
tagger = ner.SocketNER(host='localhost', port=1234)

# segment the sentence into a list of words
seg = gkseg.seg(text)
# for s in seg:
#     print s.encode('utf-8')

# extract the important words from the sentence
# terms = gkseg.term(text)
# for t in terms:
#     print t.encode('utf-8')

# label the sentence
# labels = gkseg.label(text)
# for l in labels:
#     print l.encode('utf-8')

# prepare the Chinese segmentation for NER
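The likely next step, sketched under the assumption that this is the PyPI `ner` client, whose get_entities call takes a plain string (the space-joined segmentation gives the tagger token boundaries):

entities = tagger.get_entities(' '.join(seg))
for etype in entities:
    # entities maps an entity type to the list of strings tagged with it
    print etype, ','.join(w.encode('utf-8') for w in entities[etype])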
Example #7
# (fragment: the preceding loop that builds `tweets`, `mentions`, `urls` and
# `hashtags` from the raw data is not shown)
    tweets.append(tweet)

# showTweets()
print str(len(tweets)) + " tweets processed"
print "mentions:" + str(len(mentions)) + " urls:" + str(len(urls)) + " hashtags:" + str(len(hashtags))
print "total entities found: " + str(len(mentions) + len(urls) + len(hashtags))
print tweets[1].to_JSON()


########################
# NLP + NER
# NER server should be up: see ner-server/
########################
print 'start NLP'

gkseg.init('../miner/gkseg/data/model.txt')
tagger = ner.SocketNER(host='localhost', port=1234)

for t in tweets:
    txt = t.txt.decode('utf-8')

    # segment the tweet text into a list of words
    seg = gkseg.seg(txt)

    # extract the important words and store them as the tweet's keywords
    terms = gkseg.term(txt)
    t.keywords = terms
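The fragment assumes a Tweet class with a txt attribute, a keywords attribute and a to_JSON method; a minimal stand-in (entirely an assumption, for illustration only):

import json

class Tweet(object):
    def __init__(self, txt):
        self.txt = txt        # raw UTF-8 encoded tweet text
        self.keywords = []    # filled in by the NLP pass above

    def to_JSON(self):
        # serialise the tweet's attributes as a JSON object
        return json.dumps(self.__dict__, ensure_ascii=False)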
Example #8
import gkseg

# Opening line of "Romance of the Three Kingdoms": the empire, long divided,
# must unite; long united, must divide.
text = '话说天下大势,分久必合,合久必分'.decode('utf-8')

gkseg.init()  # other examples pass an explicit model path such as 'data/model.txt'

print gkseg.seg(text)    # segment the sentence into a list of words

print gkseg.term(text)   # extract the important words from the sentence

print gkseg.label(text)  # label the sentence

gkseg.destroy()