def _messageCB(self, con, msg):
    """Answer an incoming message; messages with an empty body are ignored.

    The message body is word-segmented, handed to the bot for a reply, and
    then fed back to the bot so it can learn from it. The reply is shown to
    the sender after a random pause shorter than ``self._maxdelay`` seconds.

    Args:
        con: Not used by this method.
        msg: Message object exposing ``getBody()`` and ``getFrom()``.
    """
    if not msg.getBody():
        # Nothing to answer for a blank message.
        return

    # Jabber IDs put a slash between the server name and the resource name.
    # The session name is also used as the log/session file name, so only
    # the "username@server" part before the slash is kept.
    sender = str(msg.getFrom()).partition("/")[0]

    try:
        # Thai text needs word segmentation before it is given to the bot.
        segmented = wordseg(msg.getBody())
        answer = self._bot.reply(segmented).encode("utf-8")
        # Let the bot learn from what it was just told.
        self._bot.learn(segmented)
    except UnicodeEncodeError:
        # Report the encoding failure to the sender instead of raising.
        answer = "Unicode Encoding Error!"

    pause = random.random() * self._maxdelay
    time.sleep(pause)
    self.display(answer, sender)
def test_wordseg():
    """Call ``wordseg`` once for every entry in the module-level ``hdata``.

    Return values are discarded; the function only exercises ``wordseg``.
    """
    for tag in hdata:
        wordseg(tag)
def test_wordseg():
    """Call ``wordseg`` once for every hashtag in ``hdata``.

    Used as the timed statement for the ``wordseg`` benchmark below.
    """
    for hashtag in hdata:
        wordseg(hashtag)


def test_wordsegment():
    """Call ``segment`` once for every hashtag in ``hdata``.

    Used as the timed statement for the ``wordsegment`` benchmark below.
    """
    for hashtag in hdata:
        segment(hashtag)


# Load the whitespace-separated hashtags. The context manager closes the
# file as soon as it has been read instead of leaving the handle open.
with open('hashtags.txt') as hashtag_file:
    hdata = hashtag_file.read().split()

# Number of times timeit executes each benchmark function.
n = 100

# Side-by-side comparison of the two segmenters' output for each hashtag.
# Only the first element of wordseg's result is joined and printed.
print("==wordseg==\t\t==wordsegment==")
for hashtag in hdata:
    print("[" + " ".join(wordseg(hashtag)[0]) + "]\t\t["
          + " ".join(segment(hashtag)) + "]")

# Timing comparison: total time for n runs over the whole hashtag list.
# The setup strings import from __main__, so this must be run as a script.
print("==wordseg==\t\t==wordsegment==")
wordseg_seconds = timeit.timeit(
    "test_wordseg()", number=n,
    setup="from __main__ import test_wordseg")
wordsegment_seconds = timeit.timeit(
    "test_wordsegment()", number=n,
    setup="from __main__ import test_wordsegment")
print(str(wordseg_seconds) + "\t\t" + str(wordsegment_seconds))
#!/opt/local/bin/python2.7 # -*- coding: utf-8 -*- import sys sys.path+=['.'] import wordseg s=wordseg.wordseg() s.load_dict("data/dict.txt") for l in open("data/p.txt"): l=l.strip() if len(l)>0: b=s.segment(l,1,0) print l, " => ", for i in b: print i, print