Example #1
    def testReference(self):
        import jieba # May fail to load jieba
        jieba.initialize(usingSmall=False)
        import jieba.posseg as pseg
        pwords = []
        content = u'上海今日新确诊3例人感染H7N9禽流感病例'
        _ = """
ns 上海
t 今日
a 新
v 确诊
m 3
n 例人
v 感染
eng H7N9
n 禽流感
n 病例
"""
        content = u'李克强:在半岛挑事无异于搬石头砸自己脚'
        _ = """
nr 李克强
p 在
n 半岛
v 挑事
l 无异于
v 搬
l 石头砸
r 自己
n 脚
"""
        for word in pseg.cut(content):
            print word.flag, word.word
Example #2
 def __init__(self, dics = {}):
     self.word_dic = dics
     self.fcounter = 0
     self.default_idf = 10
     self.log_base = math.e
     self.rubbish_set, self.rubbish_hd = self.get_rubbish_set()
     jieba.initialize()
Example #3
def serve(filename):
    if os.path.exists(filename):
        try:
            receive(filename, b'["ping"]\n')
            return
        except:
            # not removed socket
            print("Found abandoned socket")
            os.unlink(filename)
    try:
        with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as sock:
            sock.bind(filename)
            sock.listen(5)
            jieba.initialize()
            while 1:
                conn, addr = sock.accept()
                received = conn.recv(1024)
                while received[-1] != 10:
                    received += conn.recv(1024)
                result = handle(received.decode('utf-8'))
                if result is None:
                    conn.sendall(b'\n')
                elif result == b'stop':
                    conn.sendall(b'\n')
                    conn.close()
                    break
                else:
                    conn.sendall(result + b'\n')
                conn.close()
    finally:
        if os.path.exists(filename):
            os.unlink(filename)
        print("Server stopped.")
Example #4
    def __init__(self):

        self.word_to_pinyins = defaultdict(list)
        f = open(FILE_WORDS, 'rb')
        for line in f:
            pinyin, words = line.strip().decode("utf-8").split()
            for item in words:
                self.word_to_pinyins[item].append(pinyin)
        f.close()

        self.word_to_pinyin = {}
        f = open(FILE_WORD, 'rb')
        for line in f:
            word, pinyin = line.strip().decode("utf-8").split(",")
            self.word_to_pinyin[word] = pinyin
        f.close()

        self.term_to_pinyin = {}
        f = open(FILE_TERM, 'rb')
        for line in f:
            term, pinyin = line.strip().decode("utf-8").split("#")
            self.term_to_pinyin[term] = pinyin.split("@")
        f.close()

        f = open(FILE_USER_DICT, 'rb')
        jieba.setLogLevel(logging.INFO)
        jieba.initialize()
        jieba.load_userdict(f)
        f.close()
Example #5
def word_list(path = conf.output_dir + "/tmp/"):
    jieba.initialize()
    jieba.load_userdict("./user_dict.txt")
    print "cutting words"
    dict = {}
    f = open(path+"/all_json.txt", "r")
    i = 0
    for line in f:
        if (i %100) == 0:
            sys.stderr.write(str(i) + "\n")
        i += 1
        json_obj = json.loads(line)
        danmu = json_obj['ci']
        for k in danmu.keys():
            words_list = danmu[k]
            word = jieba.cut(words_list)
            for w in list(word):
                if w in dict.keys():
                    dict[w] += 1
                else:
                    dict[w] = 1

    f.close()

    out = codecs.open(path + "words.txt", "wb", "utf-8")
    for k in dict.keys():
        out.write(k)
        out.write(" ")
        out.write(unicode(dict[k]))
        out.write("\n")

    out.close()
Example #6
    def get(self, keyword):
        pages = []
        spages = []
        words = []
        if keyword:
            import jieba # May fail to load jieba
            jieba.initialize(usingSmall=True)
            words = list(jieba.cut(keyword, cut_all=False))
            words = [ word for word in words if len(word) > 1 ]
            # words = list(jieba.cut_for_search(keyword))
            keyword = stringutil.parseUnicode(keyword)
            pages = snapi.getAllPages()
            pages = globalutil.search(pages, words)
            globalutil.populateSourceUrl(pages)

            twitterAccount = globalconfig.getTwitterAccount()
            spages = bs.search(words[0], twitterAccount)

        templateValues = {
            'keyword': keyword,
            'pages': pages,
            'spages': spages,
            'words': words,
        }
        self.render(templateValues, 'search.html')
Example #7
 def load(self):
     # load jieba first
     if not jieba.initialized:
         jieba.set_dictionary(self.jieba_dict_path)
         jieba.initialize()
     self.pydict = {}
     f = None
     try:
         # py.txt
         f = open(self.dict_path)
         for line in f:
             try:
                 line = line.strip()
             except:
                 continue
             sps = line.split('\t')
             if len(sps) != 3:
                 print >>sys.stderr, 'bad format line [%s]' % line
                 continue
             word = sps[0]
             py = sps[1]
             freq = float(sps[2])
             if word in self.pydict:
                 wordInfoLen = len(self.pydict[word])
                 i = 0
                 dup = False
                 while i < wordInfoLen:
                     if self.pydict[word][i].py == py:
                         if self.pydict[word][i].freq < freq:
                             self.pydict[word][i].freq = freq
                         dup = True
                         break
                     if self.pydict[word][i].freq < freq:
                         break
                     i += 1
                 if not dup:
                     pyInfo = PyInfo()
                     pyInfo.py = py
                     pyInfo.freq = freq
                     self.pydict[word].insert(i, pyInfo)
                     wordInfoLen += 1
                     for j in range(i + 1, wordInfoLen):
                         if self.pydict[word][j].py == py:
                             del self.pydict[word][j]
                             break
             else:
                 pyInfo = PyInfo()
                 pyInfo.py = py
                 pyInfo.freq = freq
                 self.pydict[word] = [ pyInfo ]
     except Exception as e:
         try:
             f.close()
         except:
             pass
         return False
     self.is_load = True
     return True
Example #8
 def __init__(self, model=None, model_file=None):
     if model:
         self.pipeline, self.label_encoder = model
     elif model_file:
         self.load_model(model_file)
     else:
         raise Exception("param model or model_file should be passed")
     jieba.initialize()
     logging.info("predictor init sucessfully.")
Example #9
 def __init__(self):
     # get the current package path
     _package_path_ =_context_path
     self._user_dict = _package_path_+os.sep+"dic.data"
     self._user_stword = _package_path_+os.sep+"stword.data"
     # build the stop-word list
     self._stop_word_list = list(line.strip().decode("utf8") for line in open(self._user_stword,'r').readlines())
     # print(self._user_dict,self._user_stword)
     jieba.set_dictionary(self._user_dict)
     jieba.initialize()
Example #10
def split(toCut):
    jieba.initialize()

    toCut = unicode(toCut.decode("gbk"))
    retList = list(jieba.analyse.extract_tags(toCut, topK=255, withWeight=1))
    retList = [i for i in retList if i[0] != " "]

    for i in range(0, len(retList)):
        retList[i] = [retList[i][0].encode("GBK"), retList[i][1]]

    return retList
Example #11
 def participle(self):
     jieba.set_dictionary("dict/dict.txt")
     jieba.initialize()
     if(self.radioButton.isChecked()):
         self.result=jieba.cut(self.filetext,cut_all=True)
     elif(self.radioButton_2.isChecked()):
         self.result=jieba.cut(self.filetext,cut_all=False)
     elif(self.radioButton_3.isChecked()):
         self.result=jieba.cut_for_search(self.filetext)
     else:
         self.result=jieba.cut(self.filetext,cut_all=False)
     self.textBrowser.clear()
     self.textBrowser.setText('/'.join(self.result))
Example #12
    def __init__(self, processnum=1):
        logger.info('Initializing jieba...')
        jieba.initialize()
        logger.info('Successfully initialized jieba.')

        if processnum == 0:
            processnum = multiprocessing.cpu_count()

        if processnum > 1:
            logger.info(
                'jieba running in parallel mode with %d processes.',
                processnum
            )
            jieba.enable_parallel(processnum)
Example #13
def _getTopWords(psegs, titles, stopWordPatterns, stopWords, userDict):
    content = '\n'.join(titles)

    import jieba # May fail to load jieba
    if psegs:
        jieba.initialize(usingSmall=True)
        import jieba.posseg as pseg
        pseg.loadDictModel(usingSmall=True)
        pwords = []
        flags = psegs
        for word in pseg.cut(content):
            if word.flag not in flags:
                continue
            pwords.append(word.word)
    else:
        jieba.initialize(usingSmall=False)
        if userDict:
            jieba.load_userdict_items(userDict)
        pwords = jieba.cut(content, cut_all=False)

    words = []
    for word in pwords:
        # sometime "\r\n\n" encountered
        word = word.strip()
        if not word:
            continue
        if word in stopWords:
            continue
        if _isStopWord(stopWordPatterns, word):
            continue
        words.append(word)
    words.sort()

    lastWord = None
    lastCount = 0
    result = []
    _MIN_WORD_COUNT = 2
    for word in words:
        if lastWord != word:
            if lastCount >= _MIN_WORD_COUNT:
                result.append({'name': lastWord, 'count': lastCount})
            lastWord = word
            lastCount = 0
        lastCount += 1
    if lastCount >= _MIN_WORD_COUNT:
        result.append({'name': lastWord, 'count': lastCount})

    result.sort(key=lambda item: len(item['name']), reverse=True)
    result.sort(key=lambda item: item['count'], reverse=True)
    return [ item['name'] for item in result ]
Example #14
def initialize():
	# Load conjunction data
	global CONJUNCTIONS
	CONJUNCTIONS = []
	
	from codecs import open
	with open('vendor/moedict.dict', 'r', encoding='utf8') as data:
		for entry in data:
			CONJUNCTIONS.append(entry.split()[0])
	
	# Load CJK parsing library
	jieba.set_dictionary('vendor/jieba_tc.dict')
	jieba.load_userdict('vendor/chewing.dict')
	jieba.initialize()
Example #15
def run():
    start_time = time.clock()
    jieba.set_dictionary('jieba/dict.txt.big')
    jieba.initialize()
    print ("jieba " + str(time.clock() - start_time))
    
    start_time = time.clock()

    news_rss_url = "http://hk.news.yahoo.com/rss/hong-kong"
    # news_rss_url = "http://hk.news.yahoo.com/rss/china"
    info = feedparser.parse(news_rss_url)

    
    start_time = time.clock()

    for entry in info.entries:
        # word count of each word of summary
        word_list = getBagOfWords(preprocess(jieba.cut(stripTag(entry.summary))))
        # word count of each word of title
        bag_of_word_of_title = getBagOfWords(preprocess(jieba.cut(stripTag(entry.title))))
        
        # Combine word count of both summary and title and title weights more
        bag_of_word = Counter()
        for i in range(3):
            bag_of_word.update(bag_of_word_of_title)
        bag_of_word.update(word_list)
        entry["bag_of_words"] = bag_of_word

    print ("preprocess " + str(time.clock() - start_time))
        

#     result = Counter()
#     for entry in info.entries:
#         result.update(entry["bag_of_words"])
#     printList(result) 
        
    # Clustering them
    start_time = time.clock()
    clusters = clustering.clustering([Cluster([Vector(entry)]) for entry in info.entries])
    print ("clustering " + str(time.clock() - start_time))

    # Print the result        
    newsList = []
    for (index, cluster) in enumerate(clusters):
        for vector in cluster.listOfVectors:
            news = News(index, (vector == cluster.centroidVector), vector.data["title"], vector.data["published"], vector.data["link"])
            newsList.append(news.__dict__)
    return json.dumps(newsList)
Example #16
 def __init__(self, dics = None):
     '''
     fname_dic = {fid:set([word list])}
     word_dic = {'word':{fid:tf, ...}}
     '''
     if dics:
         self.set_vars_by_dics(dics)
     else:
         self.word_dic = {}
         self.fname_dic = {}
         self.fcounter = 0
         self.default_idf = 0
         self.log_base = math.e
         self.rubbish_set = set()
         self.proportion = 0.3
     jieba.initialize()
Example #17
    def get(self, eventScope, eventId):
        event = models.getEvent(eventScope, eventId)
        if not event:
            self.error(404)
            return
        event["pages"].sort(key=lambda page: page.get("published") or page["added"], reverse=True)
        if "keyword" in self.extraValues:
            import jieba  # May fail to load jieba

            jieba.initialize(usingSmall=True)
            words = list(jieba.cut(self.extraValues["keyword"], cut_all=False))
            for page in event["pages"]:
                page["grade"] = 0
                for word in words:
                    if len(word) <= 1:
                        continue
                    if stringutil.contains(page.get("title", ""), word):
                        page["grade"] += len(word)
            event["pages"].sort(key=lambda page: page["grade"], reverse=True)
        templateValues = {"event": event}
        self.render(templateValues, "event.html")
Example #18
def visit_offcanvas(request):
  # bug: if the same client refreshes several times at once, responses may return concurrently and mix their content
  ip = None
  if request.META.has_key('HTTP_X_FORWARDED_FOR'):  
    ip =  request.META['HTTP_X_FORWARDED_FOR']  
  else:  
    ip = request.META['REMOTE_ADDR'] 
  logger.info("%s BEGIN. POST:%s, GET:%s"%(ip,str(request.POST),str(request.GET)))

  global is_first_load
  mutex_update_news.acquire()
  if is_first_load:
    #print "[LOG %s] init news."%(time.strftime("%Y-%m-%d %X", time.localtime()))
    logger.info("init news.")
    if platform.system() == "Linux":
      jieba.enable_parallel(8)
    jieba.initialize()
    #jieba.set_dictionary('data/dict.txt.big')
    update_base()
    init_news2()
    thread.start_new_thread(thread_update_news, ("",))
    is_first_load = False
  mutex_update_news.release()

  queryDict=None
  if request.method == 'GET':
    queryDict = request.GET
  elif request.method == 'POST':
    queryDict = request.POST
  jsondata = get_jsondata(queryDict)
  fp = open('django_composite/offcanvas.html')  
  t = Template(fp.read())  
  fp.close()  
  html = t.render(Context(jsondata))  
  logger.info("%s END."%ip)
  return HttpResponse(html) 
  '''
Example #19
    def __init__(self):
        self.br = br = mechanize.Browser()
        self.br.set_handle_robots(False)  # ignore robots
        self.br.set_handle_refresh(False)
        self.sixHourBeforeTime = time.time() - 60 * 60 * 6
        self.db_address = "127.0.0.1"  #'54.251.147.205'

        if platform.system() == "Windows":
            self.features = "html5lib"
        else:
            self.features = "lxml"

        oauth_args = dict(
            client_id="482698495096073",
            client_secret="8c58b055fcb762a9780638dc401c85e2",
            grant_type="client_credentials",
        )

        oauth_curl_cmd = ["curl", "https://graph.facebook.com/oauth/access_token?" + urllib.urlencode(oauth_args)]
        oauth_response = subprocess.Popen(oauth_curl_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()[
            0
        ]
        print oauth_curl_cmd
        print str(oauth_response)
        try:
            oauth_access_token = urlparse.parse_qs(str(oauth_response))["access_token"][0]
            self.graph = facebook.GraphAPI(oauth_access_token)
        except KeyError:
            print ("Unable to grab an access token!")

            # self._pre_dict_combine('combine_dict.txt')
            # jieba.set_dictionary('combine_dict.txt')
        dict_path = os.path.dirname(os.path.abspath(__file__)) + "/dict.txt"
        print dict_path
        jieba.set_dictionary(dict_path)
        jieba.initialize()
Example #20
 def __init__(self, stop_file=None, use_tfidf=False):
     self.stop_words = ["", " "]
     if stop_file:
         with open(stop_file, 'r') as rf:
             tokens = rf.readlines()
             tokens = [t.strip().decode("u8") for t in tokens]
             self.stop_words.extend(tokens)
             print "*****"
             logging.info("load %d stop words" % len(self.stop_words))
     jieba.initialize()
     self.label_encoder = LabelEncoder()
     if use_tfidf:
         self.pipeline = Pipeline([
                     ('vec',   CountVectorizer(stop_words=self.stop_words)),
                     ('feat',  TfidfTransformer()),
                     #('clf', SGDClassifier())
                     ('clf', MultinomialNB())
              ])
     else:
         self.pipeline = Pipeline([
                     ('vec',   CountVectorizer(stop_words=self.stop_words, binary=True)),
                     ('clf',   BernoulliNB(fit_prior=True))
              ])
     logging.info("init the classifier")     
print >> log_f, w.encode("utf-8"), "/" ,

print 'speed' , len(content)/tm_cost, " bytes/second"
Experimental results: on a 4-core 3.4 GHz Linux machine, exact-mode segmentation of the complete works of Jin Yong reached a speed of 1 MB/s, 3.3 times that of the single-process version.
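For reference, a minimal sketch of how parallel mode is typically switched on before cutting; this is a hedged illustration, not the benchmark script itself, and the 4-process count and the corpus.txt file name are assumptions:

import time
import jieba

jieba.enable_parallel(4)   # assumed worker count; parallel mode requires a POSIX platform
content = open("corpus.txt", "rb").read()   # corpus.txt is an assumed sample file
t1 = time.time()
words = list(jieba.cut(content))
jieba.disable_parallel()   # switch back to single-process mode
print("speed: %.0f bytes/second" % (len(content) / (time.time() - t1)))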
Other dictionaries

A dictionary file with a smaller memory footprint: https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.small
A dictionary file with better support for traditional Chinese segmentation: https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big
Download the dictionary you need, then either overwrite jieba/dict.txt with it or load it via jieba.set_dictionary('data/dict.txt.big').
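A minimal sketch of switching to a downloaded dictionary; the local path extra_dict/dict.txt.small and the sample sentence are assumptions:

import jieba

jieba.set_dictionary("extra_dict/dict.txt.small")   # assumed local path to the downloaded dictionary
jieba.initialize()   # optional: load the new dictionary right away
print("/".join(jieba.cut(u"我来到北京清华大学")))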

Change to the module initialization mechanism: lazy load (since version 0.28)

jieba uses lazy loading: "import jieba" does not immediately trigger dictionary loading; the dictionary is loaded and the trie built only when first needed. If you want to initialize jieba by hand, you can also do so manually.

import jieba
jieba.initialize()  # manual initialization (optional)
Versions before 0.28 could not specify the path of the main dictionary; with the lazy-loading mechanism, you can now change the main dictionary path:
jieba.set_dictionary('data/dict.txt.big')
Example:
#encoding=utf-8
import sys
sys.path.append("../")
import jieba

def cuttest(test_sent):
    result = jieba.cut(test_sent)
    print " ".join(result)

def testcase():
    cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
    cuttest("我不喜欢日本和服。")
Example #22
def JIEBAInit(file_path='OpinionAnalysis/dict/'):
    logging.info('Loading dictionary and initializing jieba...')
    jieba.set_dictionary(file_path + 'dict.txt.big')
    jieba.load_userdict(file_path + 'userdict.txt')
    jieba.initialize()
Example #23
		line_name=name
		line={'x':line_x,'y':line_y,'name':line_name}
		# print(line_x)
		# print(line_y)
		result={'pie':pie,'bar':bar,'line':line}

		## with more data, topic extraction can be slow, so run it in a child thread and return the result through a global variable
		##start subthread for lda analyse
		try:
			_thread.start_new_thread( subthread_lda_analyse, ("Thread-lda", comments, ) )
		except:
			print ("Error: can not start thread")

		return 'result generate success'
	return 'flask not get name'

if __name__=='__main__':
	print('jieba initializing...')
	jieba.initialize()
	print('loading fasttext model...')
	fasttext_model=fasttext.load_model(fasttext_model_path)

	print('loading lda model...')
	model_path='./train_model/lda_model/LDA_model'
	lda_model = gensim.models.ldamodel.LdaModel.load(model_path)
	radar_x=[]
	radar_y=[]

	result={}
	app.run("0.0.0.0",threaded=True)
def jieba_initialize():
    if not platform.system().upper().startswith("WINDOWS"):
        jieba.enable_parallel(multiprocessing.cpu_count())
    jieba.load_userdict('resources/QAattrdic.txt')
    jieba.initialize()
Example #25
    import jieba.posseg
    posdelim = args.pos

    def cutfunc(sentence, _, HMM=True):
        for w, f in jieba.posseg.cut(sentence, HMM):
            yield w + posdelim + f
else:
    cutfunc = jieba.cut

delim = text_type(args.delimiter)
cutall = args.cutall
hmm = args.hmm
fp = open(args.filename, 'r') if args.filename else sys.stdin

if args.dict:
    jieba.initialize(args.dict)
else:
    jieba.initialize()
if args.user_dict:
    jieba.load_userdict(args.user_dict)

ln = fp.readline()
while ln:
    l = ln.rstrip('\r\n')
    result = delim.join(cutfunc(ln.rstrip('\r\n'), cutall, hmm))
    if PY2:
        result = result.encode(default_encoding)
    print(result)
    ln = fp.readline()

fp.close()
Example #26
class WordSegmentation(object):
    '''
    Word segmentation
    '''

    stop_words_file = {}.fromkeys([line.decode('utf8').strip()
                                   for line in open(util_path.stop_words_path)])  # load the stop-word dictionary
    jieba.set_dictionary(util_path.jieba_dict_path)  # load the domain-specific jieba dictionary
    jieba.initialize()


    def addotherdics(self):
        dfolder=util_path.otherdict_folder
        othdpath=[os.path.join(dfolder,i) for i in os.listdir(dfolder)]
        for inf in othdpath:
            print("add words from %s" %inf)
            with codecs.open(inf,'rU',encoding='utf8') as f:
                for w in f:
                    w=w.strip()
                    if w:
                        jieba.add_word(w)



    def segment(self, sent, stop_words_file=stop_words_file, mode='normal',addotherdic=False):
        """
        Segment the input text, optionally filtering stop words and choosing the segmentation mode.
        :param sent: sentence to be segmented
        :type sent: unicode string
        :param stop_words_file: stop-word table; pass stopwords=None to skip stop-word filtering. Defaults to the bundled stop-word table
        :type stop_words_file: dict
        :param mode: segmentation mode (normal, tf-idf, TextRank). normal is plain segmentation; tf-idf extracts keywords with the TF-IDF algorithm; TextRank extracts keywords with the TextRank algorithm. Defaults to normal
        :type mode: unicode string
        :return: list of words after segmentation
        :rtype: list
        """
        sentence_words = []
        # strip punctuation and spaces
        # punct = set(
        #     u''' :!),.:;?]}¢'"、。〉》」』】〕〗〞︰︱︳﹐、﹒﹔﹕﹖﹗﹚﹜﹞!),.:;?|}︴︶︸︺︼︾﹀﹂﹄﹏、~¢々‖•·ˇˉ―--′’”
        #     ([{£¥'"‵〈《「『【〔〖([{£¥〝︵︷︹︻︽︿﹁﹃﹙﹛﹝({“‘-—_…''')
        # filterpunt = lambda s: ''.join(filter(lambda x: x not in punct, s))
        # sent = filterpunt(sent)
        if addotherdic:
            self.addotherdics()
        if mode == 'tf-idf':
            sent = jieba.analyse.extract_tags(sent.strip(' \t\n\r'))  # keyword extraction based on the TF-IDF algorithm
        elif mode == 'TextRank':
            tr = jieba.analyse.TextRank()  # keyword extraction based on the TextRank algorithm
            tr.span = 2
            sent = tr.textrank(sent.strip(' \t\n\r'), topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))
        else:
            sent = jieba.cut(sent.strip(' \t\n\r'))  # normalize (strip newlines etc.) and segment in normal mode
        for w in sent:
            # seg = str(w.encode('utf-8').strip())
            seg = w.strip()
            if stop_words_file is None:
                sentence_words.append(seg)
            else:
                res = SentenceSegmentation.has_number_character(seg)
                if not stop_words_file.has_key(seg) and not res:  # filter stop words from the body text (the old punctuation set was moved into the stop-word dictionary, ruling out single-character digits)
                    sentence_words.append(seg)
        return sentence_words
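A hedged usage sketch of the class above; the instance name, the sample sentence, and the availability of util_path and SentenceSegmentation from the surrounding project are assumptions:

ws = WordSegmentation()
# plain segmentation with the default stop-word table
print("/".join(ws.segment(u"今天北京的天气非常好", mode='normal')))
# keyword extraction via TF-IDF
print("/".join(ws.segment(u"今天北京的天气非常好", mode='tf-idf')))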
#      ┃   ┃  +         
#      ┃    ┗━━━┓ + +
#      ┃        ┣┓
#      ┃        ┏┛
#      ┗┓┓┏━┳┓┏┛ + + + +
#       ┃┫┫ ┃┫┫
#       ┗┻┛ ┗┻┛+ + + +
"""
Author = Eric_Chan
Create_Time = 2016/05/29
Build the word lexicon
"""

import jieba
import sys
jieba.initialize()  # manually start the jieba module


def load_file(file_name, charset='utf-8'):
    """
    Read a file and return its lines as a list
    :param file_name: file path
    :param charset: encoding used to decode the text, defaults to utf-8
    :return: list of text lines
    """
    f1 = open(file_name)
    line = f1.readline().decode(charset).strip()
    line_list = []
    while line:
        line = line.strip()
        if line:
Example #28
                               page=page,
                               error=True)
    except:
        print('high search error')


# navigate to the detail page
@app.route('/search/<id>/', methods=['GET', 'POST'])
def content(id):
    try:
        doc = find([id], extra=True)
        return render_template('content.html', doc=doc[0])
    except:
        print('content error')


# recommend the K nearest documents
def get_k_nearest(db_path, docid, k=5):
    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    c.execute("SELECT * FROM knearest WHERE id=?", (docid, ))
    docs = c.fetchone()
    #print(docs)
    conn.close()
    return docs[1:1 + (k if k < 5 else 5)]  # max = 5


if __name__ == '__main__':
    jieba.initialize()  # manual initialization (optional)
    app.run()
def mentioned_trend(baseurl,
                    mysqlhostIP,
                    mysqlUserName='******',
                    mysqlPassword='',
                    dbname='btv'):
    # word segmentation
    jieba.initialize()
    # connect to the database
    sqlConn = MySQLdb.connect(host=mysqlhostIP,
                              user=mysqlUserName,
                              passwd=mysqlPassword,
                              db=dbname,
                              charset='utf8')
    sqlcursor = sqlConn.cursor()
    sqlcursor.execute(
        '''CREATE TABLE IF NOT EXISTS gala_region_interaction(pk bigint NOT NULL PRIMARY KEY AUTO_INCREMENT, region varchar(50), interaction bigint(50), date Date, program_id varchar(50), program varchar(50)) DEFAULT CHARSET=utf8;'''
    )
    print '新建库成功'

    os.popen('kinit -k -t ctvit.keytab ctvit')
    kerberos_auth = HTTPKerberosAuth(mutual_authentication=OPTIONAL)
    # table name
    tablename = "DATA:WEIBO_POST_Keywords"
    r = requests.get(baseurl + "/" + tablename + "/*",
                     auth=kerberos_auth,
                     headers={"Accept": "application/json"})
    if issuccessful(r) == False:
        print "Could not get messages from HBase. Text was:\n" + r.text
        quit()
    bleats = json.loads(r.text)

    # buffer for comment data
    tempData = []
    count = 0
    printCount = 0
    # row_prefix and limit can bound the scan
    # for key,data in table.scan(limit = 10, batch_size = 10):
    region_box = list()
    date_mentioned_dict = dict()
    # time attributes
    # inter == 0 means today
    inter = 0
    now = int(time.time()) - 86400 * inter
    timeArray = time.localtime(now)
    otherStyleTime = time.strftime("%Y-%m-%d", timeArray)
    print otherStyleTime

    # bleats is json file
    for row in bleats['Row']:
        for cell in row['Cell']:
            columnname = base64.b64decode(cell['column'])
            value = cell['$']
            if value == None:
                print 'none'
                continue
            if columnname == "base_info:match":
                column = base64.b64decode(value)
            if column == "春晚":
                if columnname == "'base_info:cdate'":
                    cdate = base64.b64decode(value)
                    if cdate == otherStyleTime:
                        if columnname == "'base_info:geo'":
                            city_mentioned = base64.b64decode(value)
    region_box_count = dict(Counter(region_box))
    region_box_count = sorted(region_box_count.iteritems(),
                              key=lambda e: e[1],
                              reverse=True)
    for i in region_box_count:
        city = i[0]
        how_many_times = i[1]
        # region
        tempData.append(city)
        # occurrence count
        tempData.append(how_many_times)
        # date: the time this row is inserted
        now = datetime.datetime.now()
        tempData.append(now)
        # program_id
        tempData.append('12345')
        # program name
        tempData.append('2016年北京卫视春节联欢晚会')
        sqlcursor.execute(
            '''insert into gala_region_interaction(region, interaction, date, program_id, program) values (%s, %s, %s, %s, %s)''',
            tempData)
        sqlConn.commit()
        tempData = []
    sqlConn.close()
Example #30
 def init_jieba(self):
     jieba.initialize()
     for key in self.data.keys():
         self.data[key]['name'] = list(jieba.cut(self.data[key]['name']))
         self.data[key]['syptom'] = list(jieba.cut(
             self.data[key]['syptom']))
class NlpUtil(object):

    punctuations_set = _load_words(GlobalNames.PUNCTUATIONS_FILE)
    stopwords_set = _load_words(GlobalNames.STOPWORDS_FILE)
    user_define_words = _load_words(GlobalNames.USER_DEFINE_WORDS)
    remove_words_set = _load_words(GlobalNames.REMOVE_WORDS_FILE)

    # Init jieba
    jieba.initialize()
    for w in user_define_words:
        jieba.add_word(w, freq=1000000)

    corpus_dict = None
    tfidf_model = None

    url_pattern = re.compile(r"(https|http)://.+?html")
    digit_pattern = re.compile(r"\d+")
    bracket_pattern = re.compile(r"\[.+?\]")

    not_place_set = set([
        "京东", "上门", "东西", "拜拜", "满意度", "新旧", "入口", "莫大", "蓝牙", "英伦", "顺顺利利",
        "哥哥", "立马", "海鲜", "回邮", "太多", "长北", "南那", "白跑", "天黑", "天阿", "美华", "华联",
        "日及", "山山", "京福顺", "卡拿", "太卡", "太大", "千古", "英哥", "两棵树", "太累", "包邮",
        "加半", "中华人名共和国", "六便士", "串联", "非顺丰", "中考", "北冰洋", "下嫩", "安安", "太鲜",
        "上拉", "入店", "上下水", "图京", "之城", "中断", "中武", "伦理", "中道", "之康", "多维度",
        "黑边", "中爱", "之泰", "锦园店", "三国", "阿门", "肯本", "刚京麦", "大黑", "朝霞", "关门大吉",
        "哥别", "沧桑", "下山", "日京京", "沙沙", "牙牙", "顿顿", "山高", "钱和京", "非买", "上旧",
        "四科", "西东", "上岗", "大山", "福尔马林", "滑黑", "上东", "中上", "内马尔", "中同", "中达",
        "下欧", "四门", "深春", "正东", "江南春", "入维", "大班", "中联", "猫沙", "长卡", "几环",
        "尾塞", "小桥流水", "澳邮", "上中", "英雄", "镇镇", "如东", "上口", "加邮", "八国", "福利",
        "台基", "那本", "中邮", "六本", "维沙", "中黑", "上美", "加花", "天哇", "远超过", "大拿",
        "贵干", "苏中", "三本", "酒塞", "七本", "美院", "中通", "美人壶加", "中充", "下国", "京伦",
        "九联", "上马", "美化", "江湖", "黑店", "几米远", "午安", "七哥", "角美", "日春", "几比",
        "确保安全", "壶水", "荷塘月色", "云集", "拉边", "欧克", "中右", "加的京", "上路", "烟嘴",
        "临证指南", "串口卡", "新建", "安利", "山泉水", "苏泊尔", "墨黑", "胶盆", "长达", "商城"
    ])

    @classmethod
    def place_recognize(cls, text):
        places = [
            w for w, flag in pseg.cut(text)
            if "ns" in flag and len(w) >= 2 and w not in cls.not_place_set
            and "哈" not in w and "之" not in w and "本" not in w and "中" not in w
            and "嫩" not in w and "大" not in w and "鲜" not in w and "国" not in w
            and "上" not in w and "确" not in w and "牙" not in w and "壶" not in w
            and "阿" not in w and "入" not in w and "哥" not in w and "颗" not in w
            and "的" not in w and "联" not in w and "哇" not in w
        ]

        return places

    @classmethod
    def tokenize(cls,
                 text,
                 filter_punctuations=False,
                 filter_stopwords=False,
                 filter_alpha=False,
                 remove_words=False,
                 normalize_url=False,
                 recognize_place=False,
                 minimum_tokens_num=1):
        '''Tokenize text'''
        try:
            places = cls.place_recognize(text)
            for w in places:
                text = text.replace(w, "[地址x]")
            text = cls.digit_pattern.sub("[数字x]", text)
            if normalize_url:
                text = cls.url_pattern.sub("URL", text)
            tokens = jieba.lcut(text)
            text = " ".join(tokens)
            for s in cls.bracket_pattern.findall(text):
                text = text.replace(s, s.replace(" ", ""))
            text = text.replace(u"# E - s [数字x]", u"#E-s[数字x]")
            text = text.replace(u"# E - s DIGIT [数字x]", u"#E-s[数字x]")
            text = text.replace(u"< s >", "<s>")
            tokens = text.split()
            tokens_copy = copy.copy(tokens)

            # Filter words.
            if filter_punctuations:
                tokens = [w for w in tokens if w not in cls.punctuations_set]
            if filter_stopwords:
                tokens = [w for w in tokens if w not in cls.stopwords_set]
            if filter_alpha:
                tokens = [
                    w for w in tokens
                    if not w.encode("utf-8").isalpha() or w in set(["URL"])
                ]
            if remove_words:
                tokens = [w for w in tokens if w not in cls.remove_words_set]

            if len(tokens) < minimum_tokens_num:
                tokens = tokens_copy

            new_tokens = tokens[:1]
            t_len = len(tokens)
            for i in range(1, t_len):
                if tokens[i] != tokens[i - 1]:
                    new_tokens.append(tokens[i])
            return new_tokens
        except Exception as e:
            print("text=%s, errmsg=%s" % (text, e))
            return [text]

    @classmethod
    def get_tfidf(cls, words):
        if cls.tfidf_model is None:
            corpus_dict_path = get_file_path(GlobalNames.CORPUS_DICT_FILE)
            cls.corpus_dict = corpora.Dictionary.load(corpus_dict_path)
            corpus_tfidf_path = get_file_path(GlobalNames.CORPUS_TFIDF_FILE)
            cls.tfidf_model = models.tfidfmodel.TfidfModel.load(
                corpus_tfidf_path)
        bow = cls.corpus_dict.doc2bow(words)
        tfidf = cls.tfidf_model[bow]
        tfidf = [(cls.corpus_dict[x[0]], x[1]) for x in tfidf]
        tfidf.sort(key=lambda x: x[1], reverse=True)
        return tfidf

    @classmethod
    def get_keywords(cls, text, size=3, way=None):
        if way == None or way == "tfidf":
            tokens = cls.tokenize(text)
            tfidf = cls.get_tfidf(tokens)
            ret_tokens = [x[0] for x in tfidf[:size]]
            return ret_tokens
        elif way == "textrank":
            return jieba.analyse.textrank(text, topK=size)
Example #32
 def search_init(self):
     jieba.initialize()
     self.pagerank = PageRank()
Example #33
class NLPUtil(object):
    _valid_token_len = 5

    _wordseg_pattern_cfg = [
        re.compile(r'{.*?}', re.U),
    ]

    _emoji_pattern_cfg = re.compile('[\U00010000-\U0001ffff]', re.U)

    _replace_pattern_cfg = {
        'float_t': re.compile('\d+\.\d+'),
        'phone_t': re.compile('1[0-9]{10}'),
        'email_t': re.compile('[^@|\s]+@[^@]+\.[^@|\s]+'),
    }

    _illegal_char_set = set([])

    # init jieba
    jieba.initialize()
    ud_words = config.g_ud_words_cfg
    for w in ud_words:
        jieba.add_word(w, freq=100000000)

    @classmethod
    def remove_illegal_gbk_char(cls, text_unicode):
        try:
            text_unicode.encode('gbk')
            return text_unicode
        except UnicodeEncodeError as e:
            illegal_ch = e.object[e.start:e.end]
            illegal_set = cls._illegal_char_set
            illegal_set.add(illegal_ch)
            # try to replace directly
            for ch in illegal_set:
                text_unicode = text_unicode.replace(ch, '')
            # remove recursively
            return cls.remove_illegal_gbk_char(text_unicode)

    @classmethod
    def remove_emoji_char(cls, text_unicode):
        res = cls._emoji_pattern_cfg.sub('', text_unicode)
        return res

    @classmethod
    def conv_fenc_u8_to_gbk(cls, in_fpath, out_fpath):
        try:
            with codecs.open(in_fpath, 'r', 'utf-8') as rfd, \
                codecs.open(out_fpath, 'w', 'gbk') as wfd:
                # read utf8, write gbk
                for line in rfd:
                    line = cls.remove_illegal_gbk_char(line)
                    wfd.write(line)
        except Exception as e:
            logger.get().warn('errmsg=%s' % (e))

    @classmethod
    def tokenize_via_jieba(cls, text, filter_stop_word=True, norm_flag=True):
        tokens = jieba.lcut(text.lower())
        if filter_stop_word:
            stop_words = config.g_stop_words_cfg
            tokens = filter(lambda x: x not in stop_words, tokens)
            if norm_flag:
                norm_func = cls._normalize_token
                return map(norm_func, tokens)
            else:
                return tokens
        else:
            return tokens

    @classmethod
    def stat_token_freq(cls, in_fpath, out_fpath):
        stop_words = config.g_stop_words_cfg
        try:
            word_counter = Counter()
            with codecs.open(in_fpath, 'r', 'utf-8') as rfd:
                for line in rfd:
                    raw_str, word_seg = line.strip('\n').split('\t')
                    tokens = word_seg.split()
                    tokens = filter(lambda x: x not in stop_words, tokens)
                    tokens = map(cls._normalize_token, tokens)
                    for t in tokens:
                        if ('{[' not in t) and len(t) <= cls._valid_token_len:
                            word_counter[t] += 1
                        else:
                            logger.get().warn('invalid token, token=%s' % (t))
                            # tokenize via jieba
                            for n_t in jieba.cut(t):
                                word_counter[n_t] += 1
                                logger.get().debug('jieba cut, token=%s' %
                                                   (n_t))
            # dump word_counter
            sorted_words = sorted(word_counter.keys(),
                                  key=lambda k: word_counter[k],
                                  reverse=True)
            with codecs.open(out_fpath, 'w', 'utf-8') as wfd:
                for word in sorted_words:
                    tmp = '%s\t%s\n' % (word, word_counter[word])
                    wfd.write(tmp)
        except Exception as e:
            logger.get().warn('errmsg=%s' % (e))

    @classmethod
    def _normalize_token(cls, token):
        token = token.lower()
        try:
            # 11 usually means phone number
            if len(token) != 11 and token.isdigit():
                token = 'int_t'
            for k, v in cls._replace_pattern_cfg.items():
                if v.match(token):
                    token = k
                    break
            if '{[' not in token:
                return token
            for item in cls._wordseg_pattern_cfg:
                token = item.sub('', token)
            return token
        except Exception as e:
            logger.get().warn('token=%s, errmsg=%s' % (token, e))
            return token
Example #34
def addCommentTable(mongodbIP,
                    mysqlhostIP,
                    mysqlUserName='******',
                    mysqlPassword='******',
                    dbname='cctvTimer'):

    # read the stop words
    path = os.path.abspath(os.path.dirname(sys.argv[0]))
    dicFile = open(path + '/tools/NTUSD_simplified/stopwords.txt', 'r')
    stopwords = dicFile.readlines()
    stopwordList = []
    stopwordList.append(' ')
    for stopword in stopwords:
        temp = stopword.strip().replace('\r\n', '').decode('utf8')
        stopwordList.append(temp)
    dicFile.close()

    # word segmentation
    jieba.initialize()

    # connect to the database
    sqlConn = MySQLdb.connect(host=mysqlhostIP,
                              user=mysqlUserName,
                              passwd=mysqlPassword,
                              db=dbname,
                              charset='utf8')
    sqlcursor = sqlConn.cursor()
    # drop the table
    # sqlcursor.execute('''DROP TABLE IF EXISTs commentTable;''')
    # print '删库成功'

    sqlcursor.execute(
        '''CREATE TABLE IF NOT EXISTS commentTable(countIndex bigint(64) primary key, commentId bigint(64), weiboId bigint(64), userId bigint(64), comment varchar(1024), 
                    sentimentKeywords varchar(128), contentKeywords varchar(1024), sentiment varchar(16), sentimentScore int(16), userName varchar(64), userSex varchar(16),
                    userLocation varchar(64), userFollowerCount int(64), userFriendCount int(64), userStatusCount int(64), userType varchar(32), 
                    spammerJudge varchar(16), replyTime varchar(128)) DEFAULT CHARSET=utf8;'''
    )
    print '新建库成功'

    # connect to the MongoDB database
    mongoConn = pymongo.Connection(host=mongodbIP, port=27017)
    # check time
    mongoCursor = mongoConn.weibo.timestamp.find({'type': 'comment'}).limit(1)
    timeRangeBeginning = datetime.datetime.now() - datetime.timedelta(
        days=9999)
    #     print timeRangeBeginning
    #     a=dict()
    #     a['type']='comment'
    #     a['time']=timeRangeBeginning
    #     mongoConn.weibo.timestamp.insert(a)
    #     a=dict()
    #     a['type']='repost'
    #     a['time']=timeRangeBeginning
    #     mongoConn.weibo.timestamp.insert(a)
    #     a=dict()
    #     a['type']='weibo'
    #     a['time']=timeRangeBeginning
    #     mongoConn.weibo.timestamp.insert(a)
    for i in mongoCursor:
        timeRangeBeginning = i['time']
    newTimestamp = timeRangeBeginning
    # query the comments of a given weibo post
    mongoCursor = mongoConn.weibo.comment.find({
        'task_time': {
            '$gt': timeRangeBeginning
        }
    }).sort('task_time').batch_size(30)
    print '查询mongoDB成功'
    # count existing rows
    sqlcursor.execute('select count(*) from commentTable;')
    totalCount = sqlcursor.fetchall()
    totalCount = list(list(totalCount)[0])[0]

    # buffers for comment data
    commentsData = []
    tempData = []
    # sentiment processing
    emProcess = emotionProcess()
    rmIrr = removeIrrelevant()
    spamDet = spammerdetect()

    emotionsWord = []
    emotionsScore = 0
    count = 0
    printCount = totalCount
    # process each comment
    # try:
    for comment in mongoCursor:
        #        if comment['task_time']>timeRangeBeginning+ datetime.timedelta(days=1):
        #            continue
        count += 1
        printCount += 1
        tempData.append(printCount)
        #     comment id
        tempData.append(comment['comment_id'])
        #     weibo id
        tempData.append(comment['weibo_id'])
        #     user id
        tempData.append(comment['comment_user_id'])
        #     comment text
        tempData.append(comment['comment_text'])
        #     sentiment keywords
        (emotionsWord, emotionsScore) = emProcess.processSentence(
            rmIrr.removeEverythingButEmotion(comment['comment_text']))
        emotionsWord = ','.join(emotionsWord)
        tempData.append(emotionsWord)
        #     print comment['mid']
        #     print comment['status']['mid']
        #     print comment['user']['id']
        #     print comment['text']
        # segment the content
        tempcut_out = jieba.cut(rmIrr.removeEverything(
            comment['comment_text']))
        cut_out = []
        for i in tempcut_out:
            if i not in stopwordList:
                cut_out.append(i)
        tempData.append(','.join(cut_out))
        #     polarity judgment
        if emotionsScore > 0:
            tempData.append('正面')
        elif emotionsScore == 0:
            tempData.append('中立')
        else:
            tempData.append('负面')


#         sentimentScore
        tempData.append(emotionsScore)
        #     user nickname
        tempData.append(comment['comment_user_name'])
        #     user gender
        tempData.append(comment['comment_gender'])
        #     user location
        tempData.append(comment['comment_location'])
        #     user follower count
        tempData.append(comment['comment_followers_count'])
        #     user friend count
        tempData.append(comment['comment_friends_count'])
        #     user status count
        tempData.append(comment['comment_statuses_count'])
        #     user type
        if (comment['comment_verified_type'] == -1):
            tempData.append('普通用户')
        elif (comment['comment_verified_type']
              == 220) or (comment['comment_verified_type'] == 200):
            tempData.append('微博达人')
        elif (comment['comment_verified_type'] == 0):
            tempData.append('个人认证')
        else:
            tempData.append('企业认证')
    # spammer or not
        userInfo = {}
        userInfo['statuses_count'] = comment['comment_statuses_count']
        userInfo['followers_count'] = comment['comment_followers_count']
        userInfo['friends_count'] = comment['comment_friends_count']
        userInfo['bi_followers_count'] = comment['comment_bi_followers_count']
        userInfo['domain'] = comment['comment_user_domain']
        userInfo['url'] = comment['comment_url']
        userInfo['description'] = comment['comment_description']
        userInfo['location'] = comment['comment_location']
        userInfo['verified'] = comment['comment_verified']
        userInfo['verified_type'] = comment['comment_verified_type']

        newTimestamp = comment['task_time']

        spamScore = spamDet.detectSpammer(userInfo)
        if spamScore > 0:
            tempData.append("正常")
        else:
            tempData.append("水军")
    #     reply time
        hh = time.strptime(str(comment['comment_created_at']),
                           '%Y-%m-%d %H:%M:%S')
        commentTime = time.strftime("%a %b %d %H:%M:%S %Y", hh)

        tempData.append(commentTime)
        #     convert to a tuple
        commentsData.append(tuple(tempData))
        tempData = []
        if count >= 10:
            sqlcursor.executemany(
                '''insert into commentTable(countIndex, commentId, weiboId, userId, comment, sentimentKeywords, contentKeywords, sentiment, sentimentScore, userName, 
                        userSex,userLocation, userFollowerCount, userFriendCount, userStatusCount, userType,spammerJudge, replyTime) 
                        values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)''',
                commentsData)
            sqlConn.commit()
            commentsData = []
            count = 0
            print '插入' + str(printCount) + '个'
    # # except:
    # #      print tempData
    sqlcursor.executemany(
        '''insert into commentTable(countIndex, commentId, weiboId, userId, comment, sentimentKeywords, contentKeywords, sentiment, sentimentScore, userName, userSex,
                        userLocation, userFollowerCount, userFriendCount, userStatusCount, userType,spammerJudge, replyTime) 
                        values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)''',
        commentsData)
    sqlConn.commit()
    sqlConn.close()
    mongoConn.weibo.timestamp.update({'type': 'comment'},
                                     {'$set': {
                                         'time': newTimestamp
                                     }})
    mongoConn.close()
Example #35
#coding=utf-8
'''
Created on 2014-2-22

@author: yuzhang
'''

import jieba.posseg as jbp
import jieba as jb
import time

jb.enable_parallel()
jb.initialize()
text = '''
'''

start = time.clock()
for i in range(1000000):
    jb.cut(text)
print time.clock() - start

start = time.clock()
for i in range(1000000):
    jbp.cut(text)
print time.clock() - start
Example #36
def test():
  if FLAGS.src_word_seg == 'word':
    import jieba
    jieba.initialize()
  sess = tf.Session()
  src_vocab_dict, _ = data_utils.read_map(FLAGS.source_data + '.' + str(FLAGS.src_vocab_size) + '.mapping')
  _ , trg_vocab_list = data_utils.read_map(FLAGS.target_data + '.' + str(FLAGS.trg_vocab_size) + '.mapping')
  model = create_seq2seq(sess, 'TEST')
  model.batch_size = 1
  
  sys.stdout.write("Input sentence: ")
  sys.stdout.flush()
  sentence = sys.stdin.readline()
  if FLAGS.src_word_seg == 'word':
    sentence = (' ').join(jieba.lcut(sentence))
    print('sentence: ',sentence)
  elif FLAGS.src_word_seg == 'char':
    sentence = (' ').join([s for s in sentence])
  while(sentence):
    token_ids = data_utils.convert_to_token(tf.compat.as_bytes(sentence), src_vocab_dict, False)
    bucket_id = len(buckets) - 1
    for i, bucket in enumerate(buckets):
      if bucket[0] >= len(token_ids):
        bucket_id = i
        break
    # Get a 1-element batch to feed the sentence to the model.
    encoder_input, decoder_input, weight = model.get_batch({bucket_id: [(token_ids, [], "", "")]}, bucket_id)
    # Get output logits for the sentence.
    output = model.run(sess, encoder_input, decoder_input, weight, bucket_id)
    # This is a greedy decoder - outputs are just argmaxes of output_logits.
    
    # beam search all
    if bool(model.beam_search) is True:
        if bool(FLAGS.debug):
            outs = []
            for _ in range(model.beam_size):
                outs.append([])
   
            for out in output:
                for i,o in enumerate(out):
                    outs[i].append(o)
            outs = np.array(outs)
            #print('outs: ',outs.shape)
            outputss = []
            for out in outs:
                #print('out: ',out.shape)
                outputs = [int(np.argmax(logit)) for logit in out]
                outputss.append(outputs)
    
            for i,outputs in enumerate(outputss):
                sys_reply = "".join([tf.compat.as_str(trg_vocab_list[output]) for output in outputs])
                sys_reply = data_utils.sub_words(sys_reply)
                sys_reply = qulify_sentence(sys_reply)
                if i == 0:
                    print(colored("Syetem reply(bs best): " + sys_reply,"red"))
                else:
                    print("Syetem reply(bs all): " + sys_reply)
        else:
            output = model.run(sess, encoder_input, decoder_input, weight, bucket_id)
            outputs = [int(np.argmax(logit, axis=1)) for logit in output]
            if data_utils.EOS_ID in outputs:
              outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            sys_reply = "".join([tf.compat.as_str(trg_vocab_list[output]) for output in outputs])
            sys_reply = data_utils.sub_words(sys_reply)
            sys_reply = qulify_sentence(sys_reply)
            print("Syetem reply(bs best): " + sys_reply)
            

    # MLE
    else:
        output = model.run(sess, encoder_input, decoder_input, weight, bucket_id)
        print(output)
        print('output: ', len(output), output.shape, output[0].shape)
        outputs = [int(np.argmax(logit, axis=1)) for logit in output]
        # If there is an EOS symbol in outputs, cut them at that point.
        if data_utils.EOS_ID in outputs:
          outputs = outputs[:outputs.index(data_utils.EOS_ID)]
        sys_reply = "".join([tf.compat.as_str(trg_vocab_list[output]) for output in outputs])
        sys_reply = data_utils.sub_words(sys_reply)
        sys_reply = qulify_sentence(sys_reply)
        print("Syetem reply(MLE): " + sys_reply)


    # Print out French sentence corresponding to outputs.
    #print("Syetem reply: " + "".join([tf.compat.as_str(trg_vocab_list[output]) for output in outputs]))
    print ("User input  : ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    if FLAGS.src_word_seg == 'word':
      sentence = (' ').join(jieba.lcut(sentence))
      print ('sentence: ', sentence)
    elif FLAGS.src_word_seg == 'char':
      sentence = (' ').join([s for s in sentence])
Example #37
def main():
    jieba.initialize(
        "/home/Ming-Yi/MingYi/Behavior/Behavior/dict/dict.txt.big")
    jieba.load_userdict(
        "/home/Ming-Yi/MingYi/Behavior/Behavior/dict/NameDict_Ch_v2")
    Read_Json_Data("rawdata/" + creat_dir_paht)
Example #38
def main(args):
    # varibales
    input_text_folder = join('..', 'input_ASR_results')

    conn = MongoClient('localhost', 27017)
    db = conn.googlecrawlstream
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('time stamp:', timestamp)
    matchfile_pre = 'fcr23.ws.re.wav.all2'
    matchfile_tmp = 'fcr23.ws.re.wav.all2.res'
    matchfile_result = 'fcr23.ws.re.wav.all2.match'

    #load config
    if args.load_config:
        with open('config', 'r', encoding='utf8') as f:
            config = json.loads(f.readlines()[0].strip())
        outputpath = config['outputpath']
        Nasgoogle_crawl_dir = config['Nasgoogle_crawl_dir']
        ASR_result = config['ASR_result']
        bashfilepath = config['bashfilepath']
        input_text_folder = config['input_text_folder']
        finishpath = config['finishpath']
    else:
        outputpath = args.output_path
        Nasgoogle_crawl_dir = args.google_crawl_dir
        ASR_result = args.pinyin_access_ASR_result_path
        bashfilepath = args.bashfilepath
        input_text_folder = args.python_access_input_folder
        finishpath = args.web_data_path

    #os.makedirs(outputpath, exist_ok=True)
    #os.makedirs(finishpath, exist_ok=True)
    searchEngine = args.search_engine
    # firebaseurl = config['firebaseurl']
    # fb = firebase.FirebaseApplication(firebaseurl,None)

    jieba.set_dictionary('dict.txt.big')
    jieba.initialize()

    # load from input text path
    input_text_path = [
        join(input_text_folder, os.path.basename(x))
        for x in glob.glob(join(input_text_folder, ('*')))
        if '.cm' in x and '.cm2' not in x and '.syl' not in x
    ]
    #print(input_text_path)
    input_text_path = sorted(input_text_path,
                             key=functools.cmp_to_key(myCompare))
    search_enging = search()
    #searchEngine = 'Google'
    for eachTarget in [
            reconstruct_search_words(eachpath, 0.845)
            for eachpath in input_text_path
    ]:
        for filename, keywordlist in eachTarget.items():
            crawlflow = {}
            # get web urls from google each 15 seconds

            logger.info('Start: ' + filename)
            n_segment_urls = {}  # is there a repetition in urls
            alldata = []

            if not args.special_file_name == '':
                if not filename == args.special_file_name:
                    continue
            print(filename, keywordlist)
            thisTurnData = []
            crawlflow['keywordlist'] = keywordlist

            for keyword in keywordlist:
                crawlflow['filename'] = filename
                crawlflow['keyword'] = keyword
                [
                    os.remove(filename) for filename in glob.glob(
                        join(outputpath, ('fcr23.ws.re.wav*')))
                ]
                [
                    os.remove(filename)
                    for filename in glob.glob(join(outputpath, ('*.txt')))
                ]
                [
                    os.remove(filename)
                    for filename in glob.glob(join(outputpath, ('*.line')))
                ]
                tFirstStart = time.time()
                if searchEngine == 'Google':
                    webUrls = search_enging.google_get_url(
                        keyword)  # crawl google
                elif searchEngine == 'Bing':
                    webUrls = search_enging.bing_get_url(
                        keyword)  # crawl google
                crawlflow['searchEngine'] = searchEngine
                crawlflow['webUrls'] = webUrls
                crawlflow['round'] = keywordlist.index(keyword)
                for url in webUrls:
                    if url in n_segment_urls:
                        n_segment_urls[url] += 1
                        webUrls.remove(url)
                    else:
                        n_segment_urls[url] = 1
                #pool = mp.Pool()
                #thisTurnData = pool.map(crawlpage,webUrls)
                thisTurnData = Parallel(n_jobs=-1, backend="threading")(
                    delayed(crawlpage)(url) for url in webUrls)
                alldata.extend(thisTurnData)

                #pool.close()
                #pool.join()

                crawlPagetime = str(int(time.time() - tFirstStart))
                crawlflow['crawlPagetime'] = crawlPagetime
                thisTurnData = [
                    data for data in thisTurnData if len(''.join(data)) < 30000
                    and not data == '' and not data == []
                ]
                after_filter_page_num = len(thisTurnData)
                crawlflow['afterFilterPageNum'] = after_filter_page_num
                if not thisTurnData:
                    crawlflow['filename'] = filename + '-' + str(
                        keywordlist.index(keyword))
                    db[timestamp + 'fail'].insert_one(crawlflow.copy())
                    #fb.post('/'+timestamp+'fail', crawlflow)
                    # crawlflow = {}
                    # crawlflow['filename'] = filename
                    # crawlflow['keywordlist'] = keywordlist
                    continue
                # write down those data from web page
                for data in thisTurnData:
                    webcontent = ''.join(data)
                    if len(webcontent) > 0:
                        with open(join(
                                outputpath, filename + '-' +
                                str(alldata.index(data)) + '.txt'),
                                  'w',
                                  encoding='utf8') as f:
                            f.write(''.join(data))
                # use pin yin to transfer data
                tStart = time.time()
                rq.get(
                    bashfilepath +
                    '?text={}&asr={}'.format(Nasgoogle_crawl_dir, ASR_result))
                tranfPinYintime = str(int(time.time() - tStart))
                crawlflow['tranfPinYintime'] = tranfPinYintime
                tStart = time.time()
                # use match method to find paragraph

                p1 = subprocess.Popen([
                    'python3', 'generate_diff.py',
                    join(outputpath, matchfile_pre),
                    join(outputpath, matchfile_tmp)
                ],
                                      cwd="Match/wav_matched/",
                                      stdout=subprocess.PIPE,
                                      shell=False)
                p1.wait()
                p2 = subprocess.Popen([
                    'python3', 'filter_crawl_result.py',
                    join(outputpath, matchfile_tmp),
                    join(outputpath, matchfile_result)
                ],
                                      cwd="Match/wav_matched/",
                                      stdout=subprocess.PIPE,
                                      shell=False)
                p2.wait()
                matchFunctiontime = str(int(time.time() - tStart))
                crawlflow['matchFunctiontime'] = matchFunctiontime
                # Analyze - read match file and decide to query this file or not
                if analyze(filename, outputpath, finishpath, 0.9, thisTurnData,
                           input_text_path, crawlflow)[0] == 'Get paragraph':
                    # compare crawlflow['oriASRresult'] with crawlflow['paragraph']
                    crawl_compare_match = SequenceMatcher(
                        None, crawlflow['oriASRresult'],
                        crawlflow['paragraph']).get_matching_blocks()
                    same_sents = [
                        crawlflow['oriASRresult'][m[0]:m[0] + m[2]]
                        for m in crawl_compare_match
                    ]
                    same_sents = [
                        sentence for sentence in same_sents
                        if len(sentence) > 1
                    ]
                    crawlflow['oriASRresult'] = crawlflow[
                        'oriASRresult'].replace(' ', '')
                    crawlflow['paragraph'] = crawlflow['paragraph'].replace(
                        ' ', '')
                    opc1 = SequenceMatcher(
                        None, crawlflow['oriASRresult'],
                        crawlflow['paragraph']).get_opcodes()
                    hint_dict = {}
                    for tag, i1, i2, j1, j2 in opc1:
                        if tag == 'replace':
                            hint_dict[(j1, j2)] = crawlflow['paragraph'][j1:j2]
                    jiebacut_result = [
                        w for w in jieba.cut(crawlflow['paragraph'])
                    ]
                    orihints = [
                        crawlflow['paragraph'][j1:j2]
                        for tag, i1, i2, j1, j2 in opc1 if tag == 'replace'
                    ]
                    # hints.extend(same_sents)
                    crawlflow['orihints'] = orihints
                    reconstruct_hints = diff_word_reconstruct(
                        hint_dict, jiebacut_result, crawlflow['paragraph'])
                    hints = reconstruct_hints.copy()
                    crawlflow['reconstruct_hints'] = reconstruct_hints
                    tmpparagraph = crawlflow['paragraph']
                    for hint in hints:
                        tmpparagraph = tmpparagraph.replace(hint, ' ')
                    hints.extend([
                        sent for sent in tmpparagraph.split(' ')
                        if len(sent) > 1
                    ])
                    hintlength = 0
                    tmphint = []
                    for hint in hints:
                        hintlength += len(hint)
                        if hintlength >= 5000:
                            break
                        else:
                            if len(hint) < 100:
                                tmphint.append(hint)

                    crawlflow['hints'] = tmphint
                    db[timestamp].insert_one(crawlflow.copy())
                    #fb.post('/'+timestamp, crawlflow)
                    break
                else:
                    thisTurnData = []
                    crawlflow['filename'] = filename + '-' + str(
                        keywordlist.index(keyword))
                    db[timestamp + 'fail'].insert_one(crawlflow.copy())
                    #fb.post('/'+timestamp+'fail', crawlflow)
                    # crawlflow.clear()
                    # crawlflow['filename'] = filename
                    # crawlflow['keywordlist'] = keywordlist

                    #x = input('wait here')
                tEnd = time.time()
                sleeptime = 15 - int(tEnd - tFirstStart)
                if sleeptime > 0 and searchEngine == 'Google':
                    print('sleep', sleeptime)
                    time.sleep(sleeptime)
Example #39
0
class ProcessHandler(tornado.web.RequestHandler):
    # Global instance to store todos. You should use a database in reality.
    jieba.initialize()
    #mysqlHandler = MysqlHandler('localhost', 'root', 'Aqaz123!', 't5_rent')
    mysqlHandler = MysqlHandler(MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DATABASE)

    detail_json = ""
    with open('/root/product/t5_rent/room_data') as fd:
        txt = fd.read().replace('\n','')
        detail_json = txt.decode('gbk').encode('utf-8')
    key = RSA.generate(2048)
    priv_pem = key.exportKey()
    pub_pem = key.publickey().exportKey()
    priv_key = RSA.importKey(priv_pem)
    pub_key = RSA.importKey(pub_pem)
 
    def send_res(self, result):
        final_result = '{"retcode":-1,"result":[]}'
        if result != NO_RESULT:
            final_result = result
        self.write(final_result)

    def check_user(self, claims):
        return True

    def get(self):
        # return all todos
 
 
        # Just dump data to json, and return it
        operation = self.request.uri
        '''
        try:
            token = self.get_argument('token')
        except:
            payload = { 'site': 'zhiliaohou.online', 'name': 'litong'}
            token = jwt.generate_jwt(payload, self.priv_key, 'RS256', datetime.timedelta(minutes=5))
            self.send_res('{"token":"%s"}' % token)
            return
        if (token == None or token == ""):
            self.send_res(NO_RESULT)
            return 
        else:
            try:
                header, claims = jwt.verify_jwt(token, self.pub_key, ['RS256'])
            except:
                self.send_res(NO_RESULT)
                return
            if (self.check_user(claims) is False):
                self.send_res(NO_RESULT)
                return
        '''
        t1 = int(time.time()*1000)
        if operation.find('house_list') != -1:
            result_list = []
            try:
                queryString = self.get_argument('query')
                print queryString
                seg_list = jieba.cut_for_search(unquote(queryString), HMM=False)
                print seg_list
            except:
                seg_list = None
            print seg_list
            doclist = self.mysqlHandler.GetDocIdList(seg_list)
            ret_dic = []
            for doc in doclist:
                print doc
            docs = self.mysqlHandler.GetContent(doclist)
            for doc in docs:
                (id, title, subdistrict, faceto, floor, year, dinner_num, room_num, fitment, area, pic) = doc
                house = House(id, title, subdistrict, faceto, floor, year, dinner_num, room_num, fitment, area, "", "", pic)
                house_dic = house.to_dict()
                result_list.append(house_dic)
            final_result = {'retcode':len(result_list), 'result': result_list}
            json_result = simplejson.dumps(final_result)
            #print json_result
            #send_str = str(simplejson.loads(json_result)).decode('utf8').encode('raw_unicode_escape')
            #print send_str
            self.send_res(json_result)
            t2 = int(time.time()*1000)
            print "[%s] cost=%d,ret=%d" % ((datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')), t2 - t1, len(result_list))
            return
        elif operation.find('house_detail') != -1:
            self.send_res(self.detail_json)
            t2 = int(time.time()*1000)
            print "[%s] cost=%d,ret=%d" % ((datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')), t2 - t1, 0)
            return
Example #40
0
 def __init__(self):
     jieba.initialize()
     jieba.enable_parallel(8)
Example #41
0
def jieba_initialize():
    jieba.load_userdict(
        os.path.dirname(os.path.split(os.path.realpath(__file__))[0]) +
        '/resources/QAattrdic.txt')
    jieba.initialize()
Example #42
0
 def __init__(self):
     jieba.initialize()
     self.ltpseg = pyltp.Segmentor()
     self.ltpseg.load('model/ltp_data_v3.4.0/cws.model')
     self.thu1 = thulac.thulac(seg_only=True)
     pynlpir.open()
Example #43
0
    def __init__(self, userdict):
        preNormalSeg.__init__(self)

        jb.load_userdict(userdict)
        jb.initialize()
Example #44
0
 def __init__(self) -> None:
     import jieba
     import jieba.posseg as pseg
     self.__tokenize = pseg.cut
     jieba.initialize()
Example #45
0
#http://www.oss.io/p/fxsjy/jieba
'''
1. Word segmentation
jieba.cut takes three parameters: the string to be segmented; cut_all, which switches full mode on or off; and HMM, which controls whether the HMM model is used.
jieba.cut_for_search takes two parameters: the string to be segmented and whether to use the HMM model. It is intended for building inverted indexes in search engines and produces finer-grained tokens.
The input string may be unicode, UTF-8 or GBK. Note: passing GBK strings directly is not recommended, as they may be mis-decoded as UTF-8 in unpredictable ways.
jieba.cut and jieba.cut_for_search both return an iterable generator; use a for loop to obtain each token (unicode), or use
jieba.lcut and jieba.lcut_for_search to get a list directly.
jieba.Tokenizer(dictionary=DEFAULT_DICT) creates a custom tokenizer, which makes it possible to use several dictionaries at the same time. jieba.dt is the default tokenizer, and all module-level segmentation functions are mapped onto it.
'''
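# A minimal sketch (not part of the original snippet) of the list-returning and
# custom-tokenizer APIs mentioned in the docstring above; the printed output
# depends on the loaded dictionary.
import jieba

lcut_tokens = jieba.lcut("我来到北京清华大学")  # a list instead of a generator
lcut_search = jieba.lcut_for_search("小明硕士毕业于中国科学院计算所")  # finer granularity for indexing
tk = jieba.Tokenizer()  # an independent tokenizer; jieba.dt is the default global one
print(lcut_tokens)
print(lcut_search)
print(list(tk.cut("他来到了网易杭研大厦")))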

import jieba
import time

jieba.initialize()  # manual initialization
time.sleep(1)

seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print("Full Mode: " + "/ ".join(seg_list))  # full mode

seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print("Default Mode: " + "/ ".join(seg_list))  # accurate mode

seg_list = jieba.cut("他来到了网易杭研大厦")  # accurate mode by default
print(", ".join(seg_list))

seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")  # search engine mode
print(", ".join(seg_list))
'''
2. Adding a custom dictionary
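# The original snippet is cut off above. A minimal sketch (not from the original)
# of the custom-dictionary APIs the heading refers to; "userdict.txt" is an assumed
# file name used only for illustration.
import os
import jieba

if os.path.exists("userdict.txt"):          # assumed path; skipped if the file is absent
    jieba.load_userdict("userdict.txt")     # one "word freq tag" entry per line
jieba.add_word("石墨烯", freq=100, tag="n")  # add a single word at runtime
jieba.suggest_freq(("中", "将"), tune=True)  # tune frequencies so "中将" is split here
print("/ ".join(jieba.cut("如果放到post中将出错", HMM=False)))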
Example #46
0
 def __init__(self, **kwargs) -> None:
     jieba.initialize()
     pass
Example #47
0
 def __init__(self, general_thesaurus_path):
     """Read the general thesaurus"""
     jieba.initialize(general_thesaurus_path)
Example #48
0
for tk in result:
    print("%s \t start at: %d \t end at: %d" % (tk[0], tk[1], tk[2]))

# In[23]:

# ### Search mode
# Scan out every word in the sentence that can form a valid token and report its position.

result = jieba.tokenize(u"永和服装饰品有限公司", mode="search")
for tk in result:
    print("%s \t start at: %d \t end at: %d" % (tk[0], tk[1], tk[2]))

# In[ ]:

# ## Part 7 Lazy loading

# * jieba loads lazily: import jieba and jieba.Tokenizer() do not trigger dictionary loading; the dictionary is read and the prefix dict is built only once it is needed. If you prefer, you can also initialize jieba by hand.

# In[24]:

import jieba
jieba.initialize()  # manual initialization (optional)

# In[25]:

# Versions before 0.28 could not specify the path of the main dictionary; with lazy loading you can now change it:
# jieba.set_dictionary("data/dict.txt.big")
# Alternatively, download the dictionary you need and overwrite jieba/dict.txt.
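# A minimal sketch (not in the original notebook) of switching the main dictionary
# before anything is segmented; "data/dict.txt.big" is the big dictionary that the
# jieba project distributes separately and is assumed to have been downloaded.
import jieba

jieba.set_dictionary("data/dict.txt.big")  # must happen before the first cut/initialize
jieba.initialize()                         # loads the dictionary configured above
print("/ ".join(jieba.cut("我来到北京清华大学")))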

# In[ ]:
Example #49
0
def jieba_initialize():
    jieba.load_userdict(os.path.dirname(os.path.split(os.path.realpath(__file__))[0])+'/resources/QAattrdic.txt')
    jieba.initialize()
Example #50
0
import codecs
import re
import jieba
#import jieba.posseg
import jieba.analyse
import logging
import os
from functions import *
from hanziconv import HanziConv
import argparse
"""
This is a file to pre-process the data
"""

logging.basicConfig()  # level=logging.NOTSET
jieba.initialize()  # (optional)

parser = argparse.ArgumentParser(
    description='preprocess files to remove unrelated flags')

parser.add_argument('--data_path', type=str, help='the origin data path')
parser.add_argument('--aim_path', type=str, help='the path of processed data ')
parser.add_argument(
    '--process_answer',
    type=lambda s: s.lower() in ['true', 't', 'yes', '1'],
    default=False,
    help='a switch to process answers; set to true when processing training data')

args = parser.parse_args()
fp = codecs.open(args.data_path, "r", "utf-8")
Example #51
0
#!/usr/bin/env python
#encoding:utf8

import jieba
import jieba.posseg as pseg

jieba.initialize(dictionary="dict.txt")

from pyspark import SparkContext

def tokenize(text):
    docid, body = text.split('\t', 1)
    items = []
    for word,flag in pseg.cut(body):
        items.append('%s/%s'%(word,flag))
    result = "%s\t%s"%(docid, ' '.join(items))
    return result

if __name__ == "__main__":
    sc =SparkContext(appName="Python Tokens")
    #input_file = 'liuxufeng/nlp/doc_text/part-00040'
    input_file = 'liuxufeng/nlp/doc_text/*'
    bodies = sc.textFile(input_file)
    items = bodies.map(tokenize) #.collect()
    #for item in items:
    #    print item.encode("utf8")
    items.saveAsTextFile("liuxufeng/nlp/doc_tokens")
    sc.stop()
Example #52
0
                    default=True, help="don't use the Hidden Markov Model")
parser.add_argument("-q", "--quiet", action="store_true", default=False,
                    help="don't print loading messages to stderr")
parser.add_argument("-V", '--version', action='version',
                    version="Jieba " + jieba.__version__)
parser.add_argument("filename", nargs='?', help="input file")

args = parser.parse_args()

if args.quiet:
    jieba.setLogLevel(60)
delim = str(args.delimiter)
cutall = args.cutall
hmm = args.hmm
fp = open(args.filename, 'r') if args.filename else sys.stdin

if args.dict:
    jieba.initialize(args.dict)
else:
    jieba.initialize()
if args.user_dict:
    jieba.load_userdict(args.user_dict)

ln = fp.readline()
while ln:
    l = ln.rstrip('\r\n')
    print(delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)))
    ln = fp.readline()

fp.close()
Example #53
0
# coding=utf8

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import re
import json
import codecs
import jieba
jieba.initialize()   #manual initialize jieba

# import jieba.analyse
import jieba.posseg as pseg
# import redis
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import os
from gensim import corpora, models, similarities
from flask import Flask, request, abort,g,current_app
# from werkzeug.contrib.fixers import ProxyFix
app = Flask(__name__)

project_path = './'
docpath='/home/workspace/news'

# @app.before_first_request
# @app.before_request
def appd():
    app.config['stopwords'] = codecs.open(project_path + 'stopwords.txt', encoding='UTF-8').read()
    app.config['dictionary'] = corpora.Dictionary.load(project_path + 'lsi/' + 'viva.dict')
def fenci_initalize():
	jieba.initialize()
Example #55
0
from jieba import lcut as jc
from jieba import initialize
from pkuseg import pkuseg
from thulac import thulac
from telegram import InputTextMessageContent, InlineQueryResultArticle
from telegram.ext import (
    Updater,
    CommandHandler,
    CallbackContext,
    InlineQueryHandler,
)
from telegram.update import Update

from config import BOT_TOKEN

initialize()

tc = thulac(seg_only=True).cut  # pylint: disable=C0103
pc = pkuseg().cut  # pylint: disable=C0103


def words(update: Update, context: CallbackContext) -> None:
    """Words the inline message."""
    query = update.inline_query.query

    resj = " ".join(jc(query, cut_all=False))
    rest = tc(query, text=True)
    resp = " ".join(pc(query))

    print("--" * 10)
    print(query, end="\n")
Example #56
0
#coding=utf-8

'''
Dictionary-based emotion classification
'''
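# A minimal sketch (not part of the original script) of the dictionary-based idea
# described above: segment a sentence with jieba and score it by counting hits in
# per-emotion word lists. The emotion_words dict is a made-up stand-in for the word
# files that load_word_data() reads further down.
import jieba

emotion_words = {u'愉快': [u'开心', u'高兴'], u'愤怒': [u'生气', u'气愤']}  # hypothetical tiny lexicon

def score_sentence(sentence):
    # count how many segmented tokens appear in each emotion's word list
    tokens = list(jieba.cut(sentence))
    return dict((mood, sum(tokens.count(w) for w in words))
                for mood, words in emotion_words.items())

print(score_sentence(u'今天很开心'))  # e.g. {u'愉快': 1, u'愤怒': 0}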
__author__ = 'Eric_Chan'
import re
import jieba
import chardet
import time

jieba.initialize()  # manually start the jieba module
print 'jieba initialized'
mood_dist = { 0:'厌恶',1:'同情',2:'喜欢',3:'怨恨',4:'悲伤',5:'愉快',6:'愤怒',7:'焦虑',8:'其他'}  # 0 disgust, 1 sympathy, 2 like, 3 resentment, 4 sadness, 5 joy, 6 anger, 7 anxiety, 8 other


def load_word_data(filename):  # load the emotion word dictionaries
    file1 = open('/Users/Har/Desktop/DM/舆情/学习/基于规则的情绪划分/emotion_words/情绪词/%s'%filename,'r')
    line = file1.readline().strip()
    words = []
    while line:
        charset = chardet.detect(line)  # detect the file encoding
        code = charset['encoding']
        # print code
        line = line.decode(code,'ignore')
        words.append(line)
        line = file1.readline().strip()
    file1.close()

    file2 = open('/Users/Har/Desktop/DM/舆情/学习/基于规则的情绪划分/emotion_words/网络新词/%s'%filename,'r')
    line = file2.readline().strip()
Example #57
0
 def __init__(self, ranker):
     jieba.initialize()
     jieba.enable_parallel(8)
     self.ranker = ranker
Example #58
0
def addcustomerEvaluation_informal(hbaseIP,
                                   mysqlhostIP,
                                   mysqlUserName='******',
                                   mysqlPassword='',
                                   dbname='btv'):
    # read the stop-word list
    path = os.path.abspath(os.path.dirname(sys.argv[0]))
    dicFile = open(path + '/tools/NTUSD_simplified/stopwords.txt', 'r')
    stopwords = dicFile.readlines()
    stopwordList = []
    stopwordList.append(' ')
    for stopword in stopwords:
        temp = stopword.strip().replace('\r\n', '').decode('utf8')
        stopwordList.append(temp)
    dicFile.close()

    # word segmentation
    jieba.initialize()
    source_1 = 'weibo'
    # connect to the MySQL database
    sqlConn = MySQLdb.connect(host=mysqlhostIP,
                              user=mysqlUserName,
                              passwd=mysqlPassword,
                              db=dbname,
                              charset='utf8')
    sqlcursor = sqlConn.cursor()

    sqlcursor.execute(
        '''CREATE TABLE IF NOT EXISTS media_evaluation(pk bigint NOT NULL PRIMARY KEY AUTO_INCREMENT, flag int(1), evaluation bigint(20), content varchar(200), date Date,
                    program_id varchar(200), program varchar(200)) DEFAULT CHARSET=utf8;'''
    )
    print 'table created successfully'
    # time-related settings
    # inter == 0 means today
    # the data in the store is dated 2.29
    inter = 37
    now = int(time.time()) - 86400 * inter
    timeArray = time.localtime(now)
    otherStyleTime = time.strftime("%Y-%m-%d", timeArray)
    print otherStyleTime
    # connect to HBase
    conn = happybase.Connection(hbaseIP)
    conn.open()
    # store comment data

    # emotion processing
    emProcess = emotionProcess()
    rmIrr = removeIrrelevant()
    # first fetch the list of programs; note that programs and the related HBase storage info get updated, so keep hbase_info in sync
    # print 'SELECT DISTINCT(program) from hbase_info where source = %s' %source_1
    sqlcursor.execute(
        "SELECT DISTINCT(program) from hbase_info where source = 'buzz';")
    bufferTemp = sqlcursor.fetchall()
    # print len(bufferTemp)
    for one_program in bufferTemp:
        commentsData = []
        tempData = []
        one_program = one_program[0].encode('utf8')
        print type(one_program), one_program
        sqlcursor.execute(
            '''SELECT hbase_table from hbase_info where program = %s and source = %s;''',
            (one_program, source_1))
        bufferTemp = sqlcursor.fetchone()
        program_hbase_table = bufferTemp[0]
        print program_hbase_table
        # use the original Weibo posts with keyword "JQJM" as a stand-in; properly it should be the comments under them
        table = conn.table(str(program_hbase_table))
        # customerEvaluation_informal needs a program identifier
        emotionsWord = []
        emotionsScore = 0
        count = 0
        printCount = 0
        sqlcursor.execute(
            '''SELECT program_id from competition_analysis where program = %s''',
            (one_program, ))
        bufferTemp = sqlcursor.fetchone()
        program_id = bufferTemp[0]
        print program_id
        # row_prefix and limit can bound the number of rows scanned
        for key, data in table.scan(limit=10, batch_size=10):
            # print 'hhh',key,data
            # for key,data in table.scan(row_prefix = 'row', limit = 10, batch_size = 10):
            date_created = data['base_info:cdate']
            if date_created == otherStyleTime:
                content = data['base_info:text']
                print 'q', content
                # no program_id available for now
                # program_id = data['base_info:program_id']

                count += 1
                printCount += 1
                # process each record
                #     emotion keywords
                (emotionsWord, emotionsScore) = emProcess.processSentence(
                    rmIrr.removeEverythingButEmotion(content))
                #     polarity flag: 1 positive, 0 neutral, -1 negative

                #     polarity decision; stricter conditions are applied here
                if emotionsScore > 0:
                    tempData.append('1')
                elif emotionsScore == 0:
                    tempData.append('0')
                elif emotionsScore < 0:
                    tempData.append('-1')
                #     sentiment score (emotionsScore)
                tempData.append(emotionsScore)
                # comment content
                tempData.append(content)
                #     date
                tempData.append(otherStyleTime)
                # program id
                tempData.append(program_id)
                # program name
                tempData.append(one_program)
                #     convert to a tuple
                commentsData.append(tuple(tempData))
                tempData = []
                if count >= 10:
                    sqlcursor.executemany(
                        '''insert into media_evaluation(flag, evaluation, content, date, program_id, program)
                                values (%s, %s, %s, %s, %s, %s)''',
                        commentsData)
                    sqlConn.commit()
                    commentsData = []
                    count = 0
                    print 'inserted ' + str(printCount) + ' rows'
        # # except:
        # #      print tempData
        sqlcursor.executemany(
            '''insert into media_evaluation(flag, evaluation, content, date, program_id, program)
                            values (%s, %s, %s, %s, %s, %s)''', commentsData)
        sqlConn.commit()
    sqlConn.close()
Example #59
0
                    nargs='?', const=' ',
                    help="use DELIM instead of ' / ' for word delimiter; use a space if it is without DELIM")
parser.add_argument("-a", "--cut-all",
                    action="store_true", dest="cutall", default=False,
                    help="full pattern cutting")
parser.add_argument("-n", "--no-hmm", dest="hmm", action="store_false",
                    default=True, help="don't use the Hidden Markov Model")
parser.add_argument("-q", "--quiet", action="store_true", default=False,
                    help="don't print loading messages to stderr")
parser.add_argument("-V", '--version', action='version',
		    version="Jieba " + jieba.__version__)
parser.add_argument("filename", nargs='?', help="input file")

args = parser.parse_args()

if args.quiet:
	jieba.setLogLevel(60)
delim = unicode(args.delimiter)
cutall = args.cutall
hmm = args.hmm
fp = open(args.filename, 'r') if args.filename else sys.stdin

jieba.initialize()
ln = fp.readline()
while ln:
	l = ln.rstrip('\r\n')
	print(delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)).encode('utf-8'))
	ln = fp.readline()

fp.close()
Example #60
0
 def __init__(self):
     config = Config()
     #jieba.set_dictionary(config.zh_dict_txt_big)
     #jieba.load_userdict(config.zh_my_dict)
     jieba.initialize()
     self.model = models.Word2Vec.load(config.word2vec_model_zh)