Example #1
    # poll both the work receiver and the control channel
    poller = zmq.Poller()
    poller.register(receiver, zmq.POLLIN)
    poller.register(controller, zmq.POLLIN)

    parser = ArgumentParser()
    parser.add_argument('-r', '--remote_stub', action='store_true', help='remote stub')
    args = parser.parse_args(sys.argv[1:])
    remote_stub = args.remote_stub

    dbpath = XAPIAN_DB_PATH
    xapian_indexer = XapianIndex(dbpath, SCHEMA_VERSION, remote_stub)

    # enrichment pipeline: each function adds a field to a weibo item
    fill_field_funcs = []
    from consts import XAPIAN_EXTRA_FIELD
    from triple_sentiment_classifier import triple_classifier

    def fill_sentiment(item):
        sentiment = triple_classifier(item)
        item[XAPIAN_EXTRA_FIELD] = sentiment
        return item
    fill_field_funcs.append(fill_sentiment)

    s = load_scws()  # SCWS Chinese word segmenter

    def cut_text(item):
        text = item['text'].encode('utf-8')
        item['terms'] = cut(s, text, cx=False)
        return item
    fill_field_funcs.append(cut_text)
    xapian_index_forever(xapian_indexer, receiver, controller, poller, fill_field_funcs=fill_field_funcs)
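Taken together, fill_field_funcs is a per-item enrichment pipeline that the indexer presumably applies before writing each document. A hedged sketch of that application (the item shape is illustrative; triple_classifier may expect more fields than shown):

item = {'text': u'转发微博'}      # illustrative weibo item
for func in fill_field_funcs:
    item = func(item)             # fill_sentiment, then cut_text
# item now carries XAPIAN_EXTRA_FIELD (a sentiment label) and 'terms'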
Example #2
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import eventlet
from eventlet import wsgi
from xapian_weibo.utils import load_scws
import json
import urllib

JSON_HEADER = [('Content-Type', 'application/json;charset=UTF-8'),
               ("Access-Control-Allow-Origin", "*"),
               ('Server', 'WDC-eventlet')]

s = load_scws()


def cut(text, f=None):
    # Segment `text` with SCWS. `f` is an optional set of part-of-speech
    # tags to keep; a token survives if it is alphanumeric or longer than
    # one UTF-8 CJK character (3 bytes).
    if f:
        return [
            token[0].decode('utf-8') for token in s.participle(text)
            if token[1] in f and (token[0].isalnum() or len(token[0]) > 3)
        ]
    return [
        token[0].decode('utf-8') for token in s.participle(text)
        if token[0].isalnum() or len(token[0]) > 3
    ]
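
A hedged usage sketch of cut (the text and the POS filter set are illustrative; the SCWS tag names are assumptions, not from the source):

tokens = cut('我们都爱自然语言处理')                    # no POS filter
nouns = cut('我们都爱自然语言处理', f=set(['n', 'nr', 'ns']))
print ' '.join(tokens)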


def word_seg(env, start_response):
    # Truncated in the original; a minimal reconstruction: segment the
    # text passed in the query string and return the tokens as JSON.
    text = urllib.unquote(env.get('QUERY_STRING', ''))
    start_response('200 OK', JSON_HEADER)
    return [json.dumps({'tokens': cut(text)})]
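
The handler would then be mounted on eventlet's WSGI server; a hedged launcher sketch (the bind address and port are assumptions, not from the source):

if __name__ == '__main__':
    # hypothetical entry point; port 8080 is an assumption
    wsgi.server(eventlet.listen(('0.0.0.0', 8080)), word_seg)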
Example #3
import nltk
import re
from gensim import corpora, models, similarities
import math
import string
from nltk import probability
from nltk.probability import FreqDist
import cPickle as pickle
import leveldb
from xapian_weibo.xapian_backend import XapianSearch
from xapian_weibo.xapian_backend_extra import _load_weibos_from_xapian
from xapian_weibo.utils import load_scws
from xapian_weibo.utils import cut


cut_str = load_scws()

# sentiment class labels
HAPPY = 1
ANGRY = 2
SAD = 3


def emoticon(zan_set, angry_set, sad_set, text):
    """text is the weibo text, not keywords"""
    emotion_pattern = r'\[(\S+?)\]'
    emotions = re.findall(emotion_pattern, text)
    # count how many extracted emoticons fall into each sentiment lexicon
    zan = len([e for e in emotions if e in zan_set])
    angry = len([e for e in emotions if e in angry_set])
    sad = len([e for e in emotions if e in sad_set])
    # Truncated in the original; a plausible completion: return the label
    # of the dominant lexicon, or None when no emoticon matches.
    count, label = max([(zan, HAPPY), (angry, ANGRY), (sad, SAD)])
    return label if count > 0 else None
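
A hedged usage sketch: the three lexicons are plain sets of bracketed emoticon names (the set contents here are made up for illustration):

zan_set = set(['哈哈', '鼓掌'])
angry_set = set(['怒', '鄙视'])
sad_set = set(['泪', '伤心'])
print emoticon(zan_set, angry_set, sad_set, '今天好开心[哈哈][鼓掌]')  # -> 1 (HAPPY)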
Example #4
# -*- coding: utf-8 -*-

# flag weibos that contain known emoticons
from __future__ import division
import re
import opencc
import os
from gensim import corpora
import cPickle as pickle
from xapian_weibo.utils import load_scws, cut, load_emotion_words

AB_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')

cut_str = load_scws()

cc = opencc.OpenCC('s2t', opencc_path='/usr/bin/opencc')  # simplified -> traditional converter
# build the emoticon lexicon: original words plus their traditional-Chinese variants
emotions_words = load_emotion_words()
emotions_words = [unicode(e, 'utf-8') for e in emotions_words]
t_emotions_words = [cc.convert(e) for e in emotions_words]
emotions_words.extend(t_emotions_words)
emotions_words = [w.encode('utf-8') for w in emotions_words]
emotions_words_set = set(emotions_words)
emotion_pattern = re.compile(r'\[(\S+?)\]')


def if_emoticoned_weibo(r):
    # does the weibo contain any emoticon from the lexicon?
    emotions = re.findall(emotion_pattern, r['text'])
    is_emoticoned = 1 if set(emotions) & emotions_words_set else 0
    return is_emoticoned
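
A hedged usage sketch (the record and its emoticons are made up; whether they match depends on what load_emotion_words returns):

r = {'text': '这也太过分了[怒][鄙视]'}
print if_emoticoned_weibo(r)  # 1 if any bracketed emoticon is in the lexicon, else 0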