# Example no. 1
def createWordTree():
    """Build the global sensitive-word trie (``wordTree``).

    Words are loaded from the "sensitive_words" hash in Redis
    (``r_sensitive``); the hash *keys* are the words.  Each trie node is
    ``[children, flag]`` where ``children`` is a 256-slot list indexed by
    character ordinal and ``flag`` is 1 when a word ends at that node.
    """
    # hgetall returns a mapping {word: value}; the words themselves are
    # the keys.  (The previous code json.dumps()-ed the dict, producing a
    # JSON *string*, so the loop below inserted single characters instead
    # of whole words.)
    sensitive_word_list = r_sensitive.hgetall("sensitive_words")
    awords = list(sensitive_word_list.keys())

    for word in awords:
        temp = wordTree
        for a, ch in enumerate(word):
            # NOTE(review): assumes byte strings from Redis, so ord(ch)
            # is < 256 and fits the 256-slot child array — confirm the
            # redis client is not configured to decode responses.
            index = ord(ch)
            if a < len(word) - 1:
                if temp[index] is None:
                    # No node yet: create an interior node.
                    temp[index] = [[None for _ in range(256)], 0]
                elif temp[index] == 1:
                    # A shorter word already ends here: promote the leaf
                    # marker to a full node that still terminates a word.
                    temp[index] = [[None for _ in range(256)], 1]
                temp = temp[index][0]
            else:
                # Last character: mark the word end.  If a longer word
                # already created a child node here, set its flag instead
                # of overwriting it (the old code replaced the node with
                # 1, destroying the longer word's subtree).
                if isinstance(temp[index], list):
                    temp[index][1] = 1
                else:
                    temp[index] = 1
def cal_text_sensitive(item):
    """Count sensitive-word hits in one message and accumulate them into
    the per-user, per-day Redis hash ``sensitive_<ts>`` (field = uid,
    value = JSON dict of word -> count).

    :param item: dict with at least ``'text'``, ``'uid'`` and
                 ``'timestamp'`` keys.
    """
    text = item['text']
    uid = item['uid']
    timestamp = item['timestamp']
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)  # timestamp normalized to the start of the day
    # On py2 ``bytes is str``, so this check is equivalent to the old
    # ``isinstance(text, str)`` while also being correct on py3.
    if isinstance(text, bytes):
        text = text.decode('utf-8', 'ignore')

    # hgetall returns {word: value}; the words are the keys.  (The old
    # code json.dumps()-ed the dict into a JSON string, so the membership
    # test below iterated over single characters, not words.)
    sensitive_words = r_sensitive.hgetall("sensitive_words")

    sensitive_result = [word for word in sensitive_words if word in text]
    if not sensitive_result:
        return

    # Per-message word counts (each matched word counted once per match).
    sensitive_dict = dict()
    for word in sensitive_result:
        sensitive_dict[word] = sensitive_dict.get(word, 0) + 1

    key = 'sensitive_' + str(ts)
    try:
        sensitive_count_string = r_cluster.hget(key, str(uid))
        # hget returns None when the field is missing -> TypeError from
        # json.loads; corrupt JSON -> ValueError.  Both mean "no usable
        # previous counts".
        sensitive_count_dict = json.loads(sensitive_count_string)
        for word, count in sensitive_dict.items():
            sensitive_count_dict[word] = sensitive_count_dict.get(word, 0) + count
        r_cluster.hset(key, str(uid), json.dumps(sensitive_count_dict))
    except (TypeError, ValueError):
        # No existing entry for this user/day: store this message's counts.
        r_cluster.hset(key, str(uid), json.dumps(sensitive_dict))