def update_day_sensitive(uid_list):
    results = {}
    count = 0
    for uid in uid_list:
        results[uid] = {"sensitive": 0, "sensitive_words_string": "", "sensitive_words_dict": json.dumps({})}
    now_ts = time.time()
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts('2013-09-02')
    sensitive_results = redis_cluster.hmget("sensitive_"+str(now_date_ts), uid_list)
    for item in sensitive_results:
        if not item:
            count += 1
            continue
        uid = uid_list[count]
        item = json.loads(item)
        sensitive_index = 0
        # use a distinct name for the per-word count so the uid index above is not clobbered
        for word, word_count in item.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", word)
            if tmp_stage:
                tmp = json.loads(tmp_stage)
                sensitive_index += sensitive_score_dict[str(tmp[0])] * word_count
        sensitive_words_string = "&".join(item.keys())
        results[uid] = {'sensitive': sensitive_index, "sensitive_words_string": sensitive_words_string, "sensitive_words_dict": json.dumps(item)}
        count += 1

    return results
def update_day_sensitive(uid_list):
    results = {}
    for uid in uid_list:
        results[uid] = {"sensitive": 0, "sensitive_words_string": "", "sensitive_words_dict": json.dumps({})}
    now_ts = time.time()
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts("2013-09-03")
    sensitive_results = redis_cluster.hmget("sensitive_" + str(now_date_ts), uid_list)
    # hmget returns one JSON string (or None) per uid, in uid_list order
    for uid, item in zip(uid_list, sensitive_results):
        if not item:
            continue
        words_dict = json.loads(item)
        sensitive_index = 0
        for word, word_count in words_dict.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", word)
            if tmp_stage:
                tmp = json.loads(tmp_stage)
                sensitive_index += sensitive_score_dict[str(tmp[0])] * word_count
        sensitive_words_string = "&".join(words_dict.keys())
        results[uid] = {
            "sensitive": sensitive_index,
            "sensitive_words_string": sensitive_words_string,
            "sensitive_words_dict": json.dumps(words_dict),
        }

    return results
def createWordTree():
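    # Builds a byte-indexed trie (DFA) over the words stored in the 'sensitive_words'
    # redis hash.  Each slot is either None (no edge), 1 (a word ends on this edge),
    # or a node of the form [children_list_of_256_slots, end_flag].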
    wordTree = [None for x in range(256)]
    wordTree.append(0)
    nodeTree = [wordTree, 0]
    awords = []
    #for b in open('sensitive_words.txt', 'rb'):
    #    awords.append(b.strip())
    awords = r.hkeys('sensitive_words')    
    print awords
    for word in awords:
        temp = wordTree
        for a in range(0,len(word)):
            index = ord(word[a])
            if a < (len(word) - 1):
                if temp[index] is None:
                    node = [[None for x in range(256)],0]
                    temp[index] = node
                elif temp[index] == 1:
                    node = [[None for x in range(256)],1]
                    temp[index] = node
                
                temp = temp[index][0]
            else:
                temp[index] = 1

    return nodeTree 
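# searchWord(text, DFA) is called further down in this dump but its body is not included
# here; the function below is only a sketch (not the original implementation) of how the
# byte-trie returned by createWordTree could be scanned.  It returns {word: count} for a
# utf-8 encoded text.
def searchWord_sketch(text, nodeTree):
    tree = nodeTree[0]
    found = {}
    i = 0
    while i < len(text):
        temp = tree
        j = i
        while j < len(text):
            node = temp[ord(text[j])]
            if node is None:
                break
            if node == 1:                     # leaf marker: a word ends on this byte
                word = text[i:j + 1]
                found[word] = found.get(word, 0) + 1
                break
            if node[1] == 1:                  # interior node that also terminates a word
                word = text[i:j + 1]
                found[word] = found.get(word, 0) + 1
            temp = node[0]
            j += 1
        i += 1
    return found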
def update_day_sensitive(uid_list):
    results = {}
    all_results = {}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    today_sensitive_dict = {}
    for i in range(WEEK,0,-1):
        ts = now_date_ts - DAY*i
        count = 0
        sensitive_results = r_cluster_3.hmget('sensitive_'+str(ts), uid_list)
        #print 'sensitive_results:', sensitive_results
        for uid in uid_list:
            if uid not in results:
                results[uid] = {}
            sensitive_item = sensitive_results[count]
            if uid not in today_sensitive_dict:
                today_sensitive_dict[uid] = {}
            if sensitive_item:
                sensitive_dict = json.loads(sensitive_item)
            else:
                sensitive_dict = {}
            for sensitive in sensitive_dict:
                try:
                    results[uid][sensitive] += 1
                except:
                    results[uid][sensitive] = 1
                if ts == now_date_ts - DAY:
                    try:
                        today_sensitive_dict[uid][sensitive] += 1
                    except:
                        today_sensitive_dict[uid][sensitive] = 1
            count += 1
    #print 'results:', results
    for uid in uid_list:
        user_sensitive_dict = results[uid]
        #print 'uid,sensitive:', uid, user_sensitive_dict
        sensitive_score = 0
        today_sensitive_dict_user = today_sensitive_dict[uid]
        for item in today_sensitive_dict_user:
            k = item
            v = today_sensitive_dict_user[k]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
            #print 'sensitive_score:', sensitive_score
        sensitive_string = '&'.join(user_sensitive_dict.keys())
        #print 'uid, sensitive:', uid, sensitive_string, sensitive_score
        all_results[uid] = {'sensitive_string': sensitive_string, 'sensitive_dict':json.dumps(user_sensitive_dict),\
                'sensitive': sensitive_score}
    #print 'all_results:', all_results
    return all_results
def expand_index_action(item):
    index_body = {}
    index_body['uid'] = str(item['uid'])
    index_body['user_fansnum'] = int(item.get('user_fansnum', 0))
    index_body['text'] = item['text']
    index_body['mid'] = str(item['mid'])
    index_body['sentiment'] = str(item['sentiment'])
    index_body['timestamp'] = int(item['timestamp'])
    index_body['message_type'] = item['message_type']
    index_body['keywords_dict'] = item['keywords_dict']
    index_body['keywords_string'] = item['keywords_string']
    index_body['sensitive_words_string'] = item['sensitive_words_string']
    index_body['sensitive_words_dict'] = item['sensitive_words_dict']
    index_body['retweeted'] = 0
    index_body['comment'] = 0
    index_body['sensitive'] = 0
    sensitive_words_dict = json.loads(item['sensitive_words_dict'])
    if sensitive_words_dict:
        score = 0
        for k,v in sensitive_words_dict.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", k)
            if tmp_stage:
                score += v*sensitive_score_dict[str(tmp_stage)]
        index_body['sensitive'] = score
    if item['message_type'] == 3:
        #for retweet message: get directed retweet uname and uid 
        directed_uid, directed_uname = get_directed_retweet(item['text'], item['root_uid'])
        if directed_uid:
            index_body['directed_uid'] = int(directed_uid)
        else:
            #index_body['directed_uid'] = directed_uid
            index_body['directed_uid'] = 0
        index_body['directed_uname'] = directed_uname
        index_body['root_mid'] = str(item['root_mid'])
        index_body['root_uid'] = str(item['root_uid'])
    elif item['message_type'] == 2:
        #for comment message: get directed comment uname and uid
        directed_uid, directed_uname = get_directed_comment(item['text'], item['root_uid'])
        if directed_uid:
            index_body['directed_uid'] = int(directed_uid)
        else:
            #index_body['directed_uid'] = directed_uid
            index_body['directed_uid'] = 0
        index_body['directed_uname'] = directed_uname
        index_body['root_mid'] = str(item['root_mid'])
        index_body['root_uid'] = str(item['root_uid'])

    ip = item['send_ip']
    index_body['ip'] = ip
    index_body['geo'] = ip2city(ip) # output format: country&province&city, e.g. 中国&河北&石家庄
    
    action = {'index': {'_id': index_body['mid']}}
    xdata = index_body
    return action, xdata
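# Usage sketch (not part of the original code): the (action, xdata) pairs from
# expand_index_action are meant to be flattened into an Elasticsearch bulk body, the
# same way the other es.bulk(...) calls in this dump are built.  The es client and the
# index/type names are assumed to be supplied by the surrounding project.
def bulk_index_weibo_sketch(es, weibo_items, index_name, index_type):
    bulk_action = []
    for weibo_item in weibo_items:
        action, xdata = expand_index_action(weibo_item)
        bulk_action.extend([action, xdata])
    if bulk_action:
        es.bulk(bulk_action, index=index_name, doc_type=index_type, timeout=600)
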
# -*- coding:utf-8 -*-
from openpyxl import load_workbook
import redis
import random
import json
import sys
reload(sys)
sys.path.append('../')
from global_utils import R_ADMIN as r

data = load_workbook('sensitive_words.xlsx')
table = data.get_sheet_by_name('Sheet2')
category = ['政治', '军事', '法律', '意识形态', '民运']
#print table.cell('A1').value
for i in range(1,549):
    word = table.cell(row=i, column=0).value
    level = table.cell(row=i, column=1).value
    index = random.randint(0,4)
    r.hset('sensitive_words', word, json.dumps([level, category[index]]))
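# NOTE: this loader stores a JSON list [level, category] per word; it matches the
# consumers in this dump that json.loads() the hash value and index tmp[0].  The simpler
# loaders below store the bare level, matching the consumers that use
# sensitive_score_dict[str(tmp_stage)] directly.  The two formats are not interchangeable.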

#r.hset('recommend_sensitive_words_20130901', '洪秀柱', json.dumps([['1093622153', '3270699555'], 3]))
#r.hset('recommend_sensitive_words_20130901', '港灿', json.dumps([['1686546714'], 1]))
#r.hset('recommend_sensitive_words_20130901', '钟屿晨', json.dumps([['1887344341', '3183040584'], 2]))


Example #8
# -*- coding:utf-8 -*-

from openpyxl import load_workbook
import redis
import json
import sys
reload(sys)
sys.path.append('../../')
from global_utils import R_ADMIN as r
#r = redis.StrictRedis(host="10.128.55.69", port="6379", db=15)
data = load_workbook('sensitive_words.xlsx')
table = data.get_sheet_by_name('Sheet2')
for i in range(1,549):
    word = table.cell(row=i, column=0).value
    level = table.cell(row=i, column=1).value
    r.hset('sensitive_words',word, level)
print r.hkeys('sensitive_words')




def test(ft_type):
    print ft_type
    if ft_type == 'facebook':
        index_name_pre = facebook_flow_text_index_name_pre
        index_type = facebook_flow_text_index_type
        user_index_name = facebook_user_index_name
        user_index_type = facebook_user_index_type
    else:
        index_name_pre = twitter_flow_text_index_name_pre
        index_type = twitter_flow_text_index_type
        user_index_name = twitter_user_index_name
        user_index_type = twitter_user_index_type

    # date_list = load_date_list(True)
    date_list = load_date_list()

    DFA = createWordTree()
    query_body = {
        'post_filter': {
            'missing': {
                'field': 'keywords_string'
            }
        },
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [{
                            'range': {
                                'flag_ch': {
                                    'gte': -1
                                }
                            }
                        }]
                    }
                }
            }
        }
    }
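    # query_body above selects documents that still lack a keywords_string field
    # (the "missing" post_filter) and whose flag_ch is >= -1, i.e. posts not yet enriched.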
    for date in date_list:
        count = 0
        bulk_action = []
        index_name = index_name_pre + date
        try:
            es_scan_results = scan(es,
                                   query=query_body,
                                   size=1000,
                                   index=index_name,
                                   doc_type=index_type)
            while True:
                try:
                    scan_data = es_scan_results.next()
                    item = scan_data['_source']
                    text = item['text_ch']
                    uid = item['uid']
                    if ft_type == 'facebook':
                        _id = item['fid']
                    else:
                        _id = item['tid']

                    ts = datetime2ts(date)
                    #add sentiment field to weibo

                    sentiment, keywords_list = triple_classifier(item)

                    #add key words to weibo
                    keywords_dict, keywords_string = get_weibo_keywords(
                        keywords_list)

                    #sensitive_words_dict
                    sensitive_words_dict = searchWord(
                        text.encode('utf-8', 'ignore'), DFA)
                    if sensitive_words_dict:
                        sensitive_words_string_data = "&".join(
                            sensitive_words_dict.keys())
                        sensitive_words_dict_data = json.dumps(
                            sensitive_words_dict)
                    else:
                        sensitive_words_string_data = ""
                        sensitive_words_dict_data = json.dumps({})

                    #redis
                    if sensitive_words_dict:
                        sensitive_count_string = r_cluster.hget(
                            'sensitive_' + str(ts), str(uid))
                        if sensitive_count_string:  # redis may return an empty value
                            sensitive_count_dict = json.loads(
                                sensitive_count_string)
                            for word in sensitive_words_dict.keys():
                                if word in sensitive_count_dict:
                                    sensitive_count_dict[
                                        word] += sensitive_words_dict[word]
                                else:
                                    sensitive_count_dict[
                                        word] = sensitive_words_dict[word]
                            r_cluster.hset('sensitive_' + str(ts), str(uid),
                                           json.dumps(sensitive_count_dict))
                        else:
                            r_cluster.hset('sensitive_' + str(ts), str(uid),
                                           json.dumps(sensitive_words_dict))

                    #sensitive
                    sensitive_score = 0
                    if sensitive_words_dict:
                        for k, v in sensitive_words_dict.iteritems():
                            tmp_stage = r_sensitive.hget("sensitive_words", k)
                            if tmp_stage:
                                sensitive_score += v * sensitive_score_dict[
                                    str(tmp_stage)]

                    #directed_uid
                    directed_uid_data = 0
                    directed_uid, directed_uname = get_root_retweet(
                        text, uid, ft_type)
                    if directed_uid:
                        directed_uid_data = long(directed_uid)

                    # hashtag
                    hashtag = ''
                    RE = re.compile(
                        u'#([0-9a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)[ ," =.。: :、]'
                    )
                    hashtag_list = re.findall(RE, text)
                    if hashtag_list:
                        hashtag = '&'.join(hashtag_list)

                    #action
                    action = {'update': {'_id': _id}}

                    # action_data
                    action_data = {
                        'sentiment': str(sentiment),
                        'keywords_dict': json.dumps(keywords_dict),
                        'keywords_string': keywords_string,
                        'sensitive_words_string': sensitive_words_string_data,
                        'sensitive_words_dict': sensitive_words_dict_data,
                        'sensitive': sensitive_score,
                        'directed_uid': directed_uid_data,
                        'directed_uname': directed_uname,
                        'hashtag': hashtag,
                    }

                    bulk_action.extend([action, {'doc': action_data}])
                    count += 1

                    if count % 1000 == 0 and count != 0:
                        if bulk_action:
                            es.bulk(bulk_action,
                                    index=index_name,
                                    doc_type=index_type,
                                    timeout=600)
                        bulk_action = []
                        count = 0
                except StopIteration:
                    break
            if bulk_action:

                es.bulk(bulk_action,
                        index=index_name,
                        doc_type=index_type,
                        timeout=600)
        except Exception, e:  # the ES index/document may not exist
            print e
Example #10
def compute_topic_task():
    print time.time()
    while True:
        #print r.rpop(topic_queue_name)
        task = r.rpop(topic_queue_name)
        if not task:
            break
        else:
            task = json.loads(task)
            print task
            topic = task['name']
            en_name = task['en_name']
            start_ts = int(task['start_ts'])  #timestamp
            end_ts = int(task['end_ts'])  #timestamp
            submit_user = task['submit_user']
            comput_status = task['comput_status']
            task_id = str(start_ts) + '_' + str(
                end_ts) + '_' + en_name + '_' + submit_user

            exist_flag = exist(task_id)
            #get_topic_weibo(topic,en_name,start_ts,end_ts)
            if exist_flag:
                #start compute
                #try:
                weibo_es.update(index=topic_index_name,
                                doc_type=topic_index_type,
                                id=task_id,
                                body={'doc': {
                                    'comput_status': -1
                                }})
                print 'finish change status'
                #geo

                repost_search(en_name, start_ts, end_ts)
                print 'finish geo_1 analyze'
                cityTopic(en_name, start_ts, end_ts)
                print 'finish geo analyze'
                #language
                count_fre(en_name,
                          start_ts=start_ts,
                          over_ts=end_ts,
                          news_limit=NEWS_LIMIT,
                          weibo_limit=MAX_LANGUAGE_WEIBO)
                print 'finish language analyze'
                #time
                propagateCronTopic(en_name, start_ts, end_ts)
                print 'finish time analyze'

                #network
                compute_network(en_name, start_ts, end_ts)
                print 'finish network analyze'

                #sentiment
                sentimentTopic(en_name, start_ts=start_ts, over_ts=end_ts)
                print 'finish sentiment analyze'
                #finish compute
                print weibo_es.update(index=topic_index_name,
                                      doc_type=topic_index_type,
                                      id=task_id,
                                      body={
                                          'doc': {
                                              'comput_status': 1,
                                              'finish_ts': int(time.time())
                                          }
                                      })
                print 'finish change status done'
                # except:
                # 	raise
                # 	break
            else:
                pass
Example #11
def expand_index_action(item):
    index_body = {}
    index_body['uid'] = str(item['uid'])
    index_body['text'] = item['text']
    index_body['tid'] = str(item['tid'])
    index_body['sentiment'] = str(item['sentiment'])
    index_body['timestamp'] = int(item['timestamp'])
    #index_body['message_type'] = item['message_type']
    index_body['keywords_dict'] = item['keywords_dict']
    index_body['keywords_string'] = item['keywords_string']
    index_body['sensitive_words_string'] = item['sensitive_words_string']
    index_body['sensitive_words_dict'] = item['sensitive_words_dict']
    sensitive_words_dict = json.loads(item['sensitive_words_dict'])
    score = 0
    if sensitive_words_dict:
        #score = 0
        for k, v in sensitive_words_dict.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", k)
            if tmp_stage:
                score += v * sensitive_score_dict[str(tmp_stage)]
        #index_body['sensitive'] = score
    index_body['sensitive'] = score

    #for retweet message: get directed retweet uname and uid
    # directed_uid, directed_uname = get_directed_retweet(item['text'], item['root_uid'])
    directed_uid, directed_uname = get_root_retweet(item['text'], item['uid'])
    if directed_uid:
        index_body['directed_uid'] = long(directed_uid)
    else:
        #index_body['directed_uid'] = directed_uid
        index_body['directed_uid'] = 0
    index_body['directed_uname'] = directed_uname
    # if item['message_type'] == 3:
    #     #for retweet message: get directed retweet uname and uid
    #     directed_uid, directed_uname = get_directed_retweet(item['text'], item['root_uid'])
    #     if directed_uid:
    #         index_body['directed_uid'] = int(directed_uid)
    #     else:
    #         #index_body['directed_uid'] = directed_uid
    #         index_body['directed_uid'] = 0
    #     index_body['directed_uname'] = directed_uname
    #     index_body['root_tid'] = str(item['root_tid'])
    #     index_body['root_uid'] = str(item['root_uid'])
    # elif item['message_type'] == 2:
    #     #for comment meesage: get directed comment uname and uid
    #     directed_uid, directed_uname = get_directed_comment(item['text'], item['root_uid'])
    #     if directed_uid:
    #         index_body['directed_uid'] = int(directed_uid)
    #     else:
    #         #index_body['directed_uid'] = directed_uid
    #         index_body['directed_uid'] = 0
    #     index_body['directed_uname'] = directed_uname
    #     index_body['root_tid'] = str(item['root_tid'])
    #     index_body['root_uid'] = str(item['root_uid'])

    # ip = item['send_ip']
    # index_body['ip'] = ip
    # index_body['geo'] = ip2city(ip) #output: 中国&河北&石家庄

    action = {'index': {'_id': index_body['tid']}}
    xdata = index_body
    return action, xdata
Example #12
def get_flow_information(uid_list):
    results = {}
    #results = {uid:{'hashtag_dict':{},'hashtag':'', 'keywords_dict':{}, 'keywords_string':'', 'activity_geo':'', 'activity_geo_dict':dict}}
    iter_results = {
    }  # iter_results = {uid:{'hashtag': hashtag_dict, 'geo':geo_dict, 'keywords':keywords_dict}}
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    #test
    now_date_ts = test_ts
    for i in range(7, 0, -1):
        ts = now_date_ts - DAY * i
        iter_date = ts2datetime(ts)
        flow_text_index_name = flow_text_index_name_pre + iter_date
        uid_day_geo = {}
        #compute hashtag and geo
        hashtag_results = r_cluster.hmget('hashtag_' + str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        #compute sensitive_words
        sensitive_results = r_cluster.hmget('sensitive_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            #init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {
                    'hashtag': {},
                    'geo': {},
                    'geo_track': [],
                    'keywords': {},
                    'filter_keywords': {},
                    'sensitive': {}
                }
            #compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[
                        hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[
                        hashtag]
            #compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][
                        sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][
                        sensitive_word] = uid_sensitive_dict[sensitive_word]
            #compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo = ip2city(ip)
                if geo:
                    #print 'geo:', geo
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1

        #compute keywords:
        try:
            text_results = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type, \
                                               body={'query':{'filtered':{'filter':{'terms':{'uid': uid_list}}}}, 'size':MAX_VALUE}, _source=True, fields=['uid', 'keywords_dict','text'])['hits']['hits']
        except:
            text_results = {}
        for item in text_results:
            #print 'keywords item:', item
            uid = item['fields']['uid'][0]
            uid_keywords_dict = json.loads(item['fields']['keywords_dict'][0])
            for keywords in uid_keywords_dict:
                try:
                    iter_results[uid]['keywords'][
                        keywords] += uid_keywords_dict[keywords]
                except:
                    iter_results[uid]['keywords'][
                        keywords] = uid_keywords_dict[keywords]

            #jln filter keyword 2016/11/08
            weibo_text = json.loads(item['fields']['text'][0])
            filter_keywords_dict = get_weibo_single(weibo_text)

            for keywords in filter_keywords_dict:
                try:
                    iter_results[uid]['filter_keywords'][
                        keywords] += filter_keywords_dict[keywords]
                except:
                    iter_results[uid]['filter_keywords'][
                        keywords] = filter_keywords_dict[keywords]

    #get keywords top
    for uid in uid_list:
        results[uid] = {}
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        #sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        for item in sensitive_word_dict:
            k = item
            v = sensitive_word_dict[k]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        results[uid]['sensitive'] = sensitive_score
        #print 'sensitive_dict:', results[uid]['sensitive_dict']
        #print 'sensitive_string:', results[uid]['sensitive_string']
        #print 'sensitive:', results[uid]['sensitive']
        #geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        #print 'geo_dict_keys:', geo_dict_keys
        results[uid]['activity_geo'] = '&'.join(
            ['&'.join(item.split('\t')) for item in geo_dict_keys])
        #print 'activity_geo:',  results[uid]['activity_geo']

        keywords_dict = iter_results[uid]['keywords']
        keywords_top50 = sorted(keywords_dict.items(),
                                key=lambda x: x[1],
                                reverse=True)[:50]
        keywords_top50_string = '&'.join(
            [keyword_item[0] for keyword_item in keywords_top50])

        filter_keywords_dict = iter_results[uid]['filter_keywords']
        f_keywords_top50 = sorted(filter_keywords_dict.items(),
                                  key=lambda x: x[1],
                                  reverse=True)[:50]
        f_keywords_top50_string = '&'.join(
            [keyword_item[0] for keyword_item in f_keywords_top50])

        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string

        results[uid]['filter_keywords'] = json.dumps(f_keywords_top50)
        results[uid]['filter_keywords_string'] = f_keywords_top50_string
    return results
def main():
    if RUN_TYPE:
        now_ts = time.time()-DAY # the previous day
        now_ts = datetime2ts('2016-03-24') # fixed test date; overrides the line above
        ts = str(datetime2ts(ts2datetime(now_ts)))
    else:
        ts = str(datetime2ts('2016-03-16'))
    now_ts = int(ts)
    print now_ts
    sensitive_string = "sensitive_" + ts
    date_string = ts
    update_sensitive_key = "sensitive_score_" + ts # key being updated today
    sensitive_dict_key = "sensitive_dict_" + ts
    sensitive_string_key = "sensitive_string_" + ts
    sensitive_day_change_key = "sensitive_" + ts +"_day_change"
    del_month = datetime2ts(ts2datetime(now_ts - MONTH))
    del_sensitive_key = "sensitive_score_"+str(del_month) # key to delete (one month old)

    former_ts = int(ts) - DAY
    former_date = str(datetime2ts(ts2datetime(former_ts)))
    former_sensitive_key = "sensitive_score_" + former_date

    iter_count = 0
    bulk_action = []

    mappings(ES_SENSITIVE_INDEX)
    total_number = r.hlen(sensitive_string)
    scan_cursor = 0
    print total_number

    while 1:
        re_scan = r.hscan(sensitive_string, scan_cursor, count=1000)
        scan_cursor = re_scan[0]
        if len(re_scan[1]) != 0:
            sensitive_info = re_scan[1] # dict form, uid: sensitive_words_dict (JSON string)
            uid_list = sensitive_info.keys()
            sensitive_results = es.mget(index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX, body={"ids":uid_list})['docs']
            if sensitive_results:
                for item in sensitive_results:
                    uid = item['_id']
                    sensitive_words_dict = json.loads(sensitive_info[uid]) # json.loads
                    current_sensitive_score = 0
                    for k,v in sensitive_words_dict.iteritems():
                        tmp_stage = r_sensitive.hget("sensitive_words", k)
                        if tmp_stage:
                            current_sensitive_score += v*sensitive_score_dict[str(tmp_stage)]
                    if item['found']: # a record already exists for this uid
                        revise_item = item["_source"]
                        if del_sensitive_key in revise_item:
                            revise_item.pop(del_sensitive_key)
                        revise_item['uid'] = uid
                        # newly computed sensitivity score
                        revise_item[update_sensitive_key] = current_sensitive_score
                        # newly computed sensitive words
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        # newly computed sensitive words string
                        revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys())
                        # change versus the previous day and the week/month averages
                        revise_item['sensitive_day_change'] = current_sensitive_score - revise_item.get(former_sensitive_key, 0)
                        revise_item['sensitive_week_change'] = current_sensitive_score - revise_item.get('sensitive_week_ave', 0)
                        revise_item['sensitive_month_change'] = current_sensitive_score - revise_item.get('sensitive_month_ave', 0)
                        # recompute weekly/monthly mean, variance and sum
                        revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
                        revise_item['sensitive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)

                    else:
                        revise_item = dict()
                        revise_item['uid'] = uid
                        revise_item[update_sensitive_key] = current_sensitive_score
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys())
                        revise_item['sensitive_day_change'] = current_sensitive_score
                        revise_item['sensitive_week_change'] = current_sensitive_score
                        revise_item['sensitive_month_change'] = current_sensitive_score
                        revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
                        revise_item['sensitive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)
                    action = {'index':{'_id': uid}}
                    bulk_action.extend([action, revise_item])
                    iter_count += 1
                    if iter_count % 1000 == 0:
                        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
                        bulk_action = []
                        print iter_count
        if int(scan_cursor) == 0:
            break
    if bulk_action:
        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)

    print iter_count
Example #14
def main():
    if RUN_TYPE:
        now_ts = time.time() - DAY  # the previous day
        ts = str(datetime2ts(ts2datetime(now_ts)))
    else:
        ts = str(datetime2ts('2013-09-02'))
    now_ts = int(ts)
    print now_ts
    sensitive_string = "sensitive_" + ts
    date_string = ts
    update_sensitive_key = "sensitive_score_" + ts  # key being updated today
    sensitive_dict_key = "sensitive_dict_" + ts
    sensitive_string_key = "sensitive_string_" + ts
    sensitive_day_change_key = "sensitive_" + ts + "_day_change"
    del_month = datetime2ts(ts2datetime(now_ts - MONTH))
    del_sensitive_key = "sensitive_score_" + str(del_month)  # key to delete (one month old)

    former_ts = int(ts) - DAY
    former_date = str(datetime2ts(ts2datetime(former_ts)))
    former_sensitive_key = "sensitive_score_" + former_date

    iter_count = 0
    bulk_action = []

    mappings(ES_SENSITIVE_INDEX)
    total_number = r_cluster.hlen(sensitive_string)
    scan_cursor = 0
    print total_number

    while 1:
        re_scan = r_cluster.hscan(sensitive_string, scan_cursor, count=1000)
        scan_cursor = re_scan[0]
        if len(re_scan[1]) != 0:
            sensitive_info = re_scan[1]  # dict form, uid: sensitive_words_dict (JSON string)
            uid_list = sensitive_info.keys()
            sensitive_results = es.mget(index=ES_SENSITIVE_INDEX,
                                        doc_type=DOCTYPE_SENSITIVE_INDEX,
                                        body={"ids": uid_list})['docs']
            if sensitive_results:
                for item in sensitive_results:
                    uid = item['_id']
                    sensitive_words_dict = json.loads(
                        sensitive_info[uid])  # json.loads
                    current_sensitive_score = 0
                    for k, v in sensitive_words_dict.iteritems():
                        tmp_stage = r_sensitive.hget("sensitive_words", k)
                        if tmp_stage:
                            tmp_stage = json.loads(tmp_stage)
                            current_sensitive_score += v * sensitive_score_dict[
                                str(tmp_stage[0])]
                    if item['found']:  # a record already exists for this uid
                        revise_item = item["_source"]
                        if del_sensitive_key in revise_item:
                            revise_item.pop(del_sensitive_key)
                        revise_item['uid'] = uid
                        # newly computed sensitivity score
                        revise_item[
                            update_sensitive_key] = current_sensitive_score
                        revise_item['last_value'] = current_sensitive_score
                        # newly computed sensitive words
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        # newly computed sensitive words string
                        revise_item[sensitive_string_key] = "&".join(
                            sensitive_words_dict.keys())
                        # change versus the previous day and the week/month averages
                        revise_item[
                            'sensitive_day_change'] = current_sensitive_score - revise_item.get(
                                former_sensitive_key, 0)
                        revise_item[
                            'sensitive_week_change'] = current_sensitive_score - revise_item.get(
                                'sensitive_week_ave', 0)
                        revise_item[
                            'sensitive_month_change'] = current_sensitive_score - revise_item.get(
                                'sensitive_month_ave', 0)
                        # recompute weekly/monthly mean, variance and sum
                        revise_item['sensitive_week_ave'], revise_item[
                            'sensitive_week_var'], revise_item[
                                'sensitive_week_sum'] = compute_week(
                                    revise_item, now_ts)
                        revise_item['sensitive_month_ave'], revise_item[
                            'sensitive_month_var'], revise_item[
                                'sensitive_month_sum'] = compute_month(
                                    revise_item, now_ts)

                    else:
                        revise_item = dict()
                        revise_item['uid'] = uid
                        revise_item[
                            update_sensitive_key] = current_sensitive_score
                        revise_item['last_value'] = current_sensitive_score
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        revise_item[sensitive_string_key] = "&".join(
                            sensitive_words_dict.keys())
                        revise_item[
                            'sensitive_day_change'] = current_sensitive_score
                        revise_item[
                            'sensitive_week_change'] = current_sensitive_score
                        revise_item[
                            'sensitive_month_change'] = current_sensitive_score
                        revise_item['sensitive_week_ave'], revise_item[
                            'sensitive_week_var'], revise_item[
                                'sensitive_week_sum'] = compute_week(
                                    revise_item, now_ts)
                        revise_item['sensitive_month_ave'], revise_item[
                            'sensitive_month_var'], revise_item[
                                'sensitive_month_sum'] = compute_month(
                                    revise_item, now_ts)
                    action = {'index': {'_id': uid}}
                    bulk_action.extend([action, revise_item])
                    iter_count += 1
                    if iter_count % 1000 == 0:
                        es.bulk(bulk_action,
                                index=ES_SENSITIVE_INDEX,
                                doc_type=DOCTYPE_SENSITIVE_INDEX)
                        bulk_action = []
                        print iter_count
        if int(scan_cursor) == 0:
            break
    if bulk_action:
        es.bulk(bulk_action,
                index=ES_SENSITIVE_INDEX,
                doc_type=DOCTYPE_SENSITIVE_INDEX)

    print iter_count

    ####### update the users not covered in the pass above
    update_scan = scan(es,
                       query={
                           "query": {
                               "filtered": {
                                   "filter": {
                                       "missing": {
                                           "field": update_sensitive_key
                                       }
                                   }
                               }
                           }
                       },
                       index=ES_SENSITIVE_INDEX,
                       doc_type=DOCTYPE_SENSITIVE_INDEX)
    iter_count = 0
    bulk_action = []

    while 1:
        try:
            tmp = update_scan.next()
            revise_item = tmp['_source']
            if del_sensitive_key in revise_item:
                revise_item.pop(del_sensitive_key)
            uid = tmp['_id']
            # 新更新的敏感度
            revise_item[update_sensitive_key] = 0
            revise_item['last_value'] = 0
            # 新更新的敏感词
            revise_item[sensitive_dict_key] = json.dumps({})
            # 新更新的string
            revise_item[sensitive_string_key] = ""
            # 当天和之前一天、一周和一月均值的差异
            revise_item['sensitive_day_change'] = 0 - revise_item.get(
                former_sensitive_key, 0)
            revise_item['sensitive_week_change'] = 0 - revise_item.get(
                'sensitive_week_ave', 0)
            revise_item['sensitive_month_change'] = 0 - revise_item.get(
                'sensitive_month_ave', 0)
            # 更新后week、month的均值和方差
            revise_item['sensitive_week_ave'], revise_item[
                'sensitive_week_var'], revise_item[
                    'sensitive_week_sum'] = compute_week(revise_item, now_ts)
            revise_item['sensitive_month_ave'], revise_item[
                'sensitive_month_var'], revise_item[
                    'sensitive_month_sum'] = compute_month(
                        revise_item, now_ts)

            action = {'index': {'_id': uid}}
            bulk_action.extend([action, revise_item])
            iter_count += 1
            if iter_count % 1000 == 0:
                es.bulk(bulk_action,
                        index=ES_SENSITIVE_INDEX,
                        doc_type=DOCTYPE_SENSITIVE_INDEX)
                bulk_action = []
        except StopIteration:
            print "all done"
            if bulk_action:
                es.bulk(bulk_action,
                        index=ES_SENSITIVE_INDEX,
                        doc_type=DOCTYPE_SENSITIVE_INDEX)
            break
        except Exception, e:
            print Exception, e
def get_flow_information(uid_list):
    results = {}      
    #results = {uid:{'hashtag_dict':{},'hashtag':'', 'keywords_dict':{}, 'keywords_string':'', 'activity_geo':'', 'activity_geo_dict':dict}}
    iter_results = {} # iter_results = {uid:{'hashtag': hashtag_dict, 'geo':geo_dict, 'keywords':keywords_dict}}
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    #test
    now_date_ts = test_ts
    for i in range(7,0,-1):
        ts = now_date_ts - DAY*i
        iter_date = ts2datetime(ts)
        flow_text_index_name = flow_text_index_name_pre + iter_date
        uid_day_geo = {}
        #compute hashtag and geo
        hashtag_results = r_cluster_3.hmget('hashtag_'+str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_'+str(ts), uid_list)
        #compute sensitive_words
        sensitive_results = r_cluster_3.hmget('sensitive_'+str(ts), uid_list)
        count = 0 
        for uid in uid_list:
            #init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {'hashtag':{}, 'geo':{},'geo_track':[],'keywords':{}, 'sensitive':{}}
            #compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[hashtag]
            #compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][sensitive_word] = uid_sensitive_dict[sensitive_word]
            #compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo = ip2city(ip)
                if geo:
                    #print 'geo:', geo
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1
        
        #compute keywords:        
        try:
            text_results = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type, \
                                               body={'query':{'filtered':{'filter':{'terms':{'uid': uid_list}}}}, 'size':MAX_VALUE}, _source=True, fields=['uid', 'keywords_dict'])['hits']['hits']
        except:
            text_results = {}
        for item in text_results:
            #print 'keywords item:', item
            uid = item['fields']['uid'][0]
            uid_keywords_dict = json.loads(item['fields']['keywords_dict'][0])
            for keywords in uid_keywords_dict:
                try:
                    iter_results[uid]['keywords'][keywords] += uid_keywords_dict[keywords]
                except:
                    iter_results[uid]['keywords'][keywords] = uid_keywords_dict[keywords]

       
    #get keywords top
    for uid in uid_list:
        results[uid] = {}
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        #sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        for item in sensitive_word_dict:
            k = item
            v = sensitive_word_dict[k]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        results[uid]['sensitive'] = sensitive_score
        #print 'sensitive_dict:', results[uid]['sensitive_dict']
        #print 'sensitive_string:', results[uid]['sensitive_string']
        #print 'sensitive:', results[uid]['sensitive']
        #geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        #print 'geo_dict_keys:', geo_dict_keys
        results[uid]['activity_geo'] = '&'.join(['&'.join(item.split('\t')) for item in geo_dict_keys])
        #print 'activity_geo:',  results[uid]['activity_geo']

        keywords_dict = iter_results[uid]['keywords']
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x:x[1], reverse=True)[:50]
        keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
        
    return results
# -*- coding:utf-8 -*-

from openpyxl import load_workbook
import redis
import json
import sys
reload(sys)
sys.path.append('../../')
from global_utils import R_ADMIN as r
#r = redis.StrictRedis(host="10.128.55.69", port="6379", db=15)
"""
data = load_workbook('sensitive_words.xlsx')
table = data.get_sheet_by_name('Sheet2')
for i in range(1,549):
    word = table.cell(row=i, column=0).value
    level = table.cell(row=i, column=1).value
    r.hset('sensitive_words',word, level)
print r.hkeys('sensitive_words')
"""
r.hset("sensitive_words", ("疫苗之殇").decode('utf-8').encode('utf-8'), 1)



def get_flow_information_v2(uid_list, all_user_keywords_dict):
    results = {}      
    #results = {uid:{'hashtag_dict':{},'hashtag':'', 'keywords_dict':{}, 'keywords_string':'', 'activity_geo':'', 'activity_geo_dict':dict, 'activity_geo_aggs':''}}
    iter_results = {} # iter_results = {uid:{'hashtag': hashtag_dict, 'geo':geo_dict, 'keywords':keywords_dict}}
    now_ts = time.time()
    #run_type
    today_sensitive_results = {}
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    for i in range(WEEK,0,-1):
        ts = now_date_ts - DAY*i
        uid_day_geo = {}
        #compute hashtag and geo
        hashtag_results = r_cluster_3.hmget('hashtag_'+str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_'+str(ts), uid_list)
        #compute sensitive_words
        sensitive_results = r_cluster_3.hmget('sensitive_'+str(ts), uid_list)
        count = 0 
        for uid in uid_list:
            #init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {'hashtag':{}, 'geo':{},'geo_track':[],'keywords':{}, 'sensitive':{}, 'school':{}}
            if uid not in today_sensitive_results:
                today_sensitive_results[uid] = {}
            #compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[hashtag]
            #compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][sensitive_word] = uid_sensitive_dict[sensitive_word]
                if ts == now_date_ts - DAY:
                    try:
                        today_sensitive_results[uid][sensitive_word] += uid_sensitive_dict[sensitive_word]
                    except:
                        today_sensitive_results[uid][sensitive_word] = uid_sensitive_dict[sensitive_word]
            #compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo, school = ip2city(ip)
                if geo:
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
                if school:
                    try:
                        iter_results[uid]['school'][school] += ip_count
                    except:
                        iter_results[uid]['school'][school] = ip_count
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1
               
    #get keywords top
    for uid in uid_list:
        results[uid] = {}
        #hashtag
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        #sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        today_sensitive_results_user = today_sensitive_results[uid]
        for sensitive_item in today_sensitive_results_user:
            k = sensitive_item
            v = today_sensitive_results_user[sensitive_item]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        results[uid]['sensitive'] = sensitive_score
        #print 'sensitive_dict:', results[uid]['sensitive_dict']
        #print 'sensitive_string:', results[uid]['sensitive_string']
        #print 'sensitive:', results[uid]['sensitive']
        #geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(['&'.join(item.split('\t')) for item in geo_dict_keys])
        try:
            results[uid]['activity_geo_aggs'] = '&'.join([item.split('\t')[-1] for item in geo_dict_keys])
        except:
            results[uid]['activity_geo_aggs'] = ''
        #keywords
        keywords_dict = all_user_keywords_dict[uid]
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x:x[1], reverse=True)[:50]
        keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
        #school dict
        school_dict = iter_results[uid]['school']
        school_string = '&'.join(school_dict.keys())
        if school_dict != {}:
            is_school = '1'
        else:
            is_school = '0'
        results[uid]['is_school'] = is_school
        results[uid]['school_string'] = school_string
        results[uid]['school_dict'] = json.dumps(school_dict)
        
    return results
Example #18
# -*- coding:utf-8 -*-

from openpyxl import load_workbook
import redis
import json
import sys

reload(sys)
sys.path.append('../../')
from global_utils import R_ADMIN as r
#r = redis.StrictRedis(host="10.128.55.69", port="6379", db=15)
data = load_workbook('sensitive_words.xlsx')
table = data.get_sheet_by_name('Sheet2')
for i in range(1, 549):
    word = table.cell(row=i, column=0).value
    level = table.cell(row=i, column=1).value
    r.hset('sensitive_words', word, level)
print r.hkeys('sensitive_words')
Example #19
def get_flow_information_v2(uid_list, all_user_keywords_dict):
    results = {}
    #results = {uid:{'hashtag_dict':{},'hashtag':'', 'keywords_dict':{}, 'keywords_string':'', 'activity_geo':'', 'activity_geo_dict':dict, 'activity_geo_aggs':''}}
    iter_results = {
    }  # iter_results = {uid:{'hashtag': hashtag_dict, 'geo':geo_dict, 'keywords':keywords_dict}}
    now_ts = time.time()
    #run_type
    today_sensitive_results = {}
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    print 'run_type:', RUN_TYPE
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        uid_day_geo = {}
        #compute hashtag and geo
        hashtag_results = r_cluster_3.hmget('hashtag_' + str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        #print 'ip_results:', ip_results
        #compute sensitive_words
        sensitive_results = r_cluster_3.hmget('sensitive_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            #init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {
                    'hashtag': {},
                    'geo': {},
                    'geo_track': [],
                    'keywords': {},
                    'sensitive': {},
                    'school': {},
                    'week_ip': {
                        0: {},
                        1: {},
                        2: {},
                        3: {},
                        4: {},
                        5: {}
                    },
                    'ip': {}
                }
            if uid not in today_sensitive_results:
                today_sensitive_results[uid] = {}
            #compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[
                        hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[
                        hashtag]
            #compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][
                        sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][
                        sensitive_word] = uid_sensitive_dict[sensitive_word]
                if ts == now_date_ts - DAY:
                    try:
                        today_sensitive_results[uid][
                            sensitive_word] += uid_sensitive_dict[
                                sensitive_word]
                    except:
                        today_sensitive_results[uid][
                            sensitive_word] = uid_sensitive_dict[
                                sensitive_word]
            #compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo, school = ip2city(ip)
                if geo:
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
                if school:
                    try:
                        iter_results[uid]['school'][school] += ip_count
                    except:
                        iter_results[uid]['school'][school] = ip_count
                #accumulate ip counts: job_ip & home_ip & active_ip
                ip_time_list = uid_ip_dict[ip].split('&')
                try:
                    iter_results[uid]['ip'][ip] += ip_count
                except:
                    iter_results[uid]['ip'][ip] = ip_count
                for ip_time_item in ip_time_list:
                    ip_timesegment = (int(ip_time_item) - ts) / IP_TIME_SEGMENT
                    try:
                        iter_results[uid]['week_ip'][ip_timesegment][ip] += 1
                    except:
                        iter_results[uid]['week_ip'][ip_timesegment][ip] = 1
                #end deal ip
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1
    #get keywords top
    for uid in uid_list:
        #print 'test iter_results_ip:', iter_results[uid]['week_ip']
        results[uid] = {}
        #hashtag
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        #sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        today_sensitive_results_user = today_sensitive_results[uid]
        for sensitive_item in today_sensitive_results_user:
            k = sensitive_item
            v = today_sensitive_results_user[sensitive_item]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        results[uid]['sensitive'] = sensitive_score
        #geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(
            ['&'.join(item.split('\t')) for item in geo_dict_keys])
        try:
            results[uid]['activity_geo_aggs'] = '&'.join(
                [item.split('\t')[-1] for item in geo_dict_keys])
        except:
            results[uid]['activity_geo_aggs'] = ''
        #keywords
        keywords_dict = all_user_keywords_dict[uid]
        keywords_top50 = sorted(keywords_dict.items(),
                                key=lambda x: x[1],
                                reverse=True)[:50]
        keywords_top50_string = '&'.join(
            [keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
        #school dict
        school_dict = iter_results[uid]['school']
        school_string = '&'.join(school_dict.keys())
        if school_dict != {}:
            is_school = '1'
        else:
            is_school = '0'
        results[uid]['is_school'] = is_school
        results[uid]['school_string'] = school_string
        results[uid]['school_dict'] = json.dumps(school_dict)
        #ip: job_ip&home_ip&activity_ip
        #activity_ip
        all_ip_dict = iter_results[uid]['ip']
        sort_all_ip = sorted(all_ip_dict.items(),
                             key=lambda x: x[1],
                             reverse=True)
        try:
            activity_ip = sort_all_ip[0][0]
        except:
            activity_ip = ''
        results[uid]['activity_ip'] = str(activity_ip)
        #job_ip & home_ip
        week_time_ip_dict = iter_results[uid]['week_ip']
        for i in range(0, 6):
            if i not in week_time_ip_dict:
                week_time_ip_dict[i] = {}
        home_ip, job_ip = get_ip_description(week_time_ip_dict)
        results[uid]['home_ip'] = str(home_ip)
        results[uid]['job_ip'] = str(job_ip)

    return results
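For illustration, the geo string fields above just flatten tab-separated location keys; a small sketch with hypothetical keys (the real keys come from ip2city):

geo_dict_keys = [u'中国\t北京', u'中国\t广东\t深圳']   # hypothetical ip2city results
activity_geo = '&'.join(['&'.join(item.split('\t')) for item in geo_dict_keys])
activity_geo_aggs = '&'.join([item.split('\t')[-1] for item in geo_dict_keys])
print activity_geo        # 中国&北京&中国&广东&深圳
print activity_geo_aggs   # 北京&深圳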
def get_flow_information(uid_list):
    # data for the previous seven days; not suitable for daily updates
    lenth = len(uid_list)
    results = {}
    iter_results = {}
    result_dict = {}
    if RUN_TYPE:
        now_ts = time.time()
        now_date = ts2datetime(now_ts)  # date: 2013-09-01
    else:
        now_date = "2013-09-08"
    ts = datetime2ts(now_date)

    start_ts = ts - 8 * 3600 * 24
    for i in range(1, 8):
        ts = start_ts + i * 3600 * 24
        date = ts2datetime(ts)
        print "date:", date
        uid_day_geo = {}
        sensitive_uid_day_geo = {}
        flow_index_name = flow_text_index_name_pre + str(date)
        # hashtag
        print uid_list
        hashtag_results = redis_cluster.hmget("hashtag_" + str(ts), uid_list)
        sensitive_hashtag = redis_cluster.hmget("sensitive_hashtag_" + str(ts), uid_list)
        # sensitive_words
        sensitive_results = redis_cluster.hmget("sensitive_" + str(ts), uid_list)
        # ip
        if WORK_TYPE == 0:
            ip_index_name = ip_index_pre + str(date)
            sensitive_ip_index_name = sen_ip_index_pre + str(date)
            # activity_index_name = act_index_pre + str(date)
            # sensitive_activity_index_name = sen_act_index_pre + str(date)
            exist_bool = es_cluster.indices.exists(index=ip_index_name)
            sensitive_exist_bool = es_cluster.indices.exists(index=sensitive_ip_index_name)
            # activity_exist_bool = es_cluster.indices.exists(index=activity_index_name)
            # sensitive_activity_exist_bool = es_cluster.indices.exists(index=sensitive_activity_index_name)
            if exist_bool:
                ip_results = es_cluster.mget(index=ip_index_name, doc_type="ip", body={"ids": uid_list})["docs"]
            else:
                ip_results = [dict()] * lenth
            if sensitive_exist_bool:
                sensitive_ip_results = es_cluster.mget(
                    index=sensitive_ip_index_name, doc_type="sensitive_ip", body={"ids": uid_list}
                )["docs"]
            else:
                sensitive_ip_results = [dict()] * lenth
            """
            if activity_exist_bool:
                activity_results = es_cluster.mget(index=activity_index_name, doc_type="activity", body={"ids":uid_list})["docs"]
            else:
                activity_results = [dict()]*lenth
            if sensitive_activity_exist_bool:
                sensitive_activity_results = es_cluster.mget(index=sensitive_activity_index_name, doc_type="sensitive_activity", body={"ids":uid_list})["docs"]
            else:
                sensitive_activity_results = [dict()]*lenth
            """
        else:
            ip_results = redis_ip.hmget("ip_" + str(ts), uid_list)
            sensitive_ip_results = redis_ip.hmget("sensitive_ip_" + str(ts), uid_list)
            # activity_results = redis_activity.hmget('activity_'+str(date), uid_list)
            # sensitive_activity_results = redis_activity.hmget('sensitive_activity_'+str(date), uid_list)

        for j in range(0, len(uid_list)):
            uid = uid_list[j]
            if uid not in iter_results:
                iter_results[uid] = {
                    "hashtag": {},
                    "sensitive_hashtag": {},
                    "geo": {},
                    "sensitive_geo": {},
                    "geo_track": [],
                    "keywords": {},
                    "sensitive_words": {},
                    "sensitive_geo_track": [],
                    "ip": [],
                    "sensitive_ip": [],
                }
            # sensitive words
            if sensitive_results[j]:
                sensitive_words_results = json.loads(sensitive_results[j])
                for sensitive_word in sensitive_words_results:
                    try:
                        iter_results[uid]["sensitive_words"][sensitive_word] += sensitive_words_results[sensitive_word]
                    except:
                        iter_results[uid]["sensitive_words"][sensitive_word] = sensitive_words_results[sensitive_word]
            # print "sensitive_words:", iter_results[uid]["sensitive_words"]

            if hashtag_results[j]:
                hashtag_dict = json.loads(hashtag_results[j])
                for hashtag in hashtag_dict:
                    try:
                        iter_results[uid]["hashtag"][hashtag] += hashtag_dict[hashtag]
                    except:
                        iter_results[uid]["hashtag"][hashtag] = hashtag_dict[hashtag]
            # print "hashtag: ", iter_results[uid]['hashtag']

            if sensitive_hashtag[j]:
                sensitive_hashtag_dict = json.loads(sensitive_hashtag[j])
                for hashtag in sensitive_hashtag_dict:
                    try:
                        iter_results[uid]["sensitive_hashtag"][hashtag] += sensitive_hashtag_dict[hashtag]
                    except:
                        iter_results[uid]["sensitive_hashtag"][hashtag] = sensitive_hashtag_dict[hashtag]
            # print "sensitive_hashtag:", iter_results[uid]['sensitive_hashtag']

            uid_day_geo[uid] = {}
            sensitive_uid_day_geo[uid] = {}
            if WORK_TYPE == 0:  # es
                if ip_results[j]:
                    if ip_results[j]["found"]:
                        detail_item = ip_results[j]["_source"]
                        ip_dict = json.loads(detail_item["ip_dict"])
                    else:
                        ip_dict = {}
                else:
                    ip_dict = {}
            else:
                if ip_results[j]:
                    ip_dict = json.loads(ip_results[j])
                else:
                    ip_dict = {}
            if ip_dict:
                # iter_results[uid]['ip'].append(ip_dict)
                geo_dict = ip2geo(ip_dict)
                for geo, count in geo_dict.iteritems():
                    try:
                        iter_results[uid]["geo"][geo] += count
                    except:
                        iter_results[uid]["geo"][geo] = count
                    try:
                        uid_day_geo[uid][geo] += count
                    except:
                        uid_day_geo[uid][geo] = count
            # iter_results[uid]['ip'].append(ip_dict)
            iter_results[uid]["geo_track"].append(uid_day_geo[uid])
            # print "ip:", iter_results[uid]['ip'], iter_results[uid]['geo_track']

            if WORK_TYPE == 0:
                if sensitive_ip_results[j]:
                    if sensitive_ip_results[j]["found"]:
                        detail_item = sensitive_ip_results[j]["_source"]
                        sensitive_ip_dict = json.loads(detail_item["sensitive_ip_dict"])
                    else:
                        sensitive_ip_dict = dict()
                else:
                    sensitive_ip_dict = dict()
            else:
                if sensitive_ip_results[j]:
                    sensitive_ip_dict = json.loads(sensitive_ip_results[j])
                else:
                    sensitive_ip_dict = dict()
            if sensitive_ip_dict:
                sensitive_geo_dict = ip2geo(sensitive_ip_dict)
                # iter_results[uid]['sensitive_ip'].append(sensitive_ip_dict)
                for geo, count in sensitive_geo_dict.iteritems():
                    try:
                        iter_results[uid]["sensitive_geo"][geo] += count
                    except:
                        iter_results[uid]["sensitive_geo"][geo] = count
                    try:
                        sensitive_uid_day_geo[uid][geo] += count
                    except:
                        sensitive_uid_day_geo[uid][geo] = count
            # iter_results[uid]['sensitive_ip'].append(sensitive_ip_dict)
            iter_results[uid]["sensitive_geo_track"].append(sensitive_uid_day_geo[uid])
            # print "sensitive_ip:", iter_results[uid]['sensitive_ip'], iter_results[uid]['sensitive_geo_track']

        # compute keywords
        flow_text_exist = es_flow_text.indices.exists(index=flow_index_name)
        if flow_text_exist:
            text_results = es_flow_text.search(
                index=flow_index_name,
                doc_type=flow_text_index_type,
                body={"query": {"filtered": {"filter": {"terms": {"uid": uid_list}}}}, "size": MAX_VALUE},
                _source=False,
                fields=["uid", "keywords_dict"],
            )["hits"]["hits"]
        else:
            text_results = {}
        for item in text_results:
            uid = item["fields"]["uid"][0]
            uid_keywords_dict = json.loads(item["fields"]["keywords_dict"][0])
            for keywords in uid_keywords_dict:
                try:
                    iter_results[uid]["keywords"][keywords] += uid_keywords_dict[keywords]
                except:
                    iter_results[uid]["keywords"][keywords] = uid_keywords_dict[keywords]
        # print "keywords:", iter_results[uid]['keywords']

    for uid in uid_list:
        results[uid] = {}
        # hashtag
        hashtag_dict = iter_results[uid]["hashtag"]
        results[uid]["hashtag_dict"] = json.dumps(hashtag_dict)
        results[uid]["hashtag_string"] = "&".join(hashtag_dict.keys())
        # sensitive hashtag
        sensitive_hashtag_dict = iter_results[uid]["sensitive_hashtag"]
        results[uid]["sensitive_hashtag_dict"] = json.dumps(sensitive_hashtag_dict)
        results[uid]["sensitive_hashtag_string"] = "&".join(sensitive_hashtag_dict.keys())
        # sensitive_words
        sensitive_word_dict = iter_results[uid]["sensitive_words"]
        results[uid]["sensitive_words_dict"] = json.dumps(sensitive_word_dict)
        results[uid]["sensitive_words_string"] = "&".join(sensitive_word_dict.keys())
        sensitive_score = 0
        for k, v in sensitive_word_dict.iteritems():
            tmp = r_sensitive.hget("sensitive_words", k)
            if tmp:
                tmp_stage = json.loads(tmp)
                sensitive_score += sensitive_score_dict[str(tmp_stage[0])] * v
        results[uid]["sensitive"] = sensitive_score
        # geo
        geo_dict = iter_results[uid]["geo"]
        geo_track_list = iter_results[uid]["geo_track"]
        results[uid]["activity_geo_dict"] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]["activity_geo"] = "&".join(["&".join(item.split("\t")) for item in geo_dict_keys])
        results[uid]["activity_geo_aggs"] = "&".join([item.split("\t")[-1] for item in geo_dict_keys])
        sensitive_geo_dict = iter_results[uid]["sensitive_geo"]
        sensitive_geo_track_list = iter_results[uid]["sensitive_geo_track"]
        results[uid]["sensitive_activity_geo_dict"] = json.dumps(sensitive_geo_track_list)
        sensitive_geo_dict_keys = sensitive_geo_dict.keys()
        results[uid]["sensitive_activity_geo"] = "&".join(
            ["&".join(item.split("\t")) for item in sensitive_geo_dict_keys]
        )
        results[uid]["sensitive_activity_geo_aggs"] = "&".join(
            [item.split("\t")[-1] for item in sensitive_geo_dict_keys]
        )

        keywords_dict = iter_results[uid]["keywords"]
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50]
        keywords_top50_string = "&".join([keyword_item[0] for keyword_item in keywords_top50])
        results[uid]["keywords_dict"] = json.dumps(keywords_top50)
        results[uid]["keywords_string"] = keywords_top50_string

    return results
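A hedged call sketch; the uid below is a placeholder and the printed fields are just a few of the keys built above:

uid_list = ['1234567890']   # hypothetical uid
profiles = get_flow_information(uid_list)
for uid, profile in profiles.iteritems():
    print uid, profile['sensitive'], profile['keywords_string'], profile['activity_geo']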
Example #21
def get_flow_information_v2(uid_list, all_user_keywords_dict):
    results = {}
    #results = {uid:{'hashtag_dict':{},'hashtag':'', 'keywords_dict':{}, 'keywords_string':'', 'activity_geo':'', 'activity_geo_dict':dict, 'activity_geo_aggs':''}}
    iter_results = {}  # iter_results = {uid: {'hashtag': hashtag_dict, 'geo': geo_dict, 'keywords': keywords_dict}}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        print ts
        uid_day_geo = {}
        #compute hashtag and geo
        hashtag_results = r_cluster.hmget('hashtag_' + str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        #compute sensitive_words
        sensitive_results = r_cluster.hmget('sensitive_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            #init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {
                    'hashtag': {},
                    'geo': {},
                    'geo_track': [],
                    'keywords': {},
                    'sensitive': {}
                }
            #compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[
                        hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[
                        hashtag]
            #compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][
                        sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][
                        sensitive_word] = uid_sensitive_dict[sensitive_word]
            #compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo = ip2city(ip)
                if geo:
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1

    #get keywords top
    for uid in uid_list:
        results[uid] = {}
        #hashtag
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        #sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        for sensitive_item in sensitive_word_dict:
            k = sensitive_item
            v = sensitive_word_dict[sensitive_item]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        results[uid]['sensitive'] = sensitive_score
        #print 'sensitive_dict:', results[uid]['sensitive_dict']
        #print 'sensitive_string:', results[uid]['sensitive_string']
        #print 'sensitive:', results[uid]['sensitive']
        #geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(
            ['&'.join(item.split('\t')) for item in geo_dict_keys])
        try:
            results[uid]['activity_geo_aggs'] = '&'.join(
                [item.split('\t')[-1] for item in geo_dict_keys])
        except:
            results[uid]['activity_geo_aggs'] = ''

        keywords_dict = all_user_keywords_dict[uid]
        keywords_top50 = sorted(keywords_dict.items(),
                                key=lambda x: x[1],
                                reverse=True)[:50]
        keywords_top50_string = '&'.join(
            [keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string

    return results
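The try/except increments above are a recurring counting idiom; an equivalent sketch with collections.defaultdict, shown only as an alternative (the per-day dicts are hypothetical):

from collections import defaultdict

hashtag_counter = defaultdict(int)
for day_dict in ({u'tag1': 2}, {u'tag1': 1, u'tag2': 3}):   # hypothetical per-day hashtag dicts
    for tag, num in day_dict.iteritems():
        hashtag_counter[tag] += num
print dict(hashtag_counter)   # {u'tag1': 3, u'tag2': 3}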
Example #22
        'query':{
            'term':{'uid':uid}
        },
        'size':50
    }
    index_name = flow_text_index_name_pre + ts2datetime(timestamp)
    try:
        search_results = es_xnr.search(index=index_name, doc_type=flow_text_index_type, body=query_body)['hits']['hits']
    except Exception, e:
        search_results = []
    for result in search_results:
        text = result['_source']['text'].encode('utf-8')
        node = createWordTree()
        sensitive_words_dict = searchWord(text, node)
        if sensitive_words_dict:
            sensitive_words_list = []
            for k,v in sensitive_words_dict.iteritems():
                tmp_stage = r_sensitive.hget("sensitive_words", k)
                if tmp_stage:
                    score += v*sensitive_score_dict[str(tmp_stage)]
    return score
    
if __name__ == '__main__':
    # '2017-10-15'
    # print get_sensitive_user(timestamp=1507996800, uid='100003271864059')
    print get_sensitive_info(timestamp=1507996800,mid='123124323',text=u"64和达赖太阳花")
    print get_sensitive_info(timestamp=1507996800,mid='123124323')
    print get_sensitive_info(timestamp=1507996800,text=u"64和达赖太阳花")
    print get_sensitive_info(timestamp=1507996800,)
def main():
    if RUN_TYPE:
        now_ts = time.time()-DAY # previous day
        ts = str(datetime2ts(ts2datetime(now_ts)))
    else:
        ts = str(datetime2ts('2013-09-02'))
    now_ts = int(ts)
    print now_ts
    sensitive_string = "sensitive_" + ts
    date_string = ts
    update_sensitive_key = "sensitive_score_" + ts # key being updated (today's score)
    sensitive_dict_key = "sensitive_dict_" + ts
    sensitive_string_key = "sensitive_string_" + ts
    sensitive_day_change_key = "sensitive_" + ts +"_day_change"
    del_month = datetime2ts(ts2datetime(now_ts - MONTH))
    del_sensitive_key = "sensitive_score_"+str(del_month) # key to delete (one month old)

    former_ts = int(ts) - DAY
    former_date = str(datetime2ts(ts2datetime(former_ts)))
    former_sensitive_key = "sensitive_score_" + former_date

    iter_count = 0
    bulk_action = []

    mappings(ES_SENSITIVE_INDEX)
    total_number = r_cluster.hlen(sensitive_string)
    scan_cursor = 0
    print total_number

    # scan the day's "sensitive_<ts>" hash in batches of 1000 and upsert each user's score into ES
    while 1:
        re_scan = r_cluster.hscan(sensitive_string, scan_cursor, count=1000)
        scan_cursor = re_scan[0]
        if len(re_scan[1]) != 0:
            sensitive_info = re_scan[1] # dict of uid -> sensitive_words_dict (JSON string)
            uid_list = sensitive_info.keys()
            sensitive_results = es.mget(index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX, body={"ids":uid_list})['docs']
            if sensitive_results:
                for item in sensitive_results:
                    uid = item['_id']
                    sensitive_words_dict = json.loads(sensitive_info[uid]) # JSON string of {word: count}
                    current_sensitive_score = 0
                    for k,v in sensitive_words_dict.iteritems():
                        tmp_stage = r_sensitive.hget("sensitive_words", k)
                        if tmp_stage:
                            tmp_stage = json.loads(tmp_stage)
                            current_sensitive_score += v*sensitive_score_dict[str(tmp_stage[0])]
                    if item['found']: # the user already has a document
                        revise_item = item["_source"]
                        if del_sensitive_key in revise_item:
                            revise_item.pop(del_sensitive_key)
                        revise_item['uid'] = uid
                        # newly updated sensitivity score
                        revise_item[update_sensitive_key] = current_sensitive_score
                        revise_item['last_value'] = current_sensitive_score
                        # newly updated sensitive words
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        # newly updated word string
                        revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys())
                        # difference between today and the previous day, the weekly average and the monthly average
                        revise_item['sensitive_day_change'] = current_sensitive_score - revise_item.get(former_sensitive_key, 0)
                        revise_item['sensitive_week_change'] = current_sensitive_score - revise_item.get('sensitive_week_ave', 0)
                        revise_item['sensitive_month_change'] = current_sensitive_score - revise_item.get('sensitive_month_ave', 0)
                        # updated weekly/monthly mean, variance and sum
                        revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
                        revise_item['sensitive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)

                    else:
                        revise_item = dict()
                        revise_item['uid'] = uid
                        revise_item[update_sensitive_key] = current_sensitive_score
                        revise_item['last_value'] = current_sensitive_score
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys())
                        revise_item['sensitive_day_change'] = current_sensitive_score
                        revise_item['sensitive_week_change'] = current_sensitive_score
                        revise_item['sensitive_month_change'] = current_sensitive_score
                        revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
                        revise_item['sensitive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)
                    action = {'index':{'_id': uid}}
                    bulk_action.extend([action, revise_item])
                    iter_count += 1
                    if iter_count % 1000 == 0:
                        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
                        bulk_action = []
                        print iter_count
        if int(scan_cursor) == 0:
            break
    if bulk_action:
        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)

    print iter_count


    ####### update users not yet touched in this run (missing today's score field)
    update_scan = scan(es, query={"query":{"filtered":{"filter":{"missing":{"field":update_sensitive_key}}}}}, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
    iter_count = 0
    bulk_action = []

    while 1:
        try:
            tmp = update_scan.next()
            revise_item = tmp['_source']
            if del_sensitive_key in revise_item:
                revise_item.pop(del_sensitive_key)
            uid = tmp['_id']
            # newly updated sensitivity score
            revise_item[update_sensitive_key] = 0
            revise_item['last_value'] = 0
            # newly updated sensitive words
            revise_item[sensitive_dict_key] = json.dumps({})
            # newly updated word string
            revise_item[sensitive_string_key] = ""
            # difference between today and the previous day, the weekly average and the monthly average
            revise_item['sensitive_day_change'] = 0 - revise_item.get(former_sensitive_key, 0)
            revise_item['sensitive_week_change'] = 0 - revise_item.get('sensitive_week_ave', 0)
            revise_item['sensitive_month_change'] = 0 - revise_item.get('sensitive_month_ave', 0)
            # updated weekly/monthly mean, variance and sum
            revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
            revise_item['sensitive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)

            action = {'index':{'_id': uid}}
            bulk_action.extend([action, revise_item])
            iter_count += 1
            if iter_count % 1000 == 0:
                es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
                bulk_action = []
        except StopIteration:
            print "all done"
            if bulk_action:
                es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
            break
        except Exception, r:
            print Exception, r