Example #1
def update_day_sensitive(uid_list):
    results = {}
    count = 0
    for uid in uid_list:
        # result keys unified with the loop below (the original mixed
        # sensitive_string / sensitive_words_string)
        results[uid] = {"sensitive": 0, "sensitive_words_string": "", "sensitive_words_dict": json.dumps({})}
    now_ts = time.time()
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts('2013-09-02')
    sensitive_results = redis_cluster.hmget("sensitive_" + str(now_date_ts), uid_list)
    for item in sensitive_results:
        if not item:
            count += 1
            continue
        uid = uid_list[count]
        item = json.loads(item)
        sensitive_index = 0
        for word, word_count in item.iteritems():  # renamed from `count`, which shadowed the uid index above
            tmp_stage = r_sensitive.hget("sensitive_words", word)
            if tmp_stage:
                tmp = json.loads(tmp_stage)
                sensitive_index += sensitive_score_dict[str(tmp[0])] * word_count
        sensitive_words_string = "&".join(item.keys())
        results[uid] = {"sensitive": sensitive_index, "sensitive_words_string": sensitive_words_string, "sensitive_words_dict": json.dumps(item)}
        count += 1

    return results
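All of these examples score a user the same way: each sensitive word's count is multiplied by the weight of its stage, where the per-day Redis hash sensitive_<date_ts> maps uid to a JSON dict of word counts and the sensitive_words hash maps each word to a JSON list whose first element is the stage. A minimal self-contained sketch of that rule, with plain dicts standing in for the two Redis hashes (all names and values here are illustrative, not the project's data):

import json

# stand-ins for the two Redis hashes (illustrative data only)
day_hash = {'uid001': json.dumps({'wordA': 3, 'wordB': 1})}        # sensitive_<date_ts>
word_hash = {'wordA': json.dumps([1]), 'wordB': json.dumps([3])}   # sensitive_words
sensitive_score_dict = {'1': 1, '2': 2, '3': 4}                    # assumed stage -> weight table

def score_uid(uid):
    raw = day_hash.get(uid)
    if not raw:
        return 0
    score = 0
    for word, word_count in json.loads(raw).items():
        stage_info = word_hash.get(word)
        if stage_info:
            stage = json.loads(stage_info)[0]
            score += sensitive_score_dict[str(stage)] * word_count
    return score

print score_uid('uid001')  # 3 * 1 + 1 * 4 = 7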
Example #2
def get_sensitive_user(timestamp, uid):
    score = 0
    query_body = {'query': {'term': {'uid': uid}}, 'size': 50}
    index_name = flow_text_index_name_pre + ts2datetime(timestamp)
    search_results = es_flow_text.search(index=index_name,
                                         doc_type=flow_text_index_type,
                                         body=query_body)['hits']['hits']
    node = createWordTree()  # build the word tree once, not per hit
    for result in search_results:
        text = result['_source']['text'].encode('utf-8')
        sensitive_words_dict = searchWord(text, node)
        if sensitive_words_dict:
            for k, v in sensitive_words_dict.iteritems():
                tmp_stage = r_sensitive.hget("sensitive_words", k)
                if tmp_stage:
                    score += v * sensitive_score_dict[str(tmp_stage)]
    return score
Example #4
def sensitive_process(text, timestamp, uid_list):
    # `uid_list` is added as a parameter: the original snippet referenced it
    # without defining it, so it presumably came from the enclosing scope.

    ## user sensitivity (人物敏感度)
    results = {}
    iter_results = {}  # iter_results = {uid: {'sensitive': {word: count}}}
    now_ts = time.time()
    today_sensitive_results = {}
    if S_TYPE != 'test':
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts(S_DATE)

    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        sensitive_results = r_cluster_3.hmget('sensitive_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            if uid not in today_sensitive_results:
                today_sensitive_results[uid] = {}
            if uid not in iter_results:
                iter_results[uid] = {'sensitive': {}}

            #compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][sensitive_word] = uid_sensitive_dict[sensitive_word]
                if ts == now_date_ts - DAY:
                    try:
                        today_sensitive_results[uid][sensitive_word] += uid_sensitive_dict[sensitive_word]
                    except:
                        today_sensitive_results[uid][sensitive_word] = uid_sensitive_dict[sensitive_word]
            count += 1  # the original never advanced this index

        for uid in uid_list:
            results[uid] = {}

    ## message sensitivity (信息敏感度)
    item = {}        # assumed: the message record being annotated (undefined in the original)
    index_body = {}  # assumed: the ES document body (undefined in the original)
    sensitive_words_dict = searchWord(text.encode('utf-8', 'ignore'), DFA)
    if sensitive_words_dict:
        item['sensitive_words_string'] = "&".join(sensitive_words_dict.keys())
        item['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
    else:
        item['sensitive_words_string'] = ""
        item['sensitive_words_dict'] = json.dumps({})

    sensitive_words_dict = json.loads(item['sensitive_words_dict'])
    if sensitive_words_dict:
        score = 0
        for k, v in sensitive_words_dict.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", k)
            if tmp_stage:
                score += v * sensitive_score_dict[str(tmp_stage)]
        index_body['sensitive'] = score
Example #5
def expand_index_action(item):
    index_body = {}
    index_body['uid'] = str(item['uid'])
    index_body['text'] = item['text']
    index_body['fid'] = str(item['fid'])
    index_body['sentiment'] = str(item['sentiment'])
    index_body['timestamp'] = int(item['timestamp'])
    #index_body['message_type'] = item['message_type']
    index_body['keywords_dict'] = item['keywords_dict']
    index_body['keywords_string'] = item['keywords_string']
    index_body['sensitive_words_string'] = item['sensitive_words_string']
    index_body['sensitive_words_dict'] = item['sensitive_words_dict']
    sensitive_words_dict = json.loads(item['sensitive_words_dict'])
    score = 0
    if sensitive_words_dict:
        #score = 0
        for k, v in sensitive_words_dict.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", k)
            if tmp_stage:
                score += v * sensitive_score_dict[str(tmp_stage)]
        #index_body['sensitive'] = score
    index_body['sensitive'] = score
    #print 'sensitive...index body...',index_body['sensitive']
    #if item['message_type'] == 3:

    #for retweet message: get directed retweet uname and uid
    # directed_uid, directed_uname = get_directed_retweet(item['text'], item['root_uid'])
    directed_uid, directed_uname = get_root_retweet(item['text'], item['uid'])
    if directed_uid:
        index_body['directed_uid'] = long(directed_uid)
    else:
        #index_body['directed_uid'] = directed_uid
        index_body['directed_uid'] = 0

    index_body['directed_uname'] = directed_uname
    #index_body['root_fid'] = str(item['fid'])
    #index_body['root_uid'] = str(item['uid'])
    # elif item['message_type'] == 2:
    #     #for comment message: get directed comment uname and uid
    #     directed_uid, directed_uname = get_directed_comment(item['text'], item['root_uid'])
    #     if directed_uid:
    #         index_body['directed_uid'] = int(directed_uid)
    #     else:
    #         #index_body['directed_uid'] = directed_uid
    #         index_body['directed_uid'] = 0
    #     index_body['directed_uname'] = directed_uname
    #     index_body['root_fid'] = str(item['root_fid'])
    #     index_body['root_uid'] = str(item['root_uid'])

    # ip = item['send_ip']
    # index_body['ip'] = ip
    # index_body['geo'] = ip2city(ip) #output, e.g. 中国&河北&石家庄 (China&Hebei&Shijiazhuang)

    action = {'index': {'_id': index_body['fid']}}
    xdata = index_body
    #print 'index_body...',index_body
    return action, xdata
Example #6
def get_sensitive_user(uid):
    sensitive_score = 0  # the original used this (and an undefined `v`) before assignment
    try:
        tmp_stage = r_sensitive.hget('sensitive_words', uid)
        if tmp_stage:
            sensitive_score += sensitive_score_dict[str(tmp_stage)]
    except:
        sensitive_score = 0

    return sensitive_score
Example #7
def expand_index_action(item):
    index_body = {}
    index_body['uid'] = str(item['uid'])
    index_body['user_fansnum'] = int(item.get('user_fansnum', 0))
    index_body['text'] = item['text']
    index_body['mid'] = str(item['mid'])
    index_body['sentiment'] = str(item['sentiment'])
    index_body['timestamp'] = int(item['timestamp'])
    index_body['message_type'] = item['message_type']
    index_body['keywords_dict'] = item['keywords_dict']
    index_body['keywords_string'] = item['keywords_string']
    index_body['sensitive_words_string'] = item['sensitive_words_string']
    index_body['sensitive_words_dict'] = item['sensitive_words_dict']
    index_body['retweeted'] = 0
    index_body['comment'] = 0
    index_body['sensitive'] = 0
    sensitive_words_dict = json.loads(item['sensitive_words_dict'])
    if sensitive_words_dict:
        score = 0
        for k, v in sensitive_words_dict.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", k)
            if tmp_stage:
                score += v * sensitive_score_dict[str(tmp_stage)]
        index_body['sensitive'] = score
    if item['message_type'] == 3:
        #for retweet message: get directed retweet uname and uid
        directed_uid, directed_uname = get_directed_retweet(
            item['text'], item['root_uid'])
        if directed_uid:
            index_body['directed_uid'] = int(directed_uid)
        else:
            #index_body['directed_uid'] = directed_uid
            index_body['directed_uid'] = 0
        index_body['directed_uname'] = directed_uname
        index_body['root_mid'] = str(item['root_mid'])
        index_body['root_uid'] = str(item['root_uid'])
    elif item['message_type'] == 2:
        #for comment message: get directed comment uname and uid
        directed_uid, directed_uname = get_directed_comment(
            item['text'], item['root_uid'])
        if directed_uid:
            index_body['directed_uid'] = int(directed_uid)
        else:
            #index_body['directed_uid'] = directed_uid
            index_body['directed_uid'] = 0
        index_body['directed_uname'] = directed_uname
        index_body['root_mid'] = str(item['root_mid'])
        index_body['root_uid'] = str(item['root_uid'])

    ip = item['send_ip']
    index_body['ip'] = ip
    index_body['geo'] = ip2city(ip)  #output, e.g. 中国&河北&石家庄 (China&Hebei&Shijiazhuang)

    action = {'index': {'_id': index_body['mid']}}
    xdata = index_body
    return action, xdata
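The (action, xdata) pair returned above is shaped for the old elasticsearch-py bulk API, where action lines and documents alternate in a single list; main() further below flushes its pairs the same way. A hedged usage sketch (es, items, index_name and index_type are assumed from the surrounding project, not defined here):

bulk_action = []
for weibo_item in items:
    action, xdata = expand_index_action(weibo_item)
    bulk_action.extend([action, xdata])   # action line, then its document
    if len(bulk_action) >= 2000:          # flush every 1000 documents
        es.bulk(bulk_action, index=index_name, doc_type=index_type)
        bulk_action = []
if bulk_action:                           # flush the remainder
    es.bulk(bulk_action, index=index_name, doc_type=index_type)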
Example #8
def compute_sensitive(text):
    score = 0
    node = createWordTree()
    sensitive_words_dict = searchWord(text.encode('utf-8'), node)
    if sensitive_words_dict:
        for k, v in sensitive_words_dict.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", k)
            if tmp_stage:
                score += v * sensitive_score_dict[str(tmp_stage)]
    return score
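compute_sensitive rebuilds the word tree on every call; when scoring many texts it is cheaper to build it once and reuse it, as the fixed loops in Examples #2 and #15 do. For readers without the project's createWordTree/searchWord, a naive stand-in that produces the same {word: count} shape (illustrative only, not the project's DFA):

def search_word_naive(text, words):
    # count occurrences of each sensitive word; the real searchWord
    # walks a DFA word tree instead of scanning per word
    hits = {}
    for w in words:
        n = text.count(w)
        if n:
            hits[w] = n
    return hits

print search_word_naive('aXbXaX', ['aX', 'bX'])  # {'aX': 2, 'bX': 1}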
Example #9
def update_day_sensitive(uid_list):
    results = {}
    all_results = {}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    today_sensitive_dict = {}
    for i in range(WEEK,0,-1):
        ts = now_date_ts - DAY*i
        count = 0
        sensitive_results = r_cluster_3.hmget('sensitive_'+str(ts), uid_list)
        #print 'sensitive_results:', sensitive_results
        for uid in uid_list:
            if uid not in results:
                results[uid] = {}
            sensitive_item = sensitive_results[count]
            if uid not in today_sensitive_dict:
                today_sensitive_dict[uid] = {}
            if sensitive_item:
                sensitive_dict = json.loads(sensitive_item)
            else:
                sensitive_dict = {}
            for sensitive in sensitive_dict:
                try:
                    results[uid][sensitive] += 1
                except:
                    results[uid][sensitive] = 1
                if ts == now_date_ts - DAY:
                    try:
                        today_sensitive_dict[uid][sensitive] += 1
                    except:
                        today_sensitive_dict[uid][sensitive] = 1
            count += 1
    #print 'results:', results
    for uid in uid_list:
        user_sensitive_dict = results[uid]
        #print 'uid,sensitive:', uid, user_sensitive_dict
        sensitive_score = 0
        today_sensitive_dict_user = today_sensitive_dict[uid]
        for item in today_sensitive_dict_user:
            k = item
            v = today_sensitive_dict_user[k]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
            #print 'sensitive_score:', sensitive_score
        sensitive_string = '&'.join(user_sensitive_dict.keys())
        #print 'uid, sensitive:', uid, sensitive_string, sensitive_score
        all_results[uid] = {'sensitive_string': sensitive_string, 'sensitive_dict':json.dumps(user_sensitive_dict),\
                'sensitive': sensitive_score}
    #print 'all_results:', all_results
    return all_results
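The try/except pattern used here to accumulate per-word counts is the Python 2 idiom these projects favor; collections.Counter expresses the same thing without the exception round-trip. A small equivalent sketch:

from collections import Counter

day_counts = Counter()
for word in ['a', 'b', 'a']:
    day_counts[word] += 1   # missing keys default to 0, no try/except needed
print dict(day_counts)      # e.g. {'a': 2, 'b': 1}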
Example #12
def update_day_sensitive(uid_list):
    results = {}
    for uid in uid_list:
        # result keys unified with the loop below (the original mixed
        # sensitive_string / sensitive_words_string)
        results[uid] = {
            "sensitive": 0,
            "sensitive_words_string": "",
            "sensitive_words_dict": json.dumps({})
        }
    now_ts = time.time()
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts('2013-09-03')
    sensitive_results = redis_cluster.hmget("sensitive_" + str(now_date_ts),
                                            uid_list)
    # hmget returns one JSON string (or None) per uid, in uid_list order,
    # so pair entries with zip instead of treating each one as a dict
    for uid, item in zip(uid_list, sensitive_results):
        if not item:
            continue
        words_dict = json.loads(item)
        sensitive_index = 0
        for word, word_count in words_dict.iteritems():  # was the typo iter_items()
            tmp_stage = r_sensitive.hget("sensitive_words", word)
            if tmp_stage:
                tmp = json.loads(tmp_stage)
                sensitive_index += sensitive_score_dict[str(tmp[0])] * word_count
        results[uid] = {
            'sensitive': sensitive_index,
            "sensitive_words_string": "&".join(words_dict.keys()),
            "sensitive_words_dict": json.dumps(words_dict)
        }

    return results
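The fix above relies on how redis-py's hmget behaves: it returns one value per requested field, in request order, with None for missing fields, so zip(uid_list, sensitive_results) pairs each uid with its JSON blob. A quick illustration (assumes a local Redis server; key and values are throwaway):

import redis

r = redis.StrictRedis()
r.hset('demo_hash', 'u1', '{"w": 2}')
print r.hmget('demo_hash', ['u1', 'u2'])   # ['{"w": 2}', None]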
Example #13
def get_flow_information(uid_list):
    results = {}
    #results = {uid:{'hashtag_dict':{},'hashtag':'', 'keywords_dict':{}, 'keywords_string':'', 'activity_geo':'', 'activity_geo_dict':dict}}
    iter_results = {
    }  # iter_results = {uid:{'hashtag': hashtag_dict, 'geo':geo_dict, 'keywords':keywords_dict}}
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    #test
    now_date_ts = test_ts
    for i in range(7, 0, -1):
        ts = now_date_ts - DAY * i
        iter_date = ts2datetime(ts)
        flow_text_index_name = flow_text_index_name_pre + iter_date
        uid_day_geo = {}
        #compute hashtag and geo
        hashtag_results = r_cluster.hmget('hashtag_' + str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        #compute sensitive_words
        sensitive_results = r_cluster.hmget('sensitive_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            #init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {
                    'hashtag': {},
                    'geo': {},
                    'geo_track': [],
                    'keywords': {},
                    'filter_keywords': {},
                    'sensitive': {}
                }
            #compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[
                        hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[
                        hashtag]
            #compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][
                        sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][
                        sensitive_word] = uid_sensitive_dict[sensitive_word]
            #compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo = ip2city(ip)
                if geo:
                    #print 'geo:', geo
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1

        #compute keywords:
        try:
            text_results = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type, \
                                               body={'query':{'filtered':{'filter':{'terms':{'uid': uid_list}}}}, 'size':MAX_VALUE}, _source=True, fields=['uid', 'keywords_dict','text'])['hits']['hits']
        except:
            text_results = {}
        for item in text_results:
            #print 'keywords item:', item
            uid = item['fields']['uid'][0]
            uid_keywords_dict = json.loads(item['fields']['keywords_dict'][0])
            for keywords in uid_keywords_dict:
                try:
                    iter_results[uid]['keywords'][
                        keywords] += uid_keywords_dict[keywords]
                except:
                    iter_results[uid]['keywords'][
                        keywords] = uid_keywords_dict[keywords]

            #jln filter keyword 2016/11/08
            weibo_text = json.loads(item['fields']['text'][0])
            filter_keywords_dict = get_weibo_single(weibo_text)

            for keywords in filter_keywords_dict:
                try:
                    iter_results[uid]['filter_keywords'][
                        keywords] += filter_keywords_dict[keywords]
                except:
                    iter_results[uid]['filter_keywords'][
                        keywords] = filter_keywords_dict[keywords]

    #get keywords top
    for uid in uid_list:
        results[uid] = {}
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        #sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        for item in sensitive_word_dict:
            k = item
            v = sensitive_word_dict[k]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        results[uid]['sensitive'] = sensitive_score
        #print 'sensitive_dict:', results[uid]['sensitive_dict']
        #print 'sensitive_string:', results[uid]['sensitive_string']
        #print 'sensitive:', results[uid]['sensitive']
        #geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        #print 'geo_dict_keys:', geo_dict_keys
        results[uid]['activity_geo'] = '&'.join(
            ['&'.join(item.split('\t')) for item in geo_dict_keys])
        #print 'activity_geo:',  results[uid]['activity_geo']

        keywords_dict = iter_results[uid]['keywords']
        keywords_top50 = sorted(keywords_dict.items(),
                                key=lambda x: x[1],
                                reverse=True)[:50]
        keywords_top50_string = '&'.join(
            [keyword_item[0] for keyword_item in keywords_top50])

        filter_keywords_dict = iter_results[uid]['filter_keywords']
        f_keywords_top50 = sorted(filter_keywords_dict.items(),
                                  key=lambda x: x[1],
                                  reverse=True)[:50]
        f_keywords_top50_string = '&'.join(
            [keyword_item[0] for keyword_item in f_keywords_top50])

        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string

        results[uid]['filter_keywords'] = json.dumps(f_keywords_top50)
        results[uid]['filter_keywords_string'] = f_keywords_top50_string
    return results
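Sorting the full keyword dict just to keep the top 50 is O(n log n); heapq.nlargest does the same selection more cheaply and reads the same. A sketch of the swap (illustrative data):

import heapq

keywords_dict = {'k1': 5, 'k2': 9, 'k3': 1}
keywords_top2 = heapq.nlargest(2, keywords_dict.items(), key=lambda x: x[1])
print keywords_top2   # [('k2', 9), ('k1', 5)]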
def main():
    if RUN_TYPE:
        now_ts = time.time() - DAY  # the previous day
        ts = str(datetime2ts(ts2datetime(now_ts)))
    else:
        ts = str(datetime2ts('2013-09-02'))
    now_ts = int(ts)
    print now_ts
    sensitive_string = "sensitive_" + ts
    date_string = ts
    update_sensitive_key = "sensitive_score_" + ts  # key updated for this day
    sensitive_dict_key = "sensitive_dict_" + ts
    sensitive_string_key = "sensitive_string_" + ts
    sensitive_day_change_key = "sensitive_" + ts + "_day_change"
    del_month = datetime2ts(ts2datetime(now_ts - MONTH))
    del_sensitive_key = "sensitive_score_" + str(del_month)  # month-old key to delete

    former_ts = int(ts) - DAY
    former_date = str(datetime2ts(ts2datetime(former_ts)))
    former_sensitive_key = "sensitive_score_" + former_date

    iter_count = 0
    bulk_action = []

    mappings(ES_SENSITIVE_INDEX)
    total_number = r_cluster.hlen(sensitive_string)
    scan_cursor = 0
    print total_number

    while 1:
        re_scan = r_cluster.hscan(sensitive_string, scan_cursor, count=1000)
        scan_cursor = re_scan[0]
        if len(re_scan[1]) != 0:
            sensitive_info = re_scan[1]  # dict: uid -> JSON sensitive_words_dict
            uid_list = sensitive_info.keys()
            sensitive_results = es.mget(index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX, body={"ids":uid_list})['docs']
            if sensitive_results:
                for item in sensitive_results:
                    uid = item['_id']
                    sensitive_words_dict = json.loads(sensitive_info[uid]) # json.loads
                    current_sensitive_score = 0
                    for k,v in sensitive_words_dict.iteritems():
                        tmp_stage = r_sensitive.hget("sensitive_words", k)
                        if tmp_stage:
                            tmp_stage = json.loads(tmp_stage)
                            current_sensitive_score += v*sensitive_score_dict[str(tmp_stage[0])]
                    if item['found']:  # a previous record exists
                        revise_item = item["_source"]
                        if del_sensitive_key in revise_item:
                            revise_item.pop(del_sensitive_key)  # the original popped from `item`, leaving the stale key in place
                        revise_item['uid'] = uid
                        # newly updated sensitivity score
                        revise_item[update_sensitive_key] = current_sensitive_score
                        revise_item['last_value'] = current_sensitive_score
                        # newly updated sensitive words
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        # newly updated sensitive-word string
                        revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys())
                        # change versus the previous day and the week/month averages
                        revise_item['sensitive_day_change'] = current_sensitive_score - revise_item.get(former_sensitive_key, 0)
                        revise_item['sensitive_week_change'] = current_sensitive_score - revise_item.get('sensitive_week_ave', 0)
                        revise_item['sensitive_month_change'] = current_sensitive_score - revise_item.get('sensitive_month_ave', 0)
                        # updated weekly/monthly mean, variance and sum
                        revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
                        revise_item['sensitive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)

                    else:
                        revise_item = dict()
                        revise_item['uid'] = uid
                        revise_item[update_sensitive_key] = current_sensitive_score
                        revise_item['last_value'] = current_sensitive_score
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys())
                        revise_item['sensitive_day_change'] = current_sensitive_score
                        revise_item['sensitive_week_change'] = current_sensitive_score
                        revise_item['sensitive_month_change'] = current_sensitive_score
                        revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
                        revise_item['sensitive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)
                    action = {'index':{'_id': uid}}
                    bulk_action.extend([action, revise_item])
                    iter_count += 1
                    if iter_count % 1000 == 0:
                        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
                        bulk_action = []
                        print iter_count
        if int(scan_cursor) == 0:
            break
    if bulk_action:
        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)

    print iter_count


    ####### update users whose records were not touched above
    update_scan = scan(es, query={"query":{"filtered":{"filter":{"missing":{"field":update_sensitive_key}}}}}, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
    iter_count = 0
    bulk_action = []

    while 1:
        try:
            tmp = update_scan.next()
            revise_item = tmp['_source']
            if del_sensitive_key in revise_item:
                revise_item.pop(del_sensitive_key)
            uid = tmp['_id']
            # newly updated sensitivity score
            revise_item[update_sensitive_key] = 0
            revise_item['last_value'] = 0
            # newly updated sensitive words
            revise_item[sensitive_dict_key] = json.dumps({})
            # newly updated sensitive-word string
            revise_item[sensitive_string_key] = ""
            # change versus the previous day and the week/month averages
            revise_item['sensitive_day_change'] = 0 - revise_item.get(former_sensitive_key, 0)
            revise_item['sensitive_week_change'] = 0 - revise_item.get('sensitive_week_ave', 0)
            revise_item['sensitive_month_change'] = 0 - revise_item.get('sensitive_month_ave', 0)
            # updated weekly/monthly mean, variance and sum
            revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
            revise_item['sensitive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)

            action = {'index':{'_id': uid}}
            bulk_action.extend([action, revise_item])
            iter_count += 1
            if iter_count % 1000 == 0:
                es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
                bulk_action = []
        except StopIteration:
            print "all done"
            if bulk_action:
                es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
            break
        except Exception, r:
            print Exception, r
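The while 1 / hscan loop in main() is standard Redis cursor pagination: start at cursor 0 and stop when the server hands back cursor 0 again. A minimal self-contained sketch with redis-py (assumes a local Redis server; the key name is illustrative):

import redis

r = redis.StrictRedis()
cursor = 0
while True:
    cursor, chunk = r.hscan('sensitive_demo', cursor, count=1000)
    for uid, words_json in chunk.items():
        pass   # process one uid's JSON word-count dict here
    if int(cursor) == 0:
        break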
Example #15
    # NOTE: this listing begins mid-function (presumably inside
    # get_sensitive_info); `query_body` and `score = 0` are defined
    # in the part that was cut off.
    index_name = flow_text_index_name_pre + ts2datetime(timestamp)
    try:
        search_results = es_xnr.search(index=index_name,
                                       doc_type=flow_text_index_type,
                                       body=query_body)['hits']['hits']
    except Exception:
        search_results = []
    node = createWordTree()  # build the word tree once, not per result
    for result in search_results:
        text = result['_source']['text'].encode('utf-8')
        sensitive_words_dict = searchWord(text, node)
        if sensitive_words_dict:
            for k, v in sensitive_words_dict.iteritems():
                tmp_stage = r_sensitive.hget("sensitive_words", k)
                if tmp_stage:
                    score += v * sensitive_score_dict[str(tmp_stage)]
    return score


if __name__ == '__main__':
    # '2017-10-15'
    # print get_sensitive_user(timestamp=1507996800, uid='100003271864059')
    print get_sensitive_info(timestamp=1507996800,
                             mid='123124323',
                             text=u"64和达赖太阳花")
    print get_sensitive_info(timestamp=1507996800, mid='123124323')
    print get_sensitive_info(timestamp=1507996800, text=u"64和达赖太阳花")
    print get_sensitive_info(timestamp=1507996800, text=u'达赖')
    print get_sensitive_info(timestamp=1507996800, text=u'军区')
Example #16
def get_flow_information_v2(uid_list, all_user_keywords_dict):
    results = {}
    #results = {uid:{'hashtag_dict':{},'hashtag':'', 'keywords_dict':{}, 'keywords_string':'', 'activity_geo':'', 'activity_geo_dict':dict, 'activity_geo_aggs':''}}
    iter_results = {
    }  # iter_results = {uid:{'hashtag': hashtag_dict, 'geo':geo_dict, 'keywords':keywords_dict}}
    now_ts = time.time()
    #run_type
    today_sensitive_results = {}
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    print 'run_type:', RUN_TYPE
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        uid_day_geo = {}
        #compute hashtag and geo
        hashtag_results = r_cluster_3.hmget('hashtag_' + str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        #print 'ip_results:', ip_results
        #compute sensitive_words
        sensitive_results = r_cluster_3.hmget('sensitive_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            #init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {
                    'hashtag': {},
                    'geo': {},
                    'geo_track': [],
                    'keywords': {},
                    'sensitive': {},
                    'school': {},
                    'week_ip': {
                        0: {},
                        1: {},
                        2: {},
                        3: {},
                        4: {},
                        5: {}
                    },
                    'ip': {}
                }
            if uid not in today_sensitive_results:
                today_sensitive_results[uid] = {}
            #compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[
                        hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[
                        hashtag]
            #compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][
                        sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][
                        sensitive_word] = uid_sensitive_dict[sensitive_word]
                if ts == now_date_ts - DAY:
                    try:
                        today_sensitive_results[uid][
                            sensitive_word] += uid_sensitive_dict[
                                sensitive_word]
                    except:
                        today_sensitive_results[uid][
                            sensitive_word] = uid_sensitive_dict[
                                sensitive_word]
            #compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo, school = ip2city(ip)
                if geo:
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
                if school:
                    try:
                        iter_results[uid]['school'][school] += ip_count
                    except:
                        iter_results[uid]['school'][school] = ip_count
                #deal ip: job_ip&home_ip&active_ip
                ip_time_list = uid_ip_dict[ip].split('&')
                try:
                    iter_results[uid]['ip'][ip] += ip_count
                except:
                    iter_results[uid]['ip'][ip] = ip_count  # the original re-bound the whole 'ip' dict here, wiping earlier counts
                for ip_time_item in ip_time_list:
                    ip_timesegment = (int(ip_time_item) - ts) / IP_TIME_SEGMENT
                    try:
                        iter_results[uid]['week_ip'][ip_timesegment][ip] += 1
                    except:
                        iter_results[uid]['week_ip'][ip_timesegment][ip] = 1
                #end deal ip
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1
    #get keywords top
    for uid in uid_list:
        #print 'test iter_results_ip:', iter_results[uid]['week_ip']
        results[uid] = {}
        #hashtag
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        #sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        today_sensitive_results_user = today_sensitive_results[uid]
        for sensitive_item in today_sensitive_results_user:
            k = sensitive_item
            v = today_sensitive_results_user[sensitive_item]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        results[uid]['sensitive'] = sensitive_score
        #geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(
            ['&'.join(item.split('\t')) for item in geo_dict_keys])
        try:
            results[uid]['activity_geo_aggs'] = '&'.join(
                [item.split('\t')[-1] for item in geo_dict_keys])
        except:
            results[uid]['activity_geo_aggs'] = ''
        #keywords
        keywords_dict = all_user_keywords_dict[uid]
        keywords_top50 = sorted(keywords_dict.items(),
                                key=lambda x: x[1],
                                reverse=True)[:50]
        keywords_top50_string = '&'.join(
            [keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
        #school dict
        school_dict = iter_results[uid]['school']
        school_string = '&'.join(school_dict.keys())
        if school_dict != {}:
            is_school = '1'
        else:
            is_school = '0'
        results[uid]['is_school'] = is_school
        results[uid]['school_string'] = school_string
        results[uid]['school_dict'] = json.dumps(school_dict)
        #ip: job_ip&home_ip&activity_ip
        #activity_ip
        all_ip_dict = iter_results[uid]['ip']
        sort_all_ip = sorted(all_ip_dict.items(),
                             key=lambda x: x[1],
                             reverse=True)
        try:
            activity_ip = sort_all_ip[0][0]
        except:
            activity_ip = ''
        results[uid]['activity_ip'] = str(activity_ip)
        #job_ip & home_ip
        week_time_ip_dict = iter_results[uid]['week_ip']
        for i in range(0, 6):
            week_time_ip_dict.setdefault(i, {})  # make sure every time segment key exists
        home_ip, job_ip = get_ip_description(week_time_ip_dict)
        results[uid]['home_ip'] = str(home_ip)
        results[uid]['job_ip'] = str(job_ip)

    return results
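The week_ip buckets come from integer-dividing each event's offset into the day by IP_TIME_SEGMENT; six buckets per day implies a segment of DAY/6, i.e. four hours (an assumption, since the constant's value is not shown here). A sketch of the bucketing under that assumption:

DAY = 24 * 3600
IP_TIME_SEGMENT = DAY / 6   # assumed: 4-hour segments -> buckets 0..5

day_start = 0
for t in [0, 3600, 5 * 3600, 23 * 3600]:
    print (t - day_start) / IP_TIME_SEGMENT   # 0, 0, 1, 5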
Example #17
def get_flow_information(uid_list):
    # data for the previous seven days; not usable for daily updates
    length = len(uid_list)
    results = {}
    iter_results = {}
    result_dict = {}
    if RUN_TYPE:
        now_ts = time.time()
        now_date = ts2datetime(now_ts)  # date: 2013-09-01
    else:
        now_date = "2013-09-08"
    ts = datetime2ts(now_date)

    start_ts = ts - 8 * 3600 * 24
    for i in range(1, 8):
        ts = start_ts + i * 3600 * 24
        date = ts2datetime(ts)
        print "date:", date
        uid_day_geo = {}
        sensitive_uid_day_geo = {}
        flow_index_name = flow_text_index_name_pre + str(date)
        # hashtag
        print uid_list
        hashtag_results = redis_cluster.hmget('hashtag_' + str(ts), uid_list)
        sensitive_hashtag = redis_cluster.hmget('sensitive_hashtag_' + str(ts),
                                                uid_list)
        # sensitive_words
        sensitive_results = redis_cluster.hmget('sensitive_' + str(ts),
                                                uid_list)
        # ip
        if WORK_TYPE == 0:
            ip_index_name = ip_index_pre + str(date)
            sensitive_ip_index_name = sen_ip_index_pre + str(date)
            #activity_index_name = act_index_pre + str(date)
            #sensitive_activity_index_name = sen_act_index_pre + str(date)
            exist_bool = es_cluster.indices.exists(index=ip_index_name)
            sensitive_exist_bool = es_cluster.indices.exists(
                index=sensitive_ip_index_name)
            #activity_exist_bool = es_cluster.indices.exists(index=activity_index_name)
            #sensitive_activity_exist_bool = es_cluster.indices.exists(index=sensitive_activity_index_name)
            if exist_bool:
                ip_results = es_cluster.mget(index=ip_index_name,
                                             doc_type="ip",
                                             body={"ids": uid_list})["docs"]
            else:
                ip_results = [dict()] * length
            if sensitive_exist_bool:
                sensitive_ip_results = es_cluster.mget(
                    index=sensitive_ip_index_name,
                    doc_type="sensitive_ip",
                    body={"ids": uid_list})["docs"]
            else:
                sensitive_ip_results = [dict()] * length
            """
            if activity_exist_bool:
                activity_results = es_cluster.mget(index=activity_index_name, doc_type="activity", body={"ids":uid_list})["docs"]
            else:
                activity_results = [dict()]*lenth
            if sensitive_activity_exist_bool:
                sensitive_activity_results = es_cluster.mget(index=sensitive_activity_index_name, doc_type="sensitive_activity", body={"ids":uid_list})["docs"]
            else:
                sensitive_activity_results = [dict()]*lenth
            """
        else:
            ip_results = redis_ip.hmget('ip_' + str(ts), uid_list)
            sensitive_ip_results = redis_ip.hmget('sensitive_ip_' + str(ts),
                                                  uid_list)
            #activity_results = redis_activity.hmget('activity_'+str(date), uid_list)
            #sensitive_activity_results = redis_activity.hmget('sensitive_activity_'+str(date), uid_list)

        for j in range(0, len(uid_list)):
            uid = uid_list[j]
            if uid not in iter_results:
                iter_results[uid] = {'hashtag':{}, 'sensitive_hashtag':{}, 'geo':{}, "sensitive_geo":{},'geo_track':[],'keywords':{}, \
                        'sensitive_words':{}, "sensitive_geo_track":[],'ip': [], 'sensitive_ip':[]}
            # sensitive words
            if sensitive_results[j]:
                sensitive_words_results = json.loads(sensitive_results[j])
                for sensitive_word in sensitive_words_results:
                    try:
                        iter_results[uid]["sensitive_words"][
                            sensitive_word] += sensitive_words_results[
                                sensitive_word]
                    except:
                        iter_results[uid]["sensitive_words"][
                            sensitive_word] = sensitive_words_results[
                                sensitive_word]
            #print "sensitive_words:", iter_results[uid]["sensitive_words"]

            if hashtag_results[j]:
                hashtag_dict = json.loads(hashtag_results[j])
                for hashtag in hashtag_dict:
                    try:
                        iter_results[uid]['hashtag'][hashtag] += hashtag_dict[
                            hashtag]
                    except:
                        iter_results[uid]['hashtag'][hashtag] = hashtag_dict[
                            hashtag]
            #print "hashtag: ", iter_results[uid]['hashtag']

            if sensitive_hashtag[j]:
                sensitive_hashtag_dict = json.loads(sensitive_hashtag[j])
                for hashtag in sensitive_hashtag_dict:
                    try:
                        iter_results[uid]['sensitive_hashtag'][
                            hashtag] += sensitive_hashtag_dict[hashtag]
                    except:
                        iter_results[uid]['sensitive_hashtag'][
                            hashtag] = sensitive_hashtag_dict[hashtag]
            #print "sensitive_hashtag:", iter_results[uid]['sensitive_hashtag']

            uid_day_geo[uid] = {}
            sensitive_uid_day_geo[uid] = {}
            if WORK_TYPE == 0:  # es
                if ip_results[j]:
                    if ip_results[j]['found']:
                        detail_item = ip_results[j]['_source']
                        ip_dict = json.loads(detail_item['ip_dict'])
                    else:
                        ip_dict = {}
                else:
                    ip_dict = {}
            else:
                if ip_results[j]:
                    ip_dict = json.loads(ip_results[j])
                else:
                    ip_dict = {}
            if ip_dict:
                #iter_results[uid]['ip'].append(ip_dict)
                geo_dict = ip2geo(ip_dict)
                for geo, count in geo_dict.iteritems():
                    try:
                        iter_results[uid]['geo'][geo] += count
                    except:
                        iter_results[uid]['geo'][geo] = count
                    try:
                        uid_day_geo[uid][geo] += count
                    except:
                        uid_day_geo[uid][geo] = count
            #iter_results[uid]['ip'].append(ip_dict)
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            #print "ip:", iter_results[uid]['ip'], iter_results[uid]['geo_track']

            if WORK_TYPE == 0:
                if sensitive_ip_results[j]:
                    if sensitive_ip_results[j]['found']:
                        detail_item = sensitive_ip_results[j]['_source']
                        sensitive_ip_dict = json.loads(
                            detail_item['sensitive_ip_dict'])
                    else:
                        sensitive_ip_dict = dict()
                else:
                    sensitive_ip_dict = dict()
            else:
                if sensitive_ip_results[j]:
                    sensitive_ip_dict = json.loads(sensitive_ip_results[j])
                else:
                    sensitive_ip_dict = dict()
            if sensitive_ip_dict:
                sensitive_geo_dict = ip2geo(sensitive_ip_dict)
                #iter_results[uid]['sensitive_ip'].append(sensitive_ip_dict)
                for geo, count in sensitive_geo_dict.iteritems():
                    try:
                        iter_results[uid]['sensitive_geo'][geo] += count
                    except:
                        iter_results[uid]['sensitive_geo'][geo] = count
                    try:
                        sensitive_uid_day_geo[uid][geo] += count
                    except:
                        sensitive_uid_day_geo[uid][geo] = count
            #iter_results[uid]['sensitive_ip'].append(sensitive_ip_dict)
            iter_results[uid]['sensitive_geo_track'].append(
                sensitive_uid_day_geo[uid])
            #print "sensitive_ip:", iter_results[uid]['sensitive_ip'], iter_results[uid]['sensitive_geo_track']

        # compute keywords
        flow_text_exist = es_flow_text.indices.exists(index=flow_index_name)
        if flow_text_exist:
            text_results = es_flow_text.search(index=flow_index_name, doc_type=flow_text_index_type,\
                   body={'query':{'filtered':{'filter':{'terms':{'uid': uid_list}}}}, 'size':MAX_VALUE},_source=False, fields=['uid', 'keywords_dict'])['hits']['hits']
        else:
            text_results = {}
        for item in text_results:
            uid = item['fields']['uid'][0]
            uid_keywords_dict = json.loads(item['fields']['keywords_dict'][0])
            for keywords in uid_keywords_dict:
                try:
                    iter_results[uid]['keywords'][
                        keywords] += uid_keywords_dict[keywords]
                except:
                    iter_results[uid]['keywords'][
                        keywords] = uid_keywords_dict[keywords]
        #print "keywords:", iter_results[uid]['keywords']

    for uid in uid_list:
        results[uid] = {}
        # hashtag
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag_string'] = '&'.join(hashtag_dict.keys())
        # sensitive hashtag
        sensitive_hashtag_dict = iter_results[uid]['sensitive_hashtag']
        results[uid]['sensitive_hashtag_dict'] = json.dumps(
            sensitive_hashtag_dict)
        results[uid]['sensitive_hashtag_string'] = '&'.join(
            sensitive_hashtag_dict.keys())
        # sensitive_words
        sensitive_word_dict = iter_results[uid]['sensitive_words']
        results[uid]['sensitive_words_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_words_string'] = '&'.join(
            sensitive_word_dict.keys())
        sensitive_score = 0
        for k, v in sensitive_word_dict.iteritems():
            tmp = r_sensitive.hget('sensitive_words', k)
            if tmp:
                tmp_stage = json.loads(tmp)
                sensitive_score += sensitive_score_dict[str(tmp_stage[0])] * v
        results[uid]['sensitive'] = sensitive_score
        # geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(
            ['&'.join(item.split('\t')) for item in geo_dict_keys])
        results[uid]['activity_geo_aggs'] = '&'.join(
            [item.split('\t')[-1] for item in geo_dict_keys])
        sensitive_geo_dict = iter_results[uid]['sensitive_geo']
        sensitive_geo_track_list = iter_results[uid]['sensitive_geo_track']
        results[uid]['sensitive_activity_geo_dict'] = json.dumps(
            sensitive_geo_track_list)
        sensitive_geo_dict_keys = sensitive_geo_dict.keys()
        results[uid]['sensitive_activity_geo'] = '&'.join(
            ['&'.join(item.split('\t')) for item in sensitive_geo_dict_keys])
        results[uid]['sensitive_activity_geo_aggs'] = '&'.join(
            [item.split('\t')[-1] for item in sensitive_geo_dict_keys])

        keywords_dict = iter_results[uid]['keywords']
        keywords_top50 = sorted(keywords_dict.items(),
                                key=lambda x: x[1],
                                reverse=True)[:50]
        keywords_top50_string = '&'.join(
            [keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords_dict'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string

    return results
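ip2city / ip2geo apparently return tab-separated place levels, so the join/split above turns a key like u'中国\t河北\t石家庄' into '中国&河北&石家庄', and the aggs variant keeps only the last level. A tiny sketch:

# -*- coding: utf-8 -*-
geo_keys = [u'中国\t河北\t石家庄']
print '&'.join(['&'.join(item.split('\t')) for item in geo_keys])   # 中国&河北&石家庄
print '&'.join([item.split('\t')[-1] for item in geo_keys])         # 石家庄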
def get_flow_information_v2(uid_list, all_user_keywords_dict):
    results = {}      
    #results = {uid:{'hashtag_dict':{},'hashtag':'', 'keywords_dict':{}, 'keywords_string':'', 'activity_geo':'', 'activity_geo_dict':dict, 'activity_geo_aggs':''}}
    iter_results = {} # iter_results = {uid:{'hashtag': hashtag_dict, 'geo':geo_dict, 'keywords':keywords_dict}}
    now_ts = time.time()
    #run_type
    today_sensitive_results = {}
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    for i in range(WEEK,0,-1):
        ts = now_date_ts - DAY*i
        uid_day_geo = {}
        #compute hashtag and geo
        hashtag_results = r_cluster_3.hmget('hashtag_'+str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_'+str(ts), uid_list)
        #compute sensitive_words
        sensitive_results = r_cluster_3.hmget('sensitive_'+str(ts), uid_list)
        count = 0 
        for uid in uid_list:
            #init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {'hashtag':{}, 'geo':{},'geo_track':[],'keywords':{}, 'sensitive':{}, 'school':{}}
            if uid not in today_sensitive_results:
                today_sensitive_results[uid] = {}
            #compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[hashtag]
            #compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][sensitive_word] = uid_sensitive_dict[sensitive_word]
                if ts == now_date_ts - DAY:
                    try:
                        today_sensitive_results[uid][sensitive_word] += uid_sensitive_dict[sensitive_word]
                    except:
                        today_sensitive_results[uid][sensitive_word] = uid_sensitive_dict[sensitive_word]
            #compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo, school = ip2city(ip)
                if geo:
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
                if school:
                    try:
                        iter_results[uid]['school'][school] += ip_count
                    except:
                        iter_results[uid]['school'][school] = ip_count
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1
               
    #get keywords top
    for uid in uid_list:
        results[uid] = {}
        #hashtag
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        #sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        today_sensitive_results_user = today_sensitive_results[uid]
        for k, v in today_sensitive_results_user.iteritems():
            tmp = r_sensitive.hget('sensitive_words', k)
            if tmp:
                tmp_stage = json.loads(tmp)
                sensitive_score += v * sensitive_score_dict[str(tmp_stage[0])]
        results[uid]['sensitive'] = sensitive_score
        #print 'sensitive_dict:', results[uid]['sensitive_dict']
        #print 'sensitive_string:', results[uid]['sensitive_string']
        #print 'sensitive:', results[uid]['sensitive']
        #geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(['&'.join(item.split('\t')) for item in geo_dict_keys])
        try:
            results[uid]['activity_geo_aggs'] = '&'.join([item.split('\t')[-1] for item in geo_dict_keys])
        except:
            results[uid]['activity_geo_aggs'] = ''
        #keywords
        keywords_dict = all_user_keywords_dict[uid]
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x:x[1], reverse=True)[:50]
        keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
        #school dict
        school_dict = iter_results[uid]['school']
        school_string = '&'.join(school_dict.keys())
        if school_dict != {}:
            is_school = '1'
        else:
            is_school = '0'
        results[uid]['is_school'] = is_school
        results[uid]['school_string'] = school_string
        results[uid]['school_dict'] = json.dumps(school_dict)
        
    return results
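
# A usage sketch for get_flow_information_v2 (illustrative, not from the
# original source): the uids and keyword counts below are made up, and the
# redis/ES connections are assumed to be configured as in the snippets above.
example_uid_list = ['1111111111', '2222222222']
example_keywords_dict = {
    '1111111111': {'keyword_a': 3, 'keyword_b': 1},
    '2222222222': {'keyword_c': 7},
}
example_profiles = get_flow_information_v2(example_uid_list, example_keywords_dict)
for example_uid in example_uid_list:
    print example_uid, example_profiles[example_uid]['sensitive']
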
def get_flow_information(uid_list):
    # data from the previous seven days; cannot be used for daily incremental updates
    length = len(uid_list)
    results = {}
    iter_results = {}
    result_dict = {}
    if RUN_TYPE:
        now_ts = time.time()
        now_date = ts2datetime(now_ts)  # date: 2013-09-01
    else:
        now_date = "2013-09-08"
    ts = datetime2ts(now_date)

    start_ts = ts - 8 * 3600 * 24
    for i in range(1, 8):
        ts = start_ts + i * 3600 * 24
        date = ts2datetime(ts)
        print "date:", date
        uid_day_geo = {}
        sensitive_uid_day_geo = {}
        flow_index_name = flow_text_index_name_pre + str(date)
        # hashtag
        print uid_list
        hashtag_results = redis_cluster.hmget("hashtag_" + str(ts), uid_list)
        sensitive_hashtag = redis_cluster.hmget("sensitive_hashtag_" + str(ts), uid_list)
        # sensitive_words
        sensitive_results = redis_cluster.hmget("sensitive_" + str(ts), uid_list)
        # ip
        if WORK_TYPE == 0:
            ip_index_name = ip_index_pre + str(date)
            sensitive_ip_index_name = sen_ip_index_pre + str(date)
            # activity_index_name = act_index_pre + str(date)
            # sensitive_activity_index_name = sen_act_index_pre + str(date)
            exist_bool = es_cluster.indices.exists(index=ip_index_name)
            sensitive_exist_bool = es_cluster.indices.exists(index=sensitive_ip_index_name)
            # activity_exist_bool = es_cluster.indices.exists(index=activity_index_name)
            # sensitive_activity_exist_bool = es_cluster.indices.exists(index=sensitive_activity_index_name)
            if exist_bool:
                ip_results = es_cluster.mget(index=ip_index_name, doc_type="ip", body={"ids": uid_list})["docs"]
            else:
                ip_results = [dict()] * length
            if sensitive_exist_bool:
                sensitive_ip_results = es_cluster.mget(
                    index=sensitive_ip_index_name, doc_type="sensitive_ip", body={"ids": uid_list}
                )["docs"]
            else:
                sensitive_ip_results = [dict()] * length
            """
            if activity_exist_bool:
                activity_results = es_cluster.mget(index=activity_index_name, doc_type="activity", body={"ids":uid_list})["docs"]
            else:
                activity_results = [dict()]*lenth
            if sensitive_activity_exist_bool:
                sensitive_activity_results = es_cluster.mget(index=sensitive_activity_index_name, doc_type="sensitive_activity", body={"ids":uid_list})["docs"]
            else:
                sensitive_activity_results = [dict()]*lenth
            """
        else:
            ip_results = redis_ip.hmget("ip_" + str(ts), uid_list)
            sensitive_ip_results = redis_ip.hmget("sensitive_ip_" + str(ts), uid_list)
            # activity_results = redis_activity.hmget('activity_'+str(date), uid_list)
            # sensitive_activity_results = redis_activity.hmget('sensitive_activity_'+str(date), uid_list)

        for j in range(0, len(uid_list)):
            uid = uid_list[j]
            if uid not in iter_results:
                iter_results[uid] = {
                    "hashtag": {},
                    "sensitive_hashtag": {},
                    "geo": {},
                    "sensitive_geo": {},
                    "geo_track": [],
                    "keywords": {},
                    "sensitive_words": {},
                    "sensitive_geo_track": [],
                    "ip": [],
                    "sensitive_ip": [],
                }
            # sensitive words
            if sensitive_results[j]:
                sensitive_words_results = json.loads(sensitive_results[j])
                for sensitive_word in sensitive_words_results:
                    try:
                        iter_results[uid]["sensitive_words"][sensitive_word] += sensitive_words_results[sensitive_word]
                    except:
                        iter_results[uid]["sensitive_words"][sensitive_word] = sensitive_words_results[sensitive_word]
            # print "sensitive_words:", iter_results[uid]["sensitive_words"]

            if hashtag_results[j]:
                hashtag_dict = json.loads(hashtag_results[j])
                for hashtag in hashtag_dict:
                    try:
                        iter_results[uid]["hashtag"][hashtag] += hashtag_dict[hashtag]
                    except:
                        iter_results[uid]["hashtag"][hashtag] = hashtag_dict[hashtag]
            # print "hashtag: ", iter_results[uid]['hashtag']

            if sensitive_hashtag[j]:
                sensitive_hashtag_dict = json.loads(sensitive_hashtag[j])
                for hashtag in sensitive_hashtag_dict:
                    try:
                        iter_results[uid]["sensitive_hashtag"][hashtag] += sensitive_hashtag_dict[hashtag]
                    except:
                        iter_results[uid]["sensitive_hashtag"][hashtag] = sensitive_hashtag_dict[hashtag]
            # print "sensitive_hashtag:", iter_results[uid]['sensitive_hashtag']

            uid_day_geo[uid] = {}
            sensitive_uid_day_geo[uid] = {}
            if WORK_TYPE == 0:  # es
                if ip_results[j]:
                    if ip_results[j]["found"]:
                        detail_item = ip_results[j]["_source"]
                        ip_dict = json.loads(detail_item["ip_dict"])
                    else:
                        ip_dict = {}
                else:
                    ip_dict = {}
            else:
                if ip_results[j]:
                    ip_dict = json.loads(ip_results[j])
                else:
                    ip_dict = {}
            if ip_dict:
                # iter_results[uid]['ip'].append(ip_dict)
                geo_dict = ip2geo(ip_dict)
                for geo, count in geo_dict.iteritems():
                    try:
                        iter_results[uid]["geo"][geo] += count
                    except:
                        iter_results[uid]["geo"][geo] = count
                    try:
                        uid_day_geo[uid][geo] += count
                    except:
                        uid_day_geo[uid][geo] = count
            # iter_results[uid]['ip'].append(ip_dict)
            iter_results[uid]["geo_track"].append(uid_day_geo[uid])
            # print "ip:", iter_results[uid]['ip'], iter_results[uid]['geo_track']

            if WORK_TYPE == 0:
                if sensitive_ip_results[j]:
                    if sensitive_ip_results[j]["found"]:
                        detail_item = sensitive_ip_results[j]["_source"]
                        sensitive_ip_dict = json.loads(detail_item["sensitive_ip_dict"])
                    else:
                        sensitive_ip_dict = dict()
                else:
                    sensitive_ip_dict = dict()
            else:
                if sensitive_ip_results[j]:
                    sensitive_ip_dict = json.loads(sensitive_ip_results[j])
                else:
                    sensitive_ip_dict = dict()
            if sensitive_ip_dict:
                sensitive_geo_dict = ip2geo(sensitive_ip_dict)
                # iter_results[uid]['sensitive_ip'].append(sensitive_ip_dict)
                for geo, count in sensitive_geo_dict.iteritems():
                    try:
                        iter_results[uid]["sensitive_geo"][geo] += count
                    except:
                        iter_results[uid]["sensitive_geo"][geo] = count
                    try:
                        sensitive_uid_day_geo[uid][geo] += count
                    except:
                        sensitive_uid_day_geo[uid][geo] = count
            # iter_results[uid]['sensitive_ip'].append(sensitive_ip_dict)
            iter_results[uid]["sensitive_geo_track"].append(sensitive_uid_day_geo[uid])
            # print "sensitive_ip:", iter_results[uid]['sensitive_ip'], iter_results[uid]['sensitive_geo_track']

        # compute keywords
        flow_text_exist = es_flow_text.indices.exists(index=flow_index_name)
        if flow_text_exist:
            text_results = es_flow_text.search(
                index=flow_index_name,
                doc_type=flow_text_index_type,
                body={"query": {"filtered": {"filter": {"terms": {"uid": uid_list}}}}, "size": MAX_VALUE},
                _source=False,
                fields=["uid", "keywords_dict"],
            )["hits"]["hits"]
        else:
            text_results = {}
        for item in text_results:
            uid = item["fields"]["uid"][0]
            uid_keywords_dict = json.loads(item["fields"]["keywords_dict"][0])
            for keywords in uid_keywords_dict:
                try:
                    iter_results[uid]["keywords"][keywords] += uid_keywords_dict[keywords]
                except:
                    iter_results[uid]["keywords"][keywords] = uid_keywords_dict[keywords]
        # print "keywords:", iter_results[uid]['keywords']

    for uid in uid_list:
        results[uid] = {}
        # hashtag
        hashtag_dict = iter_results[uid]["hashtag"]
        results[uid]["hashtag_dict"] = json.dumps(hashtag_dict)
        results[uid]["hashtag_string"] = "&".join(hashtag_dict.keys())
        # sensitive hashtag
        sensitive_hashtag_dict = iter_results[uid]["sensitive_hashtag"]
        results[uid]["sensitive_hashtag_dict"] = json.dumps(sensitive_hashtag_dict)
        results[uid]["sensitive_hashtag_string"] = "&".join(sensitive_hashtag_dict.keys())
        # sensitive_words
        sensitive_word_dict = iter_results[uid]["sensitive_words"]
        results[uid]["sensitive_words_dict"] = json.dumps(sensitive_word_dict)
        results[uid]["sensitive_words_string"] = "&".join(sensitive_word_dict.keys())
        sensitive_score = 0
        for k, v in sensitive_word_dict.iteritems():
            tmp = r_sensitive.hget("sensitive_words", k)
            if tmp:
                tmp_stage = json.loads(tmp)
                sensitive_score += sensitive_score_dict[str(tmp_stage[0])] * v
        results[uid]["sensitive"] = sensitive_score
        # geo
        geo_dict = iter_results[uid]["geo"]
        geo_track_list = iter_results[uid]["geo_track"]
        results[uid]["activity_geo_dict"] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]["activity_geo"] = "&".join(["&".join(item.split("\t")) for item in geo_dict_keys])
        results[uid]["activity_geo_aggs"] = "&".join([item.split("\t")[-1] for item in geo_dict_keys])
        sensitive_geo_dict = iter_results[uid]["sensitive_geo"]
        sensitive_geo_track_list = iter_results[uid]["sensitive_geo_track"]
        results[uid]["sensitive_activity_geo_dict"] = json.dumps(sensitive_geo_track_list)
        sensitive_geo_dict_keys = sensitive_geo_dict.keys()
        results[uid]["sensitive_activity_geo"] = "&".join(
            ["&".join(item.split("\t")) for item in sensitive_geo_dict_keys]
        )
        results[uid]["sensitive_activity_geo_aggs"] = "&".join(
            [item.split("\t")[-1] for item in sensitive_geo_dict_keys]
        )

        keywords_dict = iter_results[uid]["keywords"]
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50]
        keywords_top50_string = "&".join([keyword_item[0] for keyword_item in keywords_top50])
        results[uid]["keywords_dict"] = json.dumps(keywords_top50)
        results[uid]["keywords_string"] = keywords_top50_string

    return results
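
# The try/except-KeyError accumulation used throughout these functions can be
# written more compactly with collections.defaultdict. A minimal sketch of the
# same merging step (names are illustrative):
from collections import defaultdict

def merge_count_dicts(daily_dicts):
    # Sum several {key: count} dicts, e.g. one per day, into one total dict.
    totals = defaultdict(int)
    for day_dict in daily_dicts:
        for key, value in day_dict.items():
            totals[key] += value
    return dict(totals)

# merge_count_dicts([{'tag_a': 2}, {'tag_a': 1, 'tag_b': 4}])
# -> {'tag_a': 3, 'tag_b': 4}
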
def get_flow_information(uid_list):
    results = {}      
    #results = {uid:{'hashtag_dict':{},'hashtag':'', 'keywords_dict':{}, 'keywords_string':'', 'activity_geo':'', 'activity_geo_dict':dict}}
    iter_results = {} # iter_results = {uid:{'hashtag': hashtag_dict, 'geo':geo_dict, 'keywords':keywords_dict}}
    now_ts = time.time()
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    for i in range(7,0,-1):
        ts = now_date_ts - DAY*i
        iter_date = ts2datetime(ts)
        flow_text_index_name = flow_text_index_name_pre + iter_date
        uid_day_geo = {}
        #compute hashtag and geo
        hashtag_results = r_cluster_3.hmget('hashtag_'+str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_'+str(ts), uid_list)
        #compute sensitive_words
        sensitive_results = r_cluster_3.hmget('sensitive_'+str(ts), uid_list)
        count = 0 
        for uid in uid_list:
            #init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {'hashtag':{}, 'geo':{},'geo_track':[],'keywords':{}, 'sensitive':{}}
            #compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[hashtag]
            #compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][sensitive_word] = uid_sensitive_dict[sensitive_word]
            #compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo = ip2city(ip)
                if geo:
                    #print 'geo:', geo
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1
        
        #compute keywords:        
        try:
            text_results = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type, \
                                               body={'query':{'filtered':{'filter':{'terms':{'uid': uid_list}}}}, 'size':MAX_VALUE}, _source=True, fields=['uid', 'keywords_dict'])['hits']['hits']
        except:
            text_results = {}
        for item in text_results:
            #print 'keywords item:', item
            uid = item['fields']['uid'][0]
            uid_keywords_dict = json.loads(item['fields']['keywords_dict'][0])
            for keywords in uid_keywords_dict:
                try:
                    iter_results[uid]['keywords'][keywords] += uid_keywords_dict[keywords]
                except:
                    iter_results[uid]['keywords'][keywords] = uid_keywords_dict[keywords]

       
    #get keywords top
    for uid in uid_list:
        results[uid] = {}
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        #sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        for k, v in sensitive_word_dict.iteritems():
            tmp = r_sensitive.hget('sensitive_words', k)
            if tmp:
                tmp_stage = json.loads(tmp)
                sensitive_score += v * sensitive_score_dict[str(tmp_stage[0])]
        results[uid]['sensitive'] = sensitive_score
        #print 'sensitive_dict:', results[uid]['sensitive_dict']
        #print 'sensitive_string:', results[uid]['sensitive_string']
        #print 'sensitive:', results[uid]['sensitive']
        #geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        #print 'geo_dict_keys:', geo_dict_keys
        results[uid]['activity_geo'] = '&'.join(['&'.join(item.split('\t')) for item in geo_dict_keys])
        #print 'activity_geo:',  results[uid]['activity_geo']

        keywords_dict = iter_results[uid]['keywords']
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x:x[1], reverse=True)[:50]
        keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
        
    return results
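
# The per-word scoring loop recurs in nearly every snippet above. A hedged
# refactoring sketch that factors it out, assuming (as most versions here do)
# that r_sensitive stores a JSON list whose first element is the word's stage:
def compute_sensitive_score(sensitive_word_dict):
    # sensitive_word_dict: {word: occurrence count}
    score = 0
    for word, count in sensitive_word_dict.iteritems():
        tmp = r_sensitive.hget('sensitive_words', word)
        if tmp:
            stage = json.loads(tmp)[0]
            score += sensitive_score_dict[str(stage)] * count
    return score
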
Ejemplo n.º 21
0
def main():
    if RUN_TYPE:
        now_ts = time.time() - DAY  # the previous day
        ts = str(datetime2ts(ts2datetime(now_ts)))
    else:
        ts = str(datetime2ts('2013-09-02'))
    now_ts = int(ts)
    print now_ts
    sensitive_string = "sensitive_" + ts
    date_string = ts
    update_sensitive_key = "sensitive_score_" + ts  # key to update
    sensitive_dict_key = "sensitive_dict_" + ts
    sensitive_string_key = "sensitive_string_" + ts
    sensitive_day_change_key = "sensitive_" + ts + "_day_change"
    del_month = datetime2ts(ts2datetime(now_ts - MONTH))
    del_sensitive_key = "sensitive_score_" + str(del_month)  # key to delete (older than one month)

    former_ts = int(ts) - DAY
    former_date = str(datetime2ts(ts2datetime(former_ts)))
    former_sensitive_key = "sensitive_score_" + former_date

    iter_count = 0
    bulk_action = []

    mappings(ES_SENSITIVE_INDEX)
    total_number = r_cluster.hlen(sensitive_string)
    scan_cursor = 0
    print total_number

    while 1:
        re_scan = r_cluster.hscan(sensitive_string, scan_cursor, count=1000)
        scan_cursor = re_scan[0]
        if len(re_scan[1]) != 0:
            sensitive_info = re_scan[1]  # dict of uid -> sensitive_words_dict (JSON string)
            uid_list = sensitive_info.keys()
            sensitive_results = es.mget(index=ES_SENSITIVE_INDEX,
                                        doc_type=DOCTYPE_SENSITIVE_INDEX,
                                        body={"ids": uid_list})['docs']
            if sensitive_results:
                for item in sensitive_results:
                    uid = item['_id']
                    sensitive_words_dict = json.loads(
                        sensitive_info[uid])  # json.loads
                    current_sensitive_score = 0
                    for k, v in sensitive_words_dict.iteritems():
                        tmp_stage = r_sensitive.hget("sensitive_words", k)
                        if tmp_stage:
                            tmp_stage = json.loads(tmp_stage)
                            current_sensitive_score += v * sensitive_score_dict[
                                str(tmp_stage[0])]
                    if item['found']:  # an existing record for this uid
                        revise_item = item["_source"]
                        if del_sensitive_key in revise_item:
                            revise_item.pop(del_sensitive_key)
                        revise_item['uid'] = uid
                        # newly updated sensitivity score
                        revise_item[
                            update_sensitive_key] = current_sensitive_score
                        revise_item['last_value'] = current_sensitive_score
                        # newly updated sensitive words
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        # newly updated sensitive-word string
                        revise_item[sensitive_string_key] = "&".join(
                            sensitive_words_dict.keys())
                        # deltas against the previous day and the weekly/monthly averages
                        revise_item[
                            'sensitive_day_change'] = current_sensitive_score - revise_item.get(
                                former_sensitive_key, 0)
                        revise_item[
                            'sensitive_week_change'] = current_sensitive_score - revise_item.get(
                                'sensitive_week_ave', 0)
                        revise_item[
                            'sensitive_month_change'] = current_sensitive_score - revise_item.get(
                                'sensitive_month_ave', 0)
                        # recompute weekly/monthly mean and variance after the update
                        revise_item['sensitive_week_ave'], revise_item[
                            'sensitive_week_var'], revise_item[
                                'sensitive_week_sum'] = compute_week(
                                    revise_item, now_ts)
                        revise_item['sensitive_month_ave'], revise_item[
                            'sensitive_month_var'], revise_item[
                                'sensitive_month_sum'] = compute_month(
                                    revise_item, now_ts)

                    else:
                        revise_item = dict()
                        revise_item['uid'] = uid
                        revise_item[
                            update_sensitive_key] = current_sensitive_score
                        revise_item['last_value'] = current_sensitive_score
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        revise_item[sensitive_string_key] = "&".join(
                            sensitive_words_dict.keys())
                        revise_item[
                            'sensitive_day_change'] = current_sensitive_score
                        revise_item[
                            'sensitive_week_change'] = current_sensitive_score
                        revise_item[
                            'sensitive_month_change'] = current_sensitive_score
                        revise_item['sensitive_week_ave'], revise_item[
                            'sensitive_week_var'], revise_item[
                                'sensitive_week_sum'] = compute_week(
                                    revise_item, now_ts)
                        revise_item['sensitive_month_ave'], revise_item[
                            'sensitive_month_var'], revise_item[
                                'sensitive_month_sum'] = compute_month(
                                    revise_item, now_ts)
                    action = {'index': {'_id': uid}}
                    bulk_action.extend([action, revise_item])
                    iter_count += 1
                    if iter_count % 1000 == 0:
                        es.bulk(bulk_action,
                                index=ES_SENSITIVE_INDEX,
                                doc_type=DOCTYPE_SENSITIVE_INDEX)
                        bulk_action = []
                        print iter_count
        if int(scan_cursor) == 0:
            break
    if bulk_action:
        es.bulk(bulk_action,
                index=ES_SENSITIVE_INDEX,
                doc_type=DOCTYPE_SENSITIVE_INDEX)

    print iter_count

    ####### update users not covered by the scan above
    update_scan = scan(es,
                       query={
                           "query": {
                               "filtered": {
                                   "filter": {
                                       "missing": {
                                           "field": update_sensitive_key
                                       }
                                   }
                               }
                           }
                       },
                       index=ES_SENSITIVE_INDEX,
                       doc_type=DOCTYPE_SENSITIVE_INDEX)
    iter_count = 0
    bulk_action = []

    while 1:
        try:
            tmp = update_scan.next()
            revise_item = tmp['_source']
            if del_sensitive_key in revise_item:
                revise_item.pop(del_sensitive_key)
            uid = tmp['_id']
            # newly updated sensitivity score
            revise_item[update_sensitive_key] = 0
            revise_item['last_value'] = 0
            # newly updated sensitive words
            revise_item[sensitive_dict_key] = json.dumps({})
            # newly updated sensitive-word string
            revise_item[sensitive_string_key] = ""
            # deltas against the previous day and the weekly/monthly averages
            revise_item['sensitive_day_change'] = 0 - revise_item.get(
                former_sensitive_key, 0)
            revise_item['sensitive_week_change'] = 0 - revise_item.get(
                'sensitive_week_ave', 0)
            revise_item['sensitive_month_change'] = 0 - revise_item.get(
                'sensitive_month_ave', 0)
            # recompute weekly/monthly mean and variance after the update
            revise_item['sensitive_week_ave'], revise_item[
                'sensitive_week_var'], revise_item[
                    'sensitive_week_sum'] = compute_week(revise_item, now_ts)
            revise_item['sensitive_month_ave'], revise_item[
                'sensitive_month_var'], revise_item[
                    'sensitive_month_sum'] = compute_month(
                        revise_item, now_ts)

            action = {'index': {'_id': uid}}
            bulk_action.extend([action, revise_item])
            iter_count += 1
            if iter_count % 1000 == 0:
                es.bulk(bulk_action,
                        index=ES_SENSITIVE_INDEX,
                        doc_type=DOCTYPE_SENSITIVE_INDEX)
                bulk_action = []
        except StopIteration:
            print "all done"
            if bulk_action:
                es.bulk(bulk_action,
                        index=ES_SENSITIVE_INDEX,
                        doc_type=DOCTYPE_SENSITIVE_INDEX)
            break
        except Exception, r:
            print Exception, r
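
# compute_week and compute_month are imported helpers that are not shown in
# this snippet. A plausible sketch of compute_week under the storage layout
# used above (daily scores kept under 'sensitive_score_<ts>' keys); the
# original project's exact formula may differ:
def compute_week(revise_item, now_ts):
    # Mean, variance and sum of the daily sensitivity scores of the past week.
    scores = []
    for day in range(7):
        key = 'sensitive_score_' + str(now_ts - day * DAY)
        scores.append(revise_item.get(key, 0))
    week_sum = sum(scores)
    week_ave = week_sum / 7.0
    week_var = sum((s - week_ave) ** 2 for s in scores) / 7.0
    return week_ave, week_var, week_sum
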
Ejemplo n.º 22
0
def test(ft_type):
    print ft_type
    if ft_type == 'facebook':
        index_name_pre = facebook_flow_text_index_name_pre
        index_type = facebook_flow_text_index_type
        user_index_name = facebook_user_index_name
        user_index_type = facebook_user_index_type
    else:
        index_name_pre = twitter_flow_text_index_name_pre
        index_type = twitter_flow_text_index_type
        user_index_name = twitter_user_index_name
        user_index_type = twitter_user_index_type

    # date_list = load_date_list(True)
    date_list = load_date_list()

    DFA = createWordTree()
    query_body = {
        'post_filter': {
            'missing': {
                'field': 'keywords_string'
            }
        },
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [{
                            'range': {
                                'flag_ch': {
                                    'gte': -1
                                }
                            }
                        }]
                    }
                }
            }
        }
    }
    for date in date_list:
        count = 0
        bulk_action = []
        index_name = index_name_pre + date
        try:
            es_scan_results = scan(es,
                                   query=query_body,
                                   size=1000,
                                   index=index_name,
                                   doc_type=index_type)
            while True:
                try:
                    scan_data = es_scan_results.next()
                    item = scan_data['_source']
                    text = item['text_ch']
                    uid = item['uid']
                    if ft_type == 'facebook':
                        _id = item['fid']
                    else:
                        _id = item['tid']

                    ts = datetime2ts(date)
                    #add sentiment field to weibo

                    sentiment, keywords_list = triple_classifier(item)

                    #add key words to weibo
                    keywords_dict, keywords_string = get_weibo_keywords(
                        keywords_list)

                    #sensitive_words_dict
                    sensitive_words_dict = searchWord(
                        text.encode('utf-8', 'ignore'), DFA)
                    if sensitive_words_dict:
                        sensitive_words_string_data = "&".join(
                            sensitive_words_dict.keys())
                        sensitive_words_dict_data = json.dumps(
                            sensitive_words_dict)
                    else:
                        sensitive_words_string_data = ""
                        sensitive_words_dict_data = json.dumps({})

                    #redis
                    if sensitive_words_dict:
                        sensitive_count_string = r_cluster.hget(
                            'sensitive_' + str(ts), str(uid))
                        if sensitive_count_string:  # counts already stored in redis for this uid/day
                            sensitive_count_dict = json.loads(
                                sensitive_count_string)
                            for word in sensitive_words_dict.keys():
                                if word in sensitive_count_dict:
                                    sensitive_count_dict[
                                        word] += sensitive_words_dict[word]
                                else:
                                    sensitive_count_dict[
                                        word] = sensitive_words_dict[word]
                            r_cluster.hset('sensitive_' + str(ts), str(uid),
                                           json.dumps(sensitive_count_dict))
                        else:
                            r_cluster.hset('sensitive_' + str(ts), str(uid),
                                           json.dumps(sensitive_words_dict))

                    #sensitive
                    sensitive_score = 0
                    if sensitive_words_dict:
                        for k, v in sensitive_words_dict.iteritems():
                            tmp_stage = r_sensitive.hget("sensitive_words", k)
                            if tmp_stage:
                                tmp_stage = json.loads(tmp_stage)
                                sensitive_score += v * sensitive_score_dict[
                                    str(tmp_stage[0])]

                    #directed_uid
                    directed_uid_data = 0
                    directed_uid, directed_uname = get_root_retweet(
                        text, uid, ft_type)
                    if directed_uid:
                        directed_uid_data = long(directed_uid)

                    # hashtag
                    hashtag = ''
                    RE = re.compile(
                        u'#([0-9a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)[ ," =.。: :、]'
                    )
                    hashtag_list = re.findall(RE, text)
                    if hashtag_list:
                        hashtag = '&'.join(hashtag_list)

                    #action
                    action = {'update': {'_id': _id}}

                    # action_data
                    action_data = {
                        'sentiment': str(sentiment),
                        'keywords_dict': json.dumps(keywords_dict),
                        'keywords_string': keywords_string,
                        'sensitive_words_string': sensitive_words_string_data,
                        'sensitive_words_dict': sensitive_words_dict_data,
                        'sensitive': sensitive_score,
                        'directed_uid': directed_uid_data,
                        'directed_uname': directed_uname,
                        'hashtag': hashtag,
                    }

                    bulk_action.extend([action, {'doc': action_data}])
                    count += 1

                    if count % 1000 == 0 and count != 0:
                        if bulk_action:
                            es.bulk(bulk_action,
                                    index=index_name,
                                    doc_type=index_type,
                                    timeout=600)
                        bulk_action = []
                        count = 0
                except StopIteration:
                    break
            if bulk_action:
                es.bulk(bulk_action,
                        index=index_name,
                        doc_type=index_type,
                        timeout=600)
        except Exception, e:  # the ES index may not exist for this date
            print e
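
# createWordTree and searchWord come from an external DFA keyword-matching
# module. A minimal trie-based illustration of the same idea (an assumption
# about the interface, not the project's actual implementation):
def create_word_tree(words):
    # Build a character trie; the '\x00' key marks the end of a word.
    root = {}
    for word in words:
        node = root
        for ch in word:
            node = node.setdefault(ch, {})
        node['\x00'] = word
    return root

def search_word(text, root):
    # Return {word: count} for every trie word occurring in text.
    found = {}
    for i in range(len(text)):
        node = root
        j = i
        while j < len(text) and text[j] in node:
            node = node[text[j]]
            j += 1
            if '\x00' in node:
                word = node['\x00']
                found[word] = found.get(word, 0) + 1
    return found
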
Ejemplo n.º 23
0
def main():
    if RUN_TYPE:
        now_ts = time.time() - DAY  # the previous day
        ts = str(datetime2ts(ts2datetime(now_ts)))
    else:
        ts = str(datetime2ts('2016-03-16'))
    now_ts = int(ts)
    print now_ts
    sensitive_string = "sensitive_" + ts
    date_string = ts
    update_sensitive_key = "sensitive_score_" + ts # key to update
    sensitive_dict_key = "sensitive_dict_" + ts
    sensitive_string_key = "sensitive_string_" + ts
    sensitive_day_change_key = "sensitive_" + ts +"_day_change"
    del_month = datetime2ts(ts2datetime(now_ts - MONTH))
    del_sensitive_key = "sensitive_score_"+str(del_month) # key to delete (older than one month)

    former_ts = int(ts) - DAY
    former_date = str(datetime2ts(ts2datetime(former_ts)))
    former_sensitive_key = "sensitive_score_" + former_date

    iter_count = 0
    bulk_action = []

    mappings(ES_SENSITIVE_INDEX)
    total_number = r.hlen(sensitive_string)
    scan_cursor = 0
    print total_number

    while 1:
        re_scan = r.hscan(sensitive_string, scan_cursor, count=1000)
        scan_cursor = re_scan[0]
        if len(re_scan[1]) != 0:
            sensitive_info = re_scan[1] # dict of uid -> sensitive_words_dict (JSON string)
            uid_list = sensitive_info.keys()
            sensitive_results = es.mget(index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX, body={"ids":uid_list})['docs']
            if sensitive_results:
                for item in sensitive_results:
                    uid = item['_id']
                    sensitive_words_dict = json.loads(sensitive_info[uid]) # json.loads
                    current_sensitive_score = 0
                    for k, v in sensitive_words_dict.iteritems():
                        tmp_stage = r_sensitive.hget("sensitive_words", k)
                        if tmp_stage:
                            tmp_stage = json.loads(tmp_stage)
                            current_sensitive_score += v * sensitive_score_dict[str(tmp_stage[0])]
                    if item['found']: # an existing record for this uid
                        revise_item = item["_source"]
                        if del_sensitive_key in revise_item:
                            revise_item.pop(del_sensitive_key)
                        revise_item['uid'] = uid
                        # newly updated sensitivity score
                        revise_item[update_sensitive_key] = current_sensitive_score
                        # newly updated sensitive words
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        # newly updated sensitive-word string
                        revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys())
                        # deltas against the previous day and the weekly/monthly averages
                        revise_item['sensitive_day_change'] = current_sensitive_score - revise_item.get(former_sensitive_key, 0)
                        revise_item['sensitive_week_change'] = current_sensitive_score - revise_item.get('sensitive_week_ave', 0)
                        revise_item['sensitive_month_change'] = current_sensitive_score - revise_item.get('sensitive_month_ave', 0)
                        # recompute weekly/monthly mean and variance after the update
                        revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
                        revise_item['sensitive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)

                    else:
                        revise_item = dict()
                        revise_item['uid'] = uid
                        revise_item[update_sensitive_key] = current_sensitive_score
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys())
                        revise_item['sensitive_day_change'] = current_sensitive_score
                        revise_item['sensitive_week_change'] = current_sensitive_score
                        revise_item['sensitive_month_change'] = current_sensitive_score
                        revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
                        revise_item['sensitive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)
                    action = {'index':{'_id': uid}}
                    bulk_action.extend([action, revise_item])
                    iter_count += 1
                    if iter_count % 1000 == 0:
                        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
                        bulk_action = []
                        print iter_count
        if int(scan_cursor) == 0:
            break
    if bulk_action:
        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)

    print iter_count
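
# The hscan cursor loop above is easy to get wrong (the batch must be processed
# before the cursor is checked for wrap-around). A small generator expressing
# the same pagination pattern (illustrative; redis_conn is any redis client):
def hscan_batches(redis_conn, hash_key, batch_size=1000):
    # Yield {field: value} batches until the scan cursor wraps back to 0.
    cursor = 0
    while True:
        cursor, data = redis_conn.hscan(hash_key, cursor, count=batch_size)
        if data:
            yield data
        if int(cursor) == 0:
            break

# for batch in hscan_batches(r, sensitive_string):
#     ...  # process uid -> sensitive_words_dict pairs
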
Ejemplo n.º 24
0
def get_flow_information_v2(uid_list, all_user_keywords_dict):
    results = {}
    #results = {uid:{'hashtag_dict':{},'hashtag':'', 'keywords_dict':{}, 'keywords_string':'', 'activity_geo':'', 'activity_geo_dict':dict, 'activity_geo_aggs':''}}
    iter_results = {}  # iter_results = {uid:{'hashtag': hashtag_dict, 'geo':geo_dict, 'keywords':keywords_dict}}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        print ts
        uid_day_geo = {}
        #compute hashtag and geo
        hashtag_results = r_cluster.hmget('hashtag_' + str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        #compute sensitive_words
        sensitive_results = r_cluster.hmget('sensitive_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            #init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {
                    'hashtag': {},
                    'geo': {},
                    'geo_track': [],
                    'keywords': {},
                    'sensitive': {}
                }
            #compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[
                        hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[
                        hashtag]
            #compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][
                        sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][
                        sensitive_word] = uid_sensitive_dict[sensitive_word]
            #compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo = ip2city(ip)
                if geo:
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1

    #get keywords top
    for uid in uid_list:
        results[uid] = {}
        #hashtag
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        #sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        for k, v in sensitive_word_dict.iteritems():
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                tmp_stage = json.loads(tmp_stage)
                sensitive_score += v * sensitive_score_dict[str(tmp_stage[0])]
        results[uid]['sensitive'] = sensitive_score
        #print 'sensitive_dict:', results[uid]['sensitive_dict']
        #print 'sensitive_string:', results[uid]['sensitive_string']
        #print 'sensitive:', results[uid]['sensitive']
        #geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(
            ['&'.join(item.split('\t')) for item in geo_dict_keys])
        try:
            results[uid]['activity_geo_aggs'] = '&'.join(
                [item.split('\t')[-1] for item in geo_dict_keys])
        except:
            results[uid]['activity_geo_aggs'] = ''

        keywords_dict = all_user_keywords_dict[uid]
        keywords_top50 = sorted(keywords_dict.items(),
                                key=lambda x: x[1],
                                reverse=True)[:50]
        keywords_top50_string = '&'.join(
            [keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string

    return results