def update_day_sensitive(uid_list):
    """Compute today's sensitive-word score for every uid in uid_list.

    Reads the per-day hash ``sensitive_<date_ts>`` from redis_cluster, where
    each value is a JSON dict {word: count}.  Returns {uid: {...}}: users with
    a record for today get 'sensitive' / 'sensitive_words_string' /
    'sensitive_words_dict'; users without one keep the zeroed default entry.
    """
    results = {}
    for uid in uid_list:
        # Default entry for users with no sensitive record today.
        results[uid] = {"sensitive": 0, 'sensitive_string': "", 'sensitive_dict': json.dumps({})}
    now_ts = time.time()
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts('2013-09-02')  # fixed date for test runs
    # hmget returns a list aligned with uid_list; missing uids yield None.
    sensitive_results = redis_cluster.hmget("sensitive_" + str(now_date_ts), uid_list)
    # BUGFIX: the original reused the name `count` both as the uid cursor and
    # as the inner per-word count variable, so every non-empty record
    # corrupted the cursor and scores were attached to the wrong uids.
    # enumerate() keeps alignment safe; the `print type(item)` debug is gone.
    for idx, item in enumerate(sensitive_results):
        if not item:
            continue
        uid = uid_list[idx]
        word_counts = json.loads(item)
        sensitive_index = 0
        for word, word_count in word_counts.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", word)
            if tmp_stage:
                tmp = json.loads(tmp_stage)
                # tmp[0] is the word's sensitivity stage; map it to a weight.
                sensitive_index += sensitive_score_dict[str(tmp[0])] * word_count
        sensitive_words_string = "&".join(word_counts.keys())
        results[uid] = {'sensitive': sensitive_index,
                        "sensitive_words_string": sensitive_words_string,
                        "sensitive_words_dict": word_counts}
    return results
def get_sensitive_user(timestamp, uid):
    """Sum sensitive-word scores over up to 50 of the user's posts on the
    day containing `timestamp`.

    Each post's text is scanned with the DFA word tree; every matched word
    contributes occurrence_count * stage_weight to the returned score.
    """
    score = 0
    query_body = {'query': {'term': {'uid': uid}}, 'size': 50}
    index_name = flow_text_index_name_pre + ts2datetime(timestamp)
    search_results = es_flow_text.search(index=index_name,
                                         doc_type=flow_text_index_type,
                                         body=query_body)['hits']['hits']
    # PERF: build the DFA word tree once — it is invariant across posts
    # (the original rebuilt it inside the loop for every hit).
    node = createWordTree()
    for result in search_results:
        text = result['_source']['text'].encode('utf-8')
        sensitive_words_dict = searchWord(text, node)  # {word: occurrences}
        if sensitive_words_dict:
            for k, v in sensitive_words_dict.iteritems():
                tmp_stage = r_sensitive.hget("sensitive_words", k)
                if tmp_stage:
                    # NOTE(review): sibling variants json.loads(tmp_stage)
                    # and use tmp[0]; here the raw hash value keys the weight
                    # table — confirm which format r_sensitive stores.
                    score += v * sensitive_score_dict[str(tmp_stage)]
    return score
def update_day_sensitive(uid_list):
    """Compute today's sensitive-word score for every uid in uid_list.

    Reads the per-day hash ``sensitive_<date_ts>`` from redis_cluster
    ({word: count} JSON per uid) and returns {uid: {...}}; users without a
    record keep the zeroed default entry.
    """
    results = {}
    for uid in uid_list:
        # Default entry for users with no sensitive record today.
        results[uid] = {"sensitive": 0, "sensitive_string": "", "sensitive_dict": json.dumps({})}
    now_ts = time.time()
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts("2013-09-03")  # fixed date for test runs
    sensitive_results = redis_cluster.hmget("sensitive_" + str(now_date_ts), uid_list)
    # BUGFIX: hmget returns a *list* of JSON strings (or None) aligned with
    # uid_list — the original iterated each element with .iteritems() as if
    # it were a {uid: words} mapping, called the non-existent
    # dict.iter_items(), and joined the keys of a json.dumps *string*.
    # Rewritten to index by position, matching the sibling implementation.
    for idx, item in enumerate(sensitive_results):
        if not item:
            continue
        uid = uid_list[idx]
        words_dict = json.loads(item)
        sensitive_index = 0
        for word, word_count in words_dict.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", word)
            if tmp_stage:
                tmp = json.loads(tmp_stage)
                # str(tmp[0]) for consistency with the other variants of this
                # function (the broken original indexed with the raw tmp[0]).
                sensitive_index += sensitive_score_dict[str(tmp[0])] * word_count
        sensitive_words_string = "&".join(words_dict.keys())
        results[uid] = {
            "sensitive": sensitive_index,
            "sensitive_words_string": sensitive_words_string,
            "sensitive_words_dict": words_dict,
        }
    return results
def sensitive_process(text, timestamp):
    """Accumulate per-user and per-message sensitive-word scores.

    NOTE(review): this function cannot run as written — it references names
    that are never defined in its scope (`uid_list`, `results`, `item`,
    `index_body`, `DFA`), never increments the `count` cursor into the hmget
    result, and reads iter_results[uid]['sensitive'] without ever creating
    iter_results[uid] (KeyError in both the try and the except branch).
    It appears to be a partial merge of a per-user weekly aggregator and a
    per-message scorer; reconstruct against the original modules before use.
    """
    ## person-level sensitivity (orig: 人物敏感度)
    iter_results = {}  # iter_results = {uid:{}}
    now_ts = time.time()
    #run_type
    today_sensitive_results = {}
    if S_TYPE != 'test':
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts(S_DATE)
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        # NOTE(review): uid_list is undefined in this scope.
        sensitive_results = r_cluster_3.hmget('sensitive_' + str(ts), uid_list)
        count = 0  # NOTE(review): never incremented — always reads index 0
        for uid in uid_list:
            if uid not in today_sensitive_results:
                today_sensitive_results[uid] = {}
            #compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                # NOTE(review): iter_results[uid] is never initialised, so
                # both branches below raise KeyError.
                try:
                    iter_results[uid]['sensitive'][sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][sensitive_word] = uid_sensitive_dict[sensitive_word]
                if ts == now_date_ts - DAY:
                    try:
                        today_sensitive_results[uid][sensitive_word] += uid_sensitive_dict[sensitive_word]
                    except:
                        today_sensitive_results[uid][sensitive_word] = uid_sensitive_dict[sensitive_word]
    for uid in uid_list:
        # NOTE(review): `results` is undefined in this scope.
        results[uid] = {}
    ## message-level sensitivity (orig: 信息敏感度)
    # NOTE(review): `DFA` and `item` are undefined in this scope.
    sensitive_words_dict = searchWord(text.encode('utf-8', 'ignore'), DFA)
    if sensitive_words_dict:
        item['sensitive_words_string'] = "&".join(sensitive_words_dict.keys())
        item['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
    else:
        item['sensitive_words_string'] = ""
        item['sensitive_words_dict'] = json.dumps({})
    sensitive_words_dict = json.loads(item['sensitive_words_dict'])
    if sensitive_words_dict:
        score = 0
        for k, v in sensitive_words_dict.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", k)
            if tmp_stage:
                score += v * sensitive_score_dict[str(tmp_stage)]
        # NOTE(review): `index_body` is undefined in this scope.
        index_body['sensitive'] = score
def expand_index_action(item):
    """Build an Elasticsearch bulk (action, document) pair for one fid-keyed
    flow-text item.

    Copies the item's fields into the document, scores its pre-extracted
    sensitive words, and resolves the root retweet target.
    Returns (action, xdata): action carries the doc id (fid), xdata the body.
    """
    index_body = {}
    index_body['uid'] = str(item['uid'])
    index_body['text'] = item['text']
    index_body['fid'] = str(item['fid'])
    index_body['sentiment'] = str(item['sentiment'])
    index_body['timestamp'] = int(item['timestamp'])
    index_body['keywords_dict'] = item['keywords_dict']
    index_body['keywords_string'] = item['keywords_string']
    index_body['sensitive_words_string'] = item['sensitive_words_string']
    index_body['sensitive_words_dict'] = item['sensitive_words_dict']
    # Message score: sum of occurrences * stage weight over its words.
    sensitive_words_dict = json.loads(item['sensitive_words_dict'])
    score = 0
    if sensitive_words_dict:
        for k, v in sensitive_words_dict.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", k)
            if tmp_stage:
                score += v * sensitive_score_dict[str(tmp_stage)]
    index_body['sensitive'] = score
    # Root retweet target, if the text names one.
    directed_uid, directed_uname = get_root_retweet(item['text'], item['uid'])
    if directed_uid:
        index_body['directed_uid'] = long(directed_uid)
    else:
        index_body['directed_uid'] = 0
    index_body['directed_uname'] = directed_uname
    # BUGFIX: the original commented out `ip = item['send_ip']` but still
    # called ip2city(ip), raising NameError on every call.  Geo is now
    # derived from send_ip when present.  NOTE(review): the 'ip' field itself
    # stays un-indexed (it was deliberately commented out) — confirm intent.
    ip = item.get('send_ip', '')
    index_body['geo'] = ip2city(ip)  #output: 中国&河北&石家庄
    action = {'index': {'_id': index_body['fid']}}
    xdata = index_body
    return action, xdata
def get_sensitive_user(uid):
    """Return the sensitivity weight recorded for `uid` in the
    sensitive_words hash, or 0 when absent or on lookup failure.

    BUGFIX: the original always returned 0 — `sensitive_score` was read
    before assignment and `v` did not exist at all, so the bare except fired
    on every call.  NOTE(review): elsewhere in this file the
    'sensitive_words' hash is keyed by *word*, here by uid — confirm the
    intended semantics with the caller.
    """
    sensitive_score = 0
    try:
        tmp_stage = r_sensitive.hget('sensitive_words', uid)
        if tmp_stage:
            sensitive_score = sensitive_score_dict[str(tmp_stage)]
    except Exception:
        # best-effort: any redis/lookup failure yields a zero score
        sensitive_score = 0
    return sensitive_score
def expand_index_action(item):
    """Build an Elasticsearch bulk (action, document) pair for one mid-keyed
    weibo item.

    Copies the item's fields into the document, scores its pre-extracted
    sensitive words, and resolves the directed retweet/comment target by
    message_type (3 = retweet, 2 = comment).  Returns (action, xdata):
    action carries the doc id (mid), xdata the document body.
    """
    index_body = {}
    index_body['uid'] = str(item['uid'])
    index_body['user_fansnum'] = int(item.get('user_fansnum', 0))
    index_body['text'] = item['text']
    index_body['mid'] = str(item['mid'])
    index_body['sentiment'] = str(item['sentiment'])
    index_body['timestamp'] = int(item['timestamp'])
    index_body['message_type'] = item['message_type']
    index_body['keywords_dict'] = item['keywords_dict']
    index_body['keywords_string'] = item['keywords_string']
    index_body['sensitive_words_string'] = item['sensitive_words_string']
    index_body['sensitive_words_dict'] = item['sensitive_words_dict']
    index_body['retweeted'] = 0
    index_body['comment'] = 0
    index_body['sensitive'] = 0
    # Message score: sum of occurrences * stage weight over its words.
    sensitive_words_dict = json.loads(item['sensitive_words_dict'])
    if sensitive_words_dict:
        score = 0
        for k, v in sensitive_words_dict.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", k)
            if tmp_stage:
                score += v * sensitive_score_dict[str(tmp_stage)]
        index_body['sensitive'] = score
    if item['message_type'] == 3:
        # retweet message: get directed retweet uname and uid
        directed_uid, directed_uname = get_directed_retweet(
            item['text'], item['root_uid'])
        if directed_uid:
            index_body['directed_uid'] = int(directed_uid)
        else:
            index_body['directed_uid'] = 0
        index_body['directed_uname'] = directed_uname
        index_body['root_mid'] = str(item['root_mid'])
        index_body['root_uid'] = str(item['root_uid'])
    elif item['message_type'] == 2:
        # comment message: get directed comment uname and uid
        directed_uid, directed_uname = get_directed_comment(
            item['text'], item['root_uid'])
        if directed_uid:
            index_body['directed_uid'] = int(directed_uid)
        else:
            index_body['directed_uid'] = 0
        index_body['directed_uname'] = directed_uname
        index_body['root_mid'] = str(item['root_mid'])
        index_body['root_uid'] = str(item['root_uid'])
    ip = item['send_ip']
    index_body['ip'] = ip
    index_body['geo'] = ip2city(ip)  #output: 中国&河北&石家庄
    action = {'index': {'_id': index_body['mid']}}
    xdata = index_body
    return action, xdata
def compute_sensitive(text):
    """Score the sensitive words found in `text`.

    Builds the DFA word tree, scans the utf-8 encoded text, and sums
    occurrence_count * stage_weight over every matched word whose stage is
    recorded in the sensitive_words hash.
    """
    tree = createWordTree()
    hits = searchWord(text.encode('utf-8'), tree)
    if not hits:
        return 0
    total = 0
    for word, occurrences in hits.iteritems():
        stage = r_sensitive.hget("sensitive_words", word)
        if stage:
            total += occurrences * sensitive_score_dict[str(stage)]
    return total
def update_day_sensitive(uid_list):
    """Aggregate each user's sensitive words over the past WEEK days.

    For every day in the window, reads the per-day redis hash
    ``sensitive_<ts>`` ({word: count} JSON per uid).  A word contributes 1
    per day it appears (day-presence, not the stored count).  The numeric
    score is computed from yesterday's words only.  Returns
    {uid: {'sensitive_string', 'sensitive_dict', 'sensitive'}}.
    """
    results = {}        # {uid: {word: number of days the word appeared}}
    all_results = {}    # final payload per uid
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    today_sensitive_dict = {}  # {uid: {word: presence count for yesterday}}
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        count = 0  # cursor into the hmget result, aligned with uid_list
        sensitive_results = r_cluster_3.hmget('sensitive_' + str(ts), uid_list)
        #print 'sensitive_results:', sensitive_results
        for uid in uid_list:
            if uid not in results:
                results[uid] = {}
            sensitive_item = sensitive_results[count]
            if uid not in today_sensitive_dict:
                today_sensitive_dict[uid] = {}
            if sensitive_item:
                sensitive_dict = json.loads(sensitive_item)
            else:
                sensitive_dict = {}
            for sensitive in sensitive_dict:
                # adds 1 per day of appearance, not the stored word count
                try:
                    results[uid][sensitive] += 1
                except:
                    results[uid][sensitive] = 1
                if ts == now_date_ts - DAY:
                    try:
                        today_sensitive_dict[uid][sensitive] += 1
                    except:
                        today_sensitive_dict[uid][sensitive] = 1
            count += 1
    #print 'results:', results
    for uid in uid_list:
        user_sensitive_dict = results[uid]
        #print 'uid,sensitive:', uid, user_sensitive_dict
        sensitive_score = 0
        today_sensitive_dict_user = today_sensitive_dict[uid]
        for item in today_sensitive_dict_user:
            k = item
            v = today_sensitive_dict_user[k]
            # NOTE(review): sibling variants json.loads the stage value and
            # use tmp[0]; here the raw hash value keys the weight table —
            # confirm the storage format of r_sensitive.
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        #print 'sensitive_score:', sensitive_score
        sensitive_string = '&'.join(user_sensitive_dict.keys())
        #print 'uid, sensitive:', uid, sensitive_string, sensitive_score
        all_results[uid] = {'sensitive_string': sensitive_string,
                            'sensitive_dict': json.dumps(user_sensitive_dict),
                            'sensitive': sensitive_score}
    #print 'all_results:', all_results
    return all_results
def update_day_sensitive(uid_list):
    """Aggregate each user's sensitive words over the past WEEK days.

    A word contributes 1 per day it appears in the per-day redis hash
    ``sensitive_<ts>``; the numeric score comes from yesterday's words only.
    Returns {uid: {'sensitive_string', 'sensitive_dict', 'sensitive'}}.
    """
    week_words = {}   # {uid: {word: number of days the word appeared}}
    today_words = {}  # {uid: {word: presence count for yesterday}}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        base_ts = datetime2ts(ts2datetime(now_ts))
    else:
        base_ts = test_ts
    for offset in range(WEEK, 0, -1):
        day_ts = base_ts - DAY * offset
        day_values = r_cluster_3.hmget('sensitive_' + str(day_ts), uid_list)
        for pos, uid in enumerate(uid_list):
            week_words.setdefault(uid, {})
            today_words.setdefault(uid, {})
            raw = day_values[pos]
            day_dict = json.loads(raw) if raw else {}
            for word in day_dict:
                # day-presence count, not the stored word count
                week_words[uid][word] = week_words[uid].get(word, 0) + 1
                if day_ts == base_ts - DAY:
                    today_words[uid][word] = today_words[uid].get(word, 0) + 1
    all_results = {}
    for uid in uid_list:
        user_words = week_words[uid]
        score = 0
        for word, times in today_words[uid].items():
            stage = r_sensitive.hget('sensitive_words', word)
            if stage:
                score += times * sensitive_score_dict[str(stage)]
        all_results[uid] = {'sensitive_string': '&'.join(user_words.keys()),
                            'sensitive_dict': json.dumps(user_words),
                            'sensitive': score}
    return all_results
def expand_index_action(item):
    """Build an Elasticsearch bulk (action, document) pair for one mid-keyed
    weibo item.

    Copies the item's fields into the document, scores its pre-extracted
    sensitive words, and resolves the directed retweet/comment target by
    message_type (3 = retweet, 2 = comment).  Returns (action, xdata):
    action carries the doc id (mid), xdata the document body.
    """
    index_body = {}
    index_body['uid'] = str(item['uid'])
    index_body['user_fansnum'] = int(item.get('user_fansnum', 0))
    index_body['text'] = item['text']
    index_body['mid'] = str(item['mid'])
    index_body['sentiment'] = str(item['sentiment'])
    index_body['timestamp'] = int(item['timestamp'])
    index_body['message_type'] = item['message_type']
    index_body['keywords_dict'] = item['keywords_dict']
    index_body['keywords_string'] = item['keywords_string']
    index_body['sensitive_words_string'] = item['sensitive_words_string']
    index_body['sensitive_words_dict'] = item['sensitive_words_dict']
    index_body['retweeted'] = 0
    index_body['comment'] = 0
    index_body['sensitive'] = 0
    # Message score: sum of occurrences * stage weight over its words.
    sensitive_words_dict = json.loads(item['sensitive_words_dict'])
    if sensitive_words_dict:
        score = 0
        for k, v in sensitive_words_dict.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", k)
            if tmp_stage:
                score += v * sensitive_score_dict[str(tmp_stage)]
        index_body['sensitive'] = score
    if item['message_type'] == 3:
        # retweet message: get directed retweet uname and uid
        directed_uid, directed_uname = get_directed_retweet(item['text'], item['root_uid'])
        if directed_uid:
            index_body['directed_uid'] = int(directed_uid)
        else:
            index_body['directed_uid'] = 0
        index_body['directed_uname'] = directed_uname
        index_body['root_mid'] = str(item['root_mid'])
        index_body['root_uid'] = str(item['root_uid'])
    elif item['message_type'] == 2:
        # comment message: get directed comment uname and uid
        directed_uid, directed_uname = get_directed_comment(item['text'], item['root_uid'])
        if directed_uid:
            index_body['directed_uid'] = int(directed_uid)
        else:
            index_body['directed_uid'] = 0
        index_body['directed_uname'] = directed_uname
        index_body['root_mid'] = str(item['root_mid'])
        index_body['root_uid'] = str(item['root_uid'])
    ip = item['send_ip']
    index_body['ip'] = ip
    index_body['geo'] = ip2city(ip)  #output: 中国&河北&石家庄
    action = {'index': {'_id': index_body['mid']}}
    xdata = index_body
    return action, xdata
def update_day_sensitive(uid_list):
    """Compute today's sensitive-word score for every uid in uid_list.

    Reads the per-day hash ``sensitive_<date_ts>`` from redis_cluster
    ({word: count} JSON per uid) and returns {uid: {...}}; users without a
    record keep the zeroed default entry.
    """
    results = {}
    for uid in uid_list:
        # Default entry for users with no sensitive record today.
        results[uid] = {
            "sensitive": 0,
            'sensitive_string': "",
            'sensitive_dict': json.dumps({})
        }
    now_ts = time.time()
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts('2013-09-03')  # fixed date for test runs
    sensitive_results = redis_cluster.hmget("sensitive_" + str(now_date_ts), uid_list)
    # BUGFIX: hmget returns a *list* of JSON strings (or None) aligned with
    # uid_list — the original iterated each element with .iteritems() as if
    # it were a {uid: words} mapping, called the non-existent
    # dict.iter_items(), and joined the keys of a json.dumps *string*.
    # Rewritten to index by position, matching the sibling implementation.
    for idx, item in enumerate(sensitive_results):
        if not item:
            continue
        uid = uid_list[idx]
        words_dict = json.loads(item)
        sensitive_index = 0
        for word, word_count in words_dict.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", word)
            if tmp_stage:
                tmp = json.loads(tmp_stage)
                # str(tmp[0]) for consistency with the other variants of this
                # function (the broken original indexed with the raw tmp[0]).
                sensitive_index += sensitive_score_dict[str(tmp[0])] * word_count
        sensitive_words_string = "&".join(words_dict.keys())
        results[uid] = {
            'sensitive': sensitive_index,
            "sensitive_words_string": sensitive_words_string,
            "sensitive_words_dict": words_dict
        }
    return results
def get_flow_information(uid_list):
    """Aggregate 7 days of user activity into one flat record per uid.

    Sources: per-day redis hashes (hashtag_<ts>, new_ip_<ts>, sensitive_<ts>)
    and each day's flow-text ES index for keywords.  Returns
    {uid: {'hashtag_dict', 'hashtag', 'sensitive_dict', 'sensitive_string',
           'sensitive', 'activity_geo_dict', 'activity_geo', 'keywords',
           'keywords_string', 'filter_keywords', 'filter_keywords_string'}}.
    """
    results = {}
    # iter_results = {uid: {'hashtag':…, 'geo':…, 'geo_track':[…],
    #                       'keywords':…, 'sensitive':…, 'filter_keywords':…}}
    iter_results = {}
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    #test
    now_date_ts = test_ts
    for i in range(7, 0, -1):
        ts = now_date_ts - DAY * i
        iter_date = ts2datetime(ts)
        flow_text_index_name = flow_text_index_name_pre + iter_date
        uid_day_geo = {}
        # per-day hashes; each value is a JSON dict, aligned with uid_list
        hashtag_results = r_cluster.hmget('hashtag_' + str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        sensitive_results = r_cluster.hmget('sensitive_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            if uid not in iter_results:
                # BUGFIX: 'filter_keywords' was never initialised, so the
                # filtered-keyword accumulation below raised KeyError in both
                # its try and except branches.
                iter_results[uid] = {'hashtag': {}, 'geo': {}, 'geo_track': [],
                                     'keywords': {}, 'sensitive': {},
                                     'filter_keywords': {}}
            #compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[hashtag]
            #compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][sensitive_word] = uid_sensitive_dict[sensitive_word]
            #compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                # value is a '&'-joined list of timestamps for that ip
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo = ip2city(ip)
                if geo:
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1
        #compute keywords from the day's flow-text index
        try:
            text_results = es_flow_text.search(index=flow_text_index_name,
                                               doc_type=flow_text_index_type,
                                               body={'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}},
                                                     'size': MAX_VALUE},
                                               _source=True,
                                               fields=['uid', 'keywords_dict', 'text'])['hits']['hits']
        except:
            text_results = {}
        for item in text_results:
            uid = item['fields']['uid'][0]
            uid_keywords_dict = json.loads(item['fields']['keywords_dict'][0])
            for keywords in uid_keywords_dict:
                try:
                    iter_results[uid]['keywords'][keywords] += uid_keywords_dict[keywords]
                except:
                    iter_results[uid]['keywords'][keywords] = uid_keywords_dict[keywords]
            #jln filter keyword 2016/11/08
            weibo_text = json.loads(item['fields']['text'][0])
            filter_keywords_dict = get_weibo_single(weibo_text)
            # BUGFIX: the filtered counts were read from uid_keywords_dict
            # (wrong dict — KeyError for keywords absent there); use the
            # filtered dict's own counts.
            for keywords in filter_keywords_dict:
                try:
                    iter_results[uid]['filter_keywords'][keywords] += filter_keywords_dict[keywords]
                except:
                    iter_results[uid]['filter_keywords'][keywords] = filter_keywords_dict[keywords]
    #get keywords top
    for uid in uid_list:
        results[uid] = {}
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        #sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        for word, word_count in sensitive_word_dict.iteritems():
            # NOTE(review): sibling variants json.loads the stage value and
            # use tmp[0]; here the raw hash value keys the weight table —
            # confirm the storage format of r_sensitive.
            tmp_stage = r_sensitive.hget('sensitive_words', word)
            if tmp_stage:
                sensitive_score += word_count * sensitive_score_dict[str(tmp_stage)]
        results[uid]['sensitive'] = sensitive_score
        #geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(
            ['&'.join(item.split('\t')) for item in geo_dict_keys])
        #keywords (top 50 by weight)
        keywords_dict = iter_results[uid]['keywords']
        keywords_top50 = sorted(keywords_dict.items(),
                                key=lambda x: x[1], reverse=True)[:50]
        keywords_top50_string = '&'.join(
            [keyword_item[0] for keyword_item in keywords_top50])
        filter_keywords_dict = iter_results[uid]['filter_keywords']
        f_keywords_top50 = sorted(filter_keywords_dict.items(),
                                  key=lambda x: x[1], reverse=True)[:50]
        # BUGFIX: the original joined `filter_keywords_dict[0]` (KeyError);
        # join the top-50 keyword names instead, mirroring keywords_top50.
        f_keywords_top50_string = '&'.join(
            [keyword_item[0] for keyword_item in f_keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
        results[uid]['filter_keywords'] = json.dumps(f_keywords_top50)
        results[uid]['filter_keywords_string'] = f_keywords_top50_string
    return results
def main(): if RUN_TYPE: now_ts = time.time()-DAY # 前一天 ts = str(datetime2ts(ts2datetime(now_ts))) else: ts = str(datetime2ts('2013-09-02')) now_ts = int(ts) print now_ts sensitive_string = "sensitive_" + ts date_string = ts update_sensitive_key = "sensitive_score_" + ts # 更新的键 sensitive_dict_key = "sensitive_dict_" + ts sensitive_string_key = "sensitive_string_" + ts sensitive_day_change_key = "sensitive_" + ts +"_day_change" del_month = datetime2ts(ts2datetime(now_ts - MONTH)) del_sensitive_key = "sensitive_score_"+str(del_month) # 要删除的键 former_ts = int(ts) - DAY former_date = str(datetime2ts(ts2datetime(former_ts))) former_sensitive_key = "sensitive_score_" + former_date iter_count = 0 bulk_action = [] mappings(ES_SENSITIVE_INDEX) total_number = r_cluster.hlen(sensitive_string) scan_cursor = 0 print total_number while 1: re_scan = r_cluster.hscan(sensitive_string, scan_cursor, count=1000) scan_cursor = re_scan[0] if len(re_scan[1]) != 0: sensitive_info = re_scan[1] # 字典形式,uid:sensitive_words_dict uid_list = sensitive_info.keys() sensitive_results = es.mget(index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX, body={"ids":uid_list})['docs'] if sensitive_results: for item in sensitive_results: uid = item['_id'] sensitive_words_dict = json.loads(sensitive_info[uid]) # json.loads current_sensitive_score = 0 for k,v in sensitive_words_dict.iteritems(): tmp_stage = r_sensitive.hget("sensitive_words", k) if tmp_stage: tmp_stage = json.loads(tmp_stage) current_sensitive_score += v*sensitive_score_dict[str(tmp_stage[0])] if item['found']: # 之前存在相关信息 revise_item = item["_source"] if del_sensitive_key in revise_item: item.pop(del_sensitive_key) revise_item['uid'] = uid # 新更新的敏感度 revise_item[update_sensitive_key] = current_sensitive_score revise_item['last_value'] = current_sensitive_score # 新更新的敏感词 revise_item[sensitive_dict_key] = sensitive_info[uid] # 新更新的string revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys()) # 当天和之前一天、一周和一月均值的差异 
revise_item['sensitive_day_change'] = current_sensitive_score - revise_item.get(former_sensitive_key, 0) revise_item['sensitive_week_change'] = current_sensitive_score - revise_item.get('sensitive_week_ave', 0) revise_item['sensitive_month_change'] = current_sensitive_score - revise_item.get('sensitive_month_ave', 0) # 更新后week、month的均值和方差 revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts) revise_item['senstiive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts) else: revise_item = dict() revise_item['uid'] = uid revise_item[update_sensitive_key] = current_sensitive_score revise_item['last_value'] = current_sensitive_score revise_item[sensitive_dict_key] = sensitive_info[uid] revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys()) revise_item['sensitive_day_change'] = current_sensitive_score revise_item['sensitive_week_change'] = current_sensitive_score revise_item['sensitive_month_change'] = current_sensitive_score revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts) revise_item['senstiive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts) action = {'index':{'_id': uid}} bulk_action.extend([action, revise_item]) iter_count += 1 if iter_count % 1000 == 0: es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX) bulk_action = [] print iter_count if int(scan_cursor) == 0: break if bulk_action: es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX) print iter_count #######更新尚未完成的用户 update_scan = scan(es, query={"query":{"filtered":{"filter":{"missing":{"field":update_sensitive_key}}}}}, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX) iter_count = 0 bulk_action = [] while 1: try: 
tmp = update_scan.next() revise_item = tmp['_source'] if del_sensitive_key in revise_item: revise_item.pop(del_sensitive_key) uid = tmp['_id'] # 新更新的敏感度 revise_item[update_sensitive_key] = 0 revise_item['last_value'] = 0 # 新更新的敏感词 revise_item[sensitive_dict_key] = json.dumps({}) # 新更新的string revise_item[sensitive_string_key] = "" # 当天和之前一天、一周和一月均值的差异 revise_item['sensitive_day_change'] = 0 - revise_item.get(former_sensitive_key, 0) revise_item['sensitive_week_change'] = 0 - revise_item.get('sensitive_week_ave', 0) revise_item['sensitive_month_change'] = 0 - revise_item.get('sensitive_month_ave', 0) # 更新后week、month的均值和方差 revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts) revise_item['senstiive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts) action = {'index':{'_id': uid}} bulk_action.extend([action, revise_item]) iter_count += 1 if iter_count % 1000 == 0: es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX) bulk_action = [] except StopIteration: print "all done" if bulk_action: es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX) break except Exception, r: print Exception, r
# NOTE(review): the statements below are the tail of a day-scoped scoring
# function whose `def` line is missing from this chunk — they reference
# `timestamp`, `query_body` and `score` and end with `return score`.
# Indentation reconstructed; confirm against the full file.
    index_name = flow_text_index_name_pre + ts2datetime(timestamp)
    try:
        search_results = es_xnr.search(index=index_name,
                                       doc_type=flow_text_index_type,
                                       body=query_body)['hits']['hits']
    except Exception, e:
        # best-effort: a missing index yields no hits
        pass
        search_results = []
    for result in search_results:
        text = result['_source']['text'].encode('utf-8')
        # NOTE(review): the word tree is rebuilt for every hit; it looks
        # loop-invariant and could be hoisted out of the loop.
        node = createWordTree()
        sensitive_words_dict = searchWord(text, node)
        if sensitive_words_dict:
            sensitive_words_list = []  # NOTE(review): assigned but never used
            for k, v in sensitive_words_dict.iteritems():
                tmp_stage = r_sensitive.hget("sensitive_words", k)
                if tmp_stage:
                    score += v * sensitive_score_dict[str(tmp_stage)]
    return score


if __name__ == '__main__':
    # '2017-10-15'
    # print get_sensitive_user(timestamp=1507996800, uid='100003271864059')
    # Smoke checks for the sensitive-info scorer.
    print get_sensitive_info(timestamp=1507996800, mid='123124323', text=u"64和达赖太阳花")
    print get_sensitive_info(timestamp=1507996800, mid='123124323')
    print get_sensitive_info(timestamp=1507996800, text=u"64和达赖太阳花")
    print get_sensitive_info(timestamp=1507996800, text=u'达赖')
    print get_sensitive_info(timestamp=1507996800, text=u'军区')
def get_flow_information_v2(uid_list, all_user_keywords_dict): results = {} #results = {uid:{'hashtag_dict':{},'hashtag':'', 'keywords_dict':{}, 'keywords_string':'', 'activity_geo':'', 'activity_geo_dict':dict, 'activity_geo_aggs':''}} iter_results = { } # iter_results = {uid:{'hashtag': hashtag_dict, 'geo':geo_dict, 'keywords':keywords_dict}} now_ts = time.time() #run_type today_sensitive_results = {} if RUN_TYPE == 1: now_date_ts = datetime2ts(ts2datetime(now_ts)) else: now_date_ts = test_ts print 'run_type:', RUN_TYPE for i in range(WEEK, 0, -1): ts = now_date_ts - DAY * i uid_day_geo = {} #compute hashtag and geo hashtag_results = r_cluster_3.hmget('hashtag_' + str(ts), uid_list) ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list) #print 'ip_results:', ip_results #compute sensitive_words sensitive_results = r_cluster_3.hmget('sensitive_' + str(ts), uid_list) count = 0 for uid in uid_list: #init iter_results[uid] if uid not in iter_results: iter_results[uid] = { 'hashtag': {}, 'geo': {}, 'geo_track': [], 'keywords': {}, 'sensitive': {}, 'school': {}, 'week_ip': { 0: {}, 1: {}, 2: {}, 3: {}, 4: {}, 5: {} }, 'ip': {} } if uid not in today_sensitive_results: today_sensitive_results[uid] = {} #compute hashtag hashtag_item = hashtag_results[count] if hashtag_item: uid_hashtag_dict = json.loads(hashtag_item) else: uid_hashtag_dict = {} for hashtag in uid_hashtag_dict: try: iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[ hashtag] except: iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[ hashtag] #compute sensitive sensitive_item = sensitive_results[count] if sensitive_item: uid_sensitive_dict = json.loads(sensitive_item) else: uid_sensitive_dict = {} for sensitive_word in uid_sensitive_dict: try: iter_results[uid]['sensitive'][ sensitive_word] += uid_sensitive_dict[sensitive_word] except: iter_results[uid]['sensitive'][ sensitive_word] = uid_sensitive_dict[sensitive_word] if ts == now_date_ts - DAY: try: today_sensitive_results[uid][ 
sensitive_word] += uid_sensitive_dict[ sensitive_word] except: today_sensitive_results[uid][ sensitive_word] = uid_sensitive_dict[ sensitive_word] #compute geo uid_day_geo[uid] = {} ip_item = ip_results[count] if ip_item: uid_ip_dict = json.loads(ip_item) else: uid_ip_dict = {} for ip in uid_ip_dict: ip_count = len(uid_ip_dict[ip].split('&')) geo, school = ip2city(ip) if geo: try: iter_results[uid]['geo'][geo] += ip_count except: iter_results[uid]['geo'][geo] = ip_count try: uid_day_geo[uid][geo] += ip_count except: uid_day_geo[uid][geo] = ip_count if school: try: iter_results[uid]['school'][school] += ip_count except: iter_results[uid]['school'][school] = ip_count #deal ip: job_ip&home_ip&active_ip ip_time_list = uid_ip_dict[ip].split('&') try: iter_results[uid]['ip'][ip] += ip_count except: iter_results[uid]['ip'] = {ip: ip_count} for ip_time_item in ip_time_list: ip_timesegment = (int(ip_time_item) - ts) / IP_TIME_SEGMENT try: iter_results[uid]['week_ip'][ip_timesegment][ip] += 1 except: iter_results[uid]['week_ip'][ip_timesegment][ip] = 1 #end deal ip iter_results[uid]['geo_track'].append(uid_day_geo[uid]) count += 1 #get keywords top for uid in uid_list: #print 'test iter_results_ip:', iter_results[uid]['week_ip'] results[uid] = {} #hashtag hashtag_dict = iter_results[uid]['hashtag'] results[uid]['hashtag_dict'] = json.dumps(hashtag_dict) results[uid]['hashtag'] = '&'.join(hashtag_dict.keys()) #sensitive words sensitive_word_dict = iter_results[uid]['sensitive'] results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict) results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys()) sensitive_score = 0 today_sensitive_results_user = today_sensitive_results[uid] for sensitive_item in today_sensitive_results_user: k = sensitive_item v = today_sensitive_results_user[sensitive_item] tmp_stage = r_sensitive.hget('sensitive_words', k) if tmp_stage: sensitive_score += v * sensitive_score_dict[str(tmp_stage)] results[uid]['sensitive'] = sensitive_score 
#geo geo_dict = iter_results[uid]['geo'] geo_track_list = iter_results[uid]['geo_track'] results[uid]['activity_geo_dict'] = json.dumps(geo_track_list) geo_dict_keys = geo_dict.keys() results[uid]['activity_geo'] = '&'.join( ['&'.join(item.split('\t')) for item in geo_dict_keys]) try: results[uid]['activity_geo_aggs'] = '&'.join( [item.split('\t')[-1] for item in geo_dict_keys]) except: results[uid]['activity_geo_aggs'] = '' #keywords keywords_dict = all_user_keywords_dict[uid] keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50] keywords_top50_string = '&'.join( [keyword_item[0] for keyword_item in keywords_top50]) results[uid]['keywords'] = json.dumps(keywords_top50) results[uid]['keywords_string'] = keywords_top50_string #school dict school_dict = iter_results[uid]['school'] school_string = '&'.join(school_dict.keys()) if school_dict != {}: is_school = '1' else: is_school = '0' results[uid]['is_school'] = is_school results[uid]['school_string'] = school_string results[uid]['school_dict'] = json.dumps(school_dict) #ip: job_ip&home_ip&activity_ip #activity_ip all_ip_dict = iter_results[uid]['ip'] sort_all_ip = sorted(all_ip_dict.items(), key=lambda x: x[1], reverse=True) try: activity_ip = sort_all_ip[0][0] except: activity_ip = '' results[uid]['activity_ip'] = str(activity_ip) #job_ip & home_ip week_time_ip_dict = iter_results[uid]['week_ip'] for i in range(0, 6): try: segment_dict = week_time_ip_dict[i] except: week_time_ip_dict[i] = {} home_ip, job_ip = get_ip_description(week_time_ip_dict) results[uid]['home_ip'] = str(home_ip) results[uid]['job_ip'] = str(job_ip) return results
def get_flow_information(uid_list):  # data of the previous seven days; not usable for per-day incremental updates
    """Aggregate one week of per-user activity features from redis/ES.

    For every uid in `uid_list`, walks the 7 days before "today" and merges:
      - hashtags and sensitive hashtags (redis hashes ``hashtag_<ts>`` /
        ``sensitive_hashtag_<ts>``),
      - sensitive-word counts (redis hash ``sensitive_<ts>``),
      - IP activity mapped to geo via ``ip2geo`` (from ES when WORK_TYPE == 0,
        otherwise from redis_ip),
      - text keywords from the per-day flow-text ES index.
    Finally flattens everything into one flat dict per uid (JSON strings and
    '&'-joined strings) plus a numeric ``sensitive`` score.

    NOTE(review): this file defines get_flow_information more than once; at
    import time the last definition wins.

    :param uid_list: list of uid strings to aggregate
    :return: dict {uid: {hashtag_dict, hashtag_string, sensitive_hashtag_dict,
             sensitive_hashtag_string, sensitive_words_dict,
             sensitive_words_string, sensitive, activity_geo_dict,
             activity_geo, activity_geo_aggs, sensitive_activity_geo_*,
             keywords_dict, keywords_string}}
    """
    lenth = len(uid_list)
    results = {}
    iter_results = {}
    result_dict = {}  # NOTE(review): assigned but never used below
    if RUN_TYPE:
        now_ts = time.time()
        now_date = ts2datetime(now_ts)  # date: 2013-09-01
    else:
        now_date = "2013-09-08"
    ts = datetime2ts(now_date)
    start_ts = ts - 8 * 3600 * 24
    # iterate the 7 days preceding now_date: start_ts+1d .. start_ts+7d
    for i in range(1, 8):
        ts = start_ts + i * 3600 * 24
        date = ts2datetime(ts)
        print "date:", date
        uid_day_geo = {}
        sensitive_uid_day_geo = {}
        flow_index_name = flow_text_index_name_pre + str(date)
        # hashtag
        print uid_list
        hashtag_results = redis_cluster.hmget('hashtag_' + str(ts), uid_list)
        sensitive_hashtag = redis_cluster.hmget('sensitive_hashtag_' + str(ts),
                                                uid_list)
        # sensitive_words
        sensitive_results = redis_cluster.hmget('sensitive_' + str(ts),
                                                uid_list)
        # ip: choose the backing store by deployment mode
        if WORK_TYPE == 0:
            ip_index_name = ip_index_pre + str(date)
            sensitive_ip_index_name = sen_ip_index_pre + str(date)
            #activity_index_name = act_index_pre + str(date)
            #sensitive_activity_index_name = sen_act_index_pre + str(date)
            exist_bool = es_cluster.indices.exists(index=ip_index_name)
            sensitive_exist_bool = es_cluster.indices.exists(
                index=sensitive_ip_index_name)
            #activity_exist_bool = es_cluster.indices.exists(index=activity_index_name)
            #sensitive_activity_exist_bool = es_cluster.indices.exists(index=sensitive_activity_index_name)
            if exist_bool:
                ip_results = es_cluster.mget(index=ip_index_name,
                                             doc_type="ip",
                                             body={"ids": uid_list})["docs"]
            else:
                # keep positional alignment with uid_list when the index is missing
                ip_results = [dict()] * lenth
            if sensitive_exist_bool:
                sensitive_ip_results = es_cluster.mget(
                    index=sensitive_ip_index_name,
                    doc_type="sensitive_ip",
                    body={"ids": uid_list})["docs"]
            else:
                sensitive_ip_results = [dict()] * lenth
            """
            if activity_exist_bool:
                activity_results = es_cluster.mget(index=activity_index_name, doc_type="activity", body={"ids":uid_list})["docs"]
            else:
                activity_results = [dict()]*lenth
            if sensitive_activity_exist_bool:
                sensitive_activity_results = es_cluster.mget(index=sensitive_activity_index_name, doc_type="sensitive_activity",
                        body={"ids":uid_list})["docs"]
            else:
                sensitive_activity_results = [dict()]*lenth
            """
        else:
            ip_results = redis_ip.hmget('ip_' + str(ts), uid_list)
            sensitive_ip_results = redis_ip.hmget('sensitive_ip_' + str(ts),
                                                  uid_list)
            #activity_results = redis_activity.hmget('activity_'+str(date), uid_list)
            #sensitive_activity_results = redis_activity.hmget('sensitive_activity_'+str(date), uid_list)
        for j in range(0, len(uid_list)):
            uid = uid_list[j]
            if uid not in iter_results:
                iter_results[uid] = {'hashtag':{}, 'sensitive_hashtag':{}, 'geo':{}, "sensitive_geo":{},'geo_track':[],'keywords':{}, \
                        'sensitive_words':{}, "sensitive_geo_track":[],'ip': [], 'sensitive_ip':[]}
            # sensitive words: accumulate per-word counts across days
            if sensitive_results[j]:
                sensitive_words_results = json.loads(sensitive_results[j])
                for sensitive_word in sensitive_words_results:
                    try:
                        iter_results[uid]["sensitive_words"][
                            sensitive_word] += sensitive_words_results[
                                sensitive_word]
                    except:
                        iter_results[uid]["sensitive_words"][
                            sensitive_word] = sensitive_words_results[
                                sensitive_word]
                #print "sensitive_words:", iter_results[uid]["sensitive_words"]
            if hashtag_results[j]:
                hashtag_dict = json.loads(hashtag_results[j])
                for hashtag in hashtag_dict:
                    try:
                        iter_results[uid]['hashtag'][hashtag] += hashtag_dict[
                            hashtag]
                    except:
                        iter_results[uid]['hashtag'][hashtag] = hashtag_dict[
                            hashtag]
                #print "hashtag: ", iter_results[uid]['hashtag']
            if sensitive_hashtag[j]:
                sensitive_hashtag_dict = json.loads(sensitive_hashtag[j])
                for hashtag in sensitive_hashtag_dict:
                    try:
                        iter_results[uid]['sensitive_hashtag'][
                            hashtag] += sensitive_hashtag_dict[hashtag]
                    except:
                        iter_results[uid]['sensitive_hashtag'][
                            hashtag] = sensitive_hashtag_dict[hashtag]
                #print "sensitive_hashtag:", iter_results[uid]['sensitive_hashtag']
            uid_day_geo[uid] = {}
            sensitive_uid_day_geo[uid] = {}
            if WORK_TYPE == 0:  # es
                if ip_results[j]:
                    if ip_results[j]['found']:
                        detail_item = ip_results[j]['_source']
                        ip_dict = json.loads(detail_item['ip_dict'])
                    else:
                        ip_dict = {}
                else:
                    ip_dict = {}
            else:
                if ip_results[j]:
                    ip_dict = json.loads(ip_results[j])
                else:
                    ip_dict = {}
            if ip_dict:
                #iter_results[uid]['ip'].append(ip_dict)
                geo_dict = ip2geo(ip_dict)
                for geo, count in geo_dict.iteritems():
                    try:
                        iter_results[uid]['geo'][geo] += count
                    except:
                        iter_results[uid]['geo'][geo] = count
                    try:
                        uid_day_geo[uid][geo] += count
                    except:
                        uid_day_geo[uid][geo] = count
            #iter_results[uid]['ip'].append(ip_dict)
            # one geo snapshot per day, even if empty
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            #print "ip:", iter_results[uid]['ip'], iter_results[uid]['geo_track']
            if WORK_TYPE == 0:
                if sensitive_ip_results[j]:
                    if sensitive_ip_results[j]['found']:
                        detail_item = sensitive_ip_results[j]['_source']
                        sensitive_ip_dict = json.loads(
                            detail_item['sensitive_ip_dict'])
                    else:
                        sensitive_ip_dict = dict()
                else:
                    sensitive_ip_dict = dict()
            else:
                if sensitive_ip_results[j]:
                    sensitive_ip_dict = json.loads(sensitive_ip_results[j])
                else:
                    sensitive_ip_dict = dict()
            if sensitive_ip_dict:
                sensitive_geo_dict = ip2geo(sensitive_ip_dict)
                #iter_results[uid]['sensitive_ip'].append(sensitive_ip_dict)
                for geo, count in sensitive_geo_dict.iteritems():
                    try:
                        iter_results[uid]['sensitive_geo'][geo] += count
                    except:
                        iter_results[uid]['sensitive_geo'][geo] = count
                    try:
                        sensitive_uid_day_geo[uid][geo] += count
                    except:
                        sensitive_uid_day_geo[uid][geo] = count
            #iter_results[uid]['sensitive_ip'].append(sensitive_ip_dict)
            iter_results[uid]['sensitive_geo_track'].append(
                sensitive_uid_day_geo[uid])
            #print "sensitive_ip:", iter_results[uid]['sensitive_ip'], iter_results[uid]['sensitive_geo_track']
        # compute keywords from the per-day flow-text index
        flow_text_exist = es_flow_text.indices.exists(index=flow_index_name)
        if flow_text_exist:
            text_results = es_flow_text.search(index=flow_index_name, doc_type=flow_text_index_type,\
                    body={'query':{'filtered':{'filter':{'terms':{'uid': uid_list}}}}, 'size':MAX_VALUE},_source=False, fields=['uid', 'keywords_dict'])['hits']['hits']
        else:
            text_results = {}
        for item in text_results:
            uid = item['fields']['uid'][0]
            uid_keywords_dict = json.loads(item['fields']['keywords_dict'][0])
            for keywords in uid_keywords_dict:
                try:
                    iter_results[uid]['keywords'][
                        keywords] += uid_keywords_dict[keywords]
                except:
                    iter_results[uid]['keywords'][
                        keywords] = uid_keywords_dict[keywords]
            #print "keywords:", iter_results[uid]['keywords']
    # flatten the accumulated week of data into the result schema
    for uid in uid_list:
        results[uid] = {}
        # hashtag
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag_string'] = '&'.join(hashtag_dict.keys())
        # sensitive hashtag
        sensitive_hashtag_dict = iter_results[uid]['sensitive_hashtag']
        results[uid]['sensitive_hashtag_dict'] = json.dumps(
            sensitive_hashtag_dict)
        results[uid]['sensitive_hashtag_string'] = '&'.join(
            sensitive_hashtag_dict.keys())
        # sensitive_words
        sensitive_word_dict = iter_results[uid]['sensitive_words']
        results[uid]['sensitive_words_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_words_string'] = '&'.join(
            sensitive_word_dict.keys())
        sensitive_score = 0
        # score = sum(count * weight-of-stage); stage is element 0 of the
        # JSON value stored in the r_sensitive hash
        for k, v in sensitive_word_dict.iteritems():
            tmp = r_sensitive.hget('sensitive_words', k)
            if tmp:
                tmp_stage = json.loads(tmp)
                sensitive_score += sensitive_score_dict[str(tmp_stage[0])] * v
        results[uid]['sensitive'] = sensitive_score
        # geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        # geo keys are '\t'-separated place hierarchies; re-join with '&'
        results[uid]['activity_geo'] = '&'.join(
            ['&'.join(item.split('\t')) for item in geo_dict_keys])
        results[uid]['activity_geo_aggs'] = '&'.join(
            [item.split('\t')[-1] for item in geo_dict_keys])
        sensitive_geo_dict = iter_results[uid]['sensitive_geo']
        sensitive_geo_track_list = iter_results[uid]['sensitive_geo_track']
        results[uid]['sensitive_activity_geo_dict'] = json.dumps(
            sensitive_geo_track_list)
        sensitive_geo_dict_keys = sensitive_geo_dict.keys()
        results[uid]['sensitive_activity_geo'] = '&'.join(
            ['&'.join(item.split('\t')) for item in sensitive_geo_dict_keys])
        results[uid]['sensitive_activity_geo_aggs'] = '&'.join(
            [item.split('\t')[-1] for item in sensitive_geo_dict_keys])
        keywords_dict = iter_results[uid]['keywords']
        keywords_top50 = sorted(keywords_dict.items(),
                                key=lambda x: x[1],
                                reverse=True)[:50]
        keywords_top50_string = '&'.join(
            [keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords_dict'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
    return results
def get_flow_information_v2(uid_list, all_user_keywords_dict):
    """Aggregate one week of per-user hashtags, sensitive words, geo and
    school activity from redis, and merge in pre-computed keywords.

    :param uid_list: list of uid strings to aggregate
    :param all_user_keywords_dict: {uid: {keyword: weight}} computed elsewhere
    :return: dict {uid: {hashtag_dict, hashtag, sensitive_dict,
             sensitive_string, sensitive, activity_geo_dict, activity_geo,
             activity_geo_aggs, keywords, keywords_string, is_school,
             school_string, school_dict}}
    """
    results = {}
    #results = {uid:{'hashtag_dict':{},'hashtag':'', 'keywords_dict':{}, 'keywords_string':'', 'activity_geo':'', 'activity_geo_dict':dict, 'activity_geo_aggs':''}}
    iter_results = {}  # iter_results = {uid:{'hashtag': hashtag_dict, 'geo':geo_dict, 'keywords':keywords_dict}}
    now_ts = time.time()
    #run_type
    today_sensitive_results = {}
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    # walk the previous WEEK days, oldest first
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        uid_day_geo = {}
        #compute hashtag and geo
        hashtag_results = r_cluster_3.hmget('hashtag_' + str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        #compute sensitive_words
        sensitive_results = r_cluster_3.hmget('sensitive_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            #init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {'hashtag':{}, 'geo':{},'geo_track':[],'keywords':{}, 'sensitive':{}, 'school':{}}
            if uid not in today_sensitive_results:
                today_sensitive_results[uid] = {}
            #compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[hashtag]
            #compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][sensitive_word] = uid_sensitive_dict[sensitive_word]
                # the most recent day feeds the "today" sensitivity score
                if ts == now_date_ts - DAY:
                    try:
                        today_sensitive_results[uid][sensitive_word] += uid_sensitive_dict[sensitive_word]
                    except:
                        today_sensitive_results[uid][sensitive_word] = uid_sensitive_dict[sensitive_word]
            #compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                # the stored value is an '&'-joined list of timestamps per ip
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo, school = ip2city(ip)
                if geo:
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
                if school:
                    try:
                        iter_results[uid]['school'][school] += ip_count
                    except:
                        iter_results[uid]['school'][school] = ip_count
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1
    #get keywords top
    for uid in uid_list:
        results[uid] = {}
        #hashtag
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        #sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        today_sensitive_results_user = today_sensitive_results[uid]
        for sensitive_item in today_sensitive_results_user:
            k = sensitive_item
            v = today_sensitive_results_user[sensitive_item]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                # FIX: the hash stores a JSON list; decode it and key the
                # score table on the stage level (element 0), matching the
                # pattern used everywhere else in this module. Previously the
                # raw JSON string was used as the dict key.
                tmp = json.loads(tmp_stage)
                sensitive_score += v * sensitive_score_dict[str(tmp[0])]
        results[uid]['sensitive'] = sensitive_score
        #print 'sensitive_dict:', results[uid]['sensitive_dict']
        #print 'sensitive_string:', results[uid]['sensitive_string']
        #print 'sensitive:', results[uid]['sensitive']
        #geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(['&'.join(item.split('\t')) for item in geo_dict_keys])
        try:
            results[uid]['activity_geo_aggs'] = '&'.join([item.split('\t')[-1] for item in geo_dict_keys])
        except:
            results[uid]['activity_geo_aggs'] = ''
        #keywords
        keywords_dict = all_user_keywords_dict[uid]
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50]
        keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
        #school dict
        school_dict = iter_results[uid]['school']
        school_string = '&'.join(school_dict.keys())
        if school_dict != {}:
            is_school = '1'
        else:
            is_school = '0'
        results[uid]['is_school'] = is_school
        results[uid]['school_string'] = school_string
        results[uid]['school_dict'] = json.dumps(school_dict)
    return results
def get_flow_information(uid_list):  # data of the previous seven days; not usable for per-day incremental updates
    """Aggregate one week of per-user activity features from redis/ES.

    For every uid, walks the 7 days before "today" and merges hashtags,
    sensitive hashtags, sensitive-word counts, IP-derived geo activity
    (ES when WORK_TYPE == 0, else redis_ip) and flow-text keywords, then
    flattens them into one flat dict per uid.

    NOTE(review): this file defines get_flow_information more than once; at
    import time the last definition wins.

    :param uid_list: list of uid strings to aggregate
    :return: dict {uid: {...}} — see the result-building loop at the bottom
    """
    lenth = len(uid_list)
    results = {}
    iter_results = {}
    result_dict = {}  # NOTE(review): assigned but never used below
    if RUN_TYPE:
        now_ts = time.time()
        now_date = ts2datetime(now_ts)  # date: 2013-09-01
    else:
        now_date = "2013-09-08"
    ts = datetime2ts(now_date)
    start_ts = ts - 8 * 3600 * 24
    # iterate the 7 days preceding now_date: start_ts+1d .. start_ts+7d
    for i in range(1, 8):
        ts = start_ts + i * 3600 * 24
        date = ts2datetime(ts)
        print "date:", date
        uid_day_geo = {}
        sensitive_uid_day_geo = {}
        flow_index_name = flow_text_index_name_pre + str(date)
        # hashtag
        print uid_list
        hashtag_results = redis_cluster.hmget("hashtag_" + str(ts), uid_list)
        sensitive_hashtag = redis_cluster.hmget("sensitive_hashtag_" + str(ts), uid_list)
        # sensitive_words
        sensitive_results = redis_cluster.hmget("sensitive_" + str(ts), uid_list)
        # ip: choose the backing store by deployment mode
        if WORK_TYPE == 0:
            ip_index_name = ip_index_pre + str(date)
            sensitive_ip_index_name = sen_ip_index_pre + str(date)
            # activity_index_name = act_index_pre + str(date)
            # sensitive_activity_index_name = sen_act_index_pre + str(date)
            exist_bool = es_cluster.indices.exists(index=ip_index_name)
            sensitive_exist_bool = es_cluster.indices.exists(index=sensitive_ip_index_name)
            # activity_exist_bool = es_cluster.indices.exists(index=activity_index_name)
            # sensitive_activity_exist_bool = es_cluster.indices.exists(index=sensitive_activity_index_name)
            if exist_bool:
                ip_results = es_cluster.mget(index=ip_index_name, doc_type="ip", body={"ids": uid_list})["docs"]
            else:
                # keep positional alignment with uid_list when the index is missing
                ip_results = [dict()] * lenth
            if sensitive_exist_bool:
                sensitive_ip_results = es_cluster.mget(
                    index=sensitive_ip_index_name, doc_type="sensitive_ip", body={"ids": uid_list}
                )["docs"]
            else:
                sensitive_ip_results = [dict()] * lenth
            """
            if activity_exist_bool:
                activity_results = es_cluster.mget(index=activity_index_name, doc_type="activity", body={"ids":uid_list})["docs"]
            else:
                activity_results = [dict()]*lenth
            if sensitive_activity_exist_bool:
                sensitive_activity_results = es_cluster.mget(index=sensitive_activity_index_name, doc_type="sensitive_activity",
                    body={"ids":uid_list})["docs"]
            else:
                sensitive_activity_results = [dict()]*lenth
            """
        else:
            ip_results = redis_ip.hmget("ip_" + str(ts), uid_list)
            sensitive_ip_results = redis_ip.hmget("sensitive_ip_" + str(ts), uid_list)
            # activity_results = redis_activity.hmget('activity_'+str(date), uid_list)
            # sensitive_activity_results = redis_activity.hmget('sensitive_activity_'+str(date), uid_list)
        for j in range(0, len(uid_list)):
            uid = uid_list[j]
            if uid not in iter_results:
                iter_results[uid] = {
                    "hashtag": {},
                    "sensitive_hashtag": {},
                    "geo": {},
                    "sensitive_geo": {},
                    "geo_track": [],
                    "keywords": {},
                    "sensitive_words": {},
                    "sensitive_geo_track": [],
                    "ip": [],
                    "sensitive_ip": [],
                }
            # sensitive words: accumulate per-word counts across days
            if sensitive_results[j]:
                sensitive_words_results = json.loads(sensitive_results[j])
                for sensitive_word in sensitive_words_results:
                    try:
                        iter_results[uid]["sensitive_words"][sensitive_word] += sensitive_words_results[sensitive_word]
                    except:
                        iter_results[uid]["sensitive_words"][sensitive_word] = sensitive_words_results[sensitive_word]
                # print "sensitive_words:", iter_results[uid]["sensitive_words"]
            if hashtag_results[j]:
                hashtag_dict = json.loads(hashtag_results[j])
                for hashtag in hashtag_dict:
                    try:
                        iter_results[uid]["hashtag"][hashtag] += hashtag_dict[hashtag]
                    except:
                        iter_results[uid]["hashtag"][hashtag] = hashtag_dict[hashtag]
                # print "hashtag: ", iter_results[uid]['hashtag']
            if sensitive_hashtag[j]:
                sensitive_hashtag_dict = json.loads(sensitive_hashtag[j])
                for hashtag in sensitive_hashtag_dict:
                    try:
                        iter_results[uid]["sensitive_hashtag"][hashtag] += sensitive_hashtag_dict[hashtag]
                    except:
                        iter_results[uid]["sensitive_hashtag"][hashtag] = sensitive_hashtag_dict[hashtag]
                # print "sensitive_hashtag:", iter_results[uid]['sensitive_hashtag']
            uid_day_geo[uid] = {}
            sensitive_uid_day_geo[uid] = {}
            if WORK_TYPE == 0:  # es
                if ip_results[j]:
                    if ip_results[j]["found"]:
                        detail_item = ip_results[j]["_source"]
                        ip_dict = json.loads(detail_item["ip_dict"])
                    else:
                        ip_dict = {}
                else:
                    ip_dict = {}
            else:
                if ip_results[j]:
                    ip_dict = json.loads(ip_results[j])
                else:
                    ip_dict = {}
            if ip_dict:
                # iter_results[uid]['ip'].append(ip_dict)
                geo_dict = ip2geo(ip_dict)
                for geo, count in geo_dict.iteritems():
                    try:
                        iter_results[uid]["geo"][geo] += count
                    except:
                        iter_results[uid]["geo"][geo] = count
                    try:
                        uid_day_geo[uid][geo] += count
                    except:
                        uid_day_geo[uid][geo] = count
            # iter_results[uid]['ip'].append(ip_dict)
            # one geo snapshot per day, even if empty
            iter_results[uid]["geo_track"].append(uid_day_geo[uid])
            # print "ip:", iter_results[uid]['ip'], iter_results[uid]['geo_track']
            if WORK_TYPE == 0:
                if sensitive_ip_results[j]:
                    if sensitive_ip_results[j]["found"]:
                        detail_item = sensitive_ip_results[j]["_source"]
                        sensitive_ip_dict = json.loads(detail_item["sensitive_ip_dict"])
                    else:
                        sensitive_ip_dict = dict()
                else:
                    sensitive_ip_dict = dict()
            else:
                if sensitive_ip_results[j]:
                    sensitive_ip_dict = json.loads(sensitive_ip_results[j])
                else:
                    sensitive_ip_dict = dict()
            if sensitive_ip_dict:
                sensitive_geo_dict = ip2geo(sensitive_ip_dict)
                # iter_results[uid]['sensitive_ip'].append(sensitive_ip_dict)
                for geo, count in sensitive_geo_dict.iteritems():
                    try:
                        iter_results[uid]["sensitive_geo"][geo] += count
                    except:
                        iter_results[uid]["sensitive_geo"][geo] = count
                    try:
                        sensitive_uid_day_geo[uid][geo] += count
                    except:
                        sensitive_uid_day_geo[uid][geo] = count
            # iter_results[uid]['sensitive_ip'].append(sensitive_ip_dict)
            iter_results[uid]["sensitive_geo_track"].append(sensitive_uid_day_geo[uid])
            # print "sensitive_ip:", iter_results[uid]['sensitive_ip'], iter_results[uid]['sensitive_geo_track']
        # compute keywords from the per-day flow-text index
        flow_text_exist = es_flow_text.indices.exists(index=flow_index_name)
        if flow_text_exist:
            text_results = es_flow_text.search(
                index=flow_index_name,
                doc_type=flow_text_index_type,
                body={"query": {"filtered": {"filter": {"terms": {"uid": uid_list}}}}, "size": MAX_VALUE},
                _source=False,
                fields=["uid", "keywords_dict"],
            )["hits"]["hits"]
        else:
            text_results = {}
        for item in text_results:
            uid = item["fields"]["uid"][0]
            uid_keywords_dict = json.loads(item["fields"]["keywords_dict"][0])
            for keywords in uid_keywords_dict:
                try:
                    iter_results[uid]["keywords"][keywords] += uid_keywords_dict[keywords]
                except:
                    iter_results[uid]["keywords"][keywords] = uid_keywords_dict[keywords]
            # print "keywords:", iter_results[uid]['keywords']
    # flatten the accumulated week of data into the result schema
    for uid in uid_list:
        results[uid] = {}
        # hashtag
        hashtag_dict = iter_results[uid]["hashtag"]
        results[uid]["hashtag_dict"] = json.dumps(hashtag_dict)
        results[uid]["hashtag_string"] = "&".join(hashtag_dict.keys())
        # sensitive hashtag
        sensitive_hashtag_dict = iter_results[uid]["sensitive_hashtag"]
        results[uid]["sensitive_hashtag_dict"] = json.dumps(sensitive_hashtag_dict)
        results[uid]["sensitive_hashtag_string"] = "&".join(sensitive_hashtag_dict.keys())
        # sensitive_words
        sensitive_word_dict = iter_results[uid]["sensitive_words"]
        results[uid]["sensitive_words_dict"] = json.dumps(sensitive_word_dict)
        results[uid]["sensitive_words_string"] = "&".join(sensitive_word_dict.keys())
        sensitive_score = 0
        # score = sum(count * weight-of-stage); stage is element 0 of the
        # JSON value stored in the r_sensitive hash
        for k, v in sensitive_word_dict.iteritems():
            tmp = r_sensitive.hget("sensitive_words", k)
            if tmp:
                tmp_stage = json.loads(tmp)
                sensitive_score += sensitive_score_dict[str(tmp_stage[0])] * v
        results[uid]["sensitive"] = sensitive_score
        # geo
        geo_dict = iter_results[uid]["geo"]
        geo_track_list = iter_results[uid]["geo_track"]
        results[uid]["activity_geo_dict"] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        # geo keys are '\t'-separated place hierarchies; re-join with '&'
        results[uid]["activity_geo"] = "&".join(["&".join(item.split("\t")) for item in geo_dict_keys])
        results[uid]["activity_geo_aggs"] = "&".join([item.split("\t")[-1] for item in geo_dict_keys])
        sensitive_geo_dict = iter_results[uid]["sensitive_geo"]
        sensitive_geo_track_list = iter_results[uid]["sensitive_geo_track"]
        results[uid]["sensitive_activity_geo_dict"] = json.dumps(sensitive_geo_track_list)
        sensitive_geo_dict_keys = sensitive_geo_dict.keys()
        results[uid]["sensitive_activity_geo"] = "&".join(
            ["&".join(item.split("\t")) for item in sensitive_geo_dict_keys]
        )
        results[uid]["sensitive_activity_geo_aggs"] = "&".join(
            [item.split("\t")[-1] for item in sensitive_geo_dict_keys]
        )
        keywords_dict = iter_results[uid]["keywords"]
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50]
        keywords_top50_string = "&".join([keyword_item[0] for keyword_item in keywords_top50])
        results[uid]["keywords_dict"] = json.dumps(keywords_top50)
        results[uid]["keywords_string"] = keywords_top50_string
    return results
def get_flow_information(uid_list):
    """Aggregate 7 days of per-user hashtags, sensitive words, geo activity
    and flow-text keywords from redis/ES into one flat dict per uid.

    NOTE(review): this file defines get_flow_information more than once; at
    import time the last definition wins.

    :param uid_list: list of uid strings to aggregate
    :return: dict {uid: {hashtag_dict, hashtag, sensitive_dict,
             sensitive_string, sensitive, activity_geo_dict, activity_geo,
             keywords, keywords_string}}
    """
    results = {}
    #results = {uid:{'hashtag_dict':{},'hashtag':'', 'keywords_dict':{}, 'keywords_string':'', 'activity_geo':'', 'activity_geo_dict':dict}}
    iter_results = {}  # iter_results = {uid:{'hashtag': hashtag_dict, 'geo':geo_dict, 'keywords':keywords_dict}}
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    #test: pinned test date overrides the live timestamp
    now_date_ts = test_ts
    # walk the previous 7 days, oldest first
    for i in range(7, 0, -1):
        ts = now_date_ts - DAY * i
        iter_date = ts2datetime(ts)
        flow_text_index_name = flow_text_index_name_pre + iter_date
        uid_day_geo = {}
        #compute hashtag and geo
        hashtag_results = r_cluster_3.hmget('hashtag_' + str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        #compute sensitive_words
        sensitive_results = r_cluster_3.hmget('sensitive_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            #init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {'hashtag':{}, 'geo':{},'geo_track':[],'keywords':{}, 'sensitive':{}}
            #compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[hashtag]
            #compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][sensitive_word] = uid_sensitive_dict[sensitive_word]
            #compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                # the stored value is an '&'-joined list of timestamps per ip
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo = ip2city(ip)
                if geo:
                    #print 'geo:', geo
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1
        #compute keywords:
        try:
            text_results = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type, \
                    body={'query':{'filtered':{'filter':{'terms':{'uid': uid_list}}}}, 'size':MAX_VALUE},
                    _source=True, fields=['uid', 'keywords_dict'])['hits']['hits']
        except:
            text_results = {}
        for item in text_results:
            #print 'keywords item:', item
            uid = item['fields']['uid'][0]
            uid_keywords_dict = json.loads(item['fields']['keywords_dict'][0])
            for keywords in uid_keywords_dict:
                try:
                    iter_results[uid]['keywords'][keywords] += uid_keywords_dict[keywords]
                except:
                    iter_results[uid]['keywords'][keywords] = uid_keywords_dict[keywords]
    #get keywords top
    for uid in uid_list:
        results[uid] = {}
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        #sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        for item in sensitive_word_dict:
            k = item
            v = sensitive_word_dict[k]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                # FIX: the hash stores a JSON list; decode it and key the
                # score table on the stage level (element 0), matching the
                # pattern used everywhere else in this module. Previously the
                # raw JSON string was used as the dict key.
                tmp = json.loads(tmp_stage)
                sensitive_score += v * sensitive_score_dict[str(tmp[0])]
        results[uid]['sensitive'] = sensitive_score
        #print 'sensitive_dict:', results[uid]['sensitive_dict']
        #print 'sensitive_string:', results[uid]['sensitive_string']
        #print 'sensitive:', results[uid]['sensitive']
        #geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        #print 'geo_dict_keys:', geo_dict_keys
        results[uid]['activity_geo'] = '&'.join(['&'.join(item.split('\t')) for item in geo_dict_keys])
        #print 'activity_geo:', results[uid]['activity_geo']
        keywords_dict = iter_results[uid]['keywords']
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50]
        keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
    return results
def main(): if RUN_TYPE: now_ts = time.time() - DAY # 前一天 ts = str(datetime2ts(ts2datetime(now_ts))) else: ts = str(datetime2ts('2013-09-02')) now_ts = int(ts) print now_ts sensitive_string = "sensitive_" + ts date_string = ts update_sensitive_key = "sensitive_score_" + ts # 更新的键 sensitive_dict_key = "sensitive_dict_" + ts sensitive_string_key = "sensitive_string_" + ts sensitive_day_change_key = "sensitive_" + ts + "_day_change" del_month = datetime2ts(ts2datetime(now_ts - MONTH)) del_sensitive_key = "sensitive_score_" + str(del_month) # 要删除的键 former_ts = int(ts) - DAY former_date = str(datetime2ts(ts2datetime(former_ts))) former_sensitive_key = "sensitive_score_" + former_date iter_count = 0 bulk_action = [] mappings(ES_SENSITIVE_INDEX) total_number = r_cluster.hlen(sensitive_string) scan_cursor = 0 print total_number while 1: re_scan = r_cluster.hscan(sensitive_string, scan_cursor, count=1000) scan_cursor = re_scan[0] if len(re_scan[1]) != 0: sensitive_info = re_scan[1] # 字典形式,uid:sensitive_words_dict uid_list = sensitive_info.keys() sensitive_results = es.mget(index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX, body={"ids": uid_list})['docs'] if sensitive_results: for item in sensitive_results: uid = item['_id'] sensitive_words_dict = json.loads( sensitive_info[uid]) # json.loads current_sensitive_score = 0 for k, v in sensitive_words_dict.iteritems(): tmp_stage = r_sensitive.hget("sensitive_words", k) if tmp_stage: tmp_stage = json.loads(tmp_stage) current_sensitive_score += v * sensitive_score_dict[ str(tmp_stage[0])] if item['found']: # 之前存在相关信息 revise_item = item["_source"] if del_sensitive_key in revise_item: item.pop(del_sensitive_key) revise_item['uid'] = uid # 新更新的敏感度 revise_item[ update_sensitive_key] = current_sensitive_score revise_item['last_value'] = current_sensitive_score # 新更新的敏感词 revise_item[sensitive_dict_key] = sensitive_info[uid] # 新更新的string revise_item[sensitive_string_key] = "&".join( sensitive_words_dict.keys()) # 
当天和之前一天、一周和一月均值的差异 revise_item[ 'sensitive_day_change'] = current_sensitive_score - revise_item.get( former_sensitive_key, 0) revise_item[ 'sensitive_week_change'] = current_sensitive_score - revise_item.get( 'sensitive_week_ave', 0) revise_item[ 'sensitive_month_change'] = current_sensitive_score - revise_item.get( 'sensitive_month_ave', 0) # 更新后week、month的均值和方差 revise_item['sensitive_week_ave'], revise_item[ 'sensitive_week_var'], revise_item[ 'sensitive_week_sum'] = compute_week( revise_item, now_ts) revise_item['senstiive_month_ave'], revise_item[ 'sensitive_month_var'], revise_item[ 'sensitive_month_sum'] = compute_month( revise_item, now_ts) else: revise_item = dict() revise_item['uid'] = uid revise_item[ update_sensitive_key] = current_sensitive_score revise_item['last_value'] = current_sensitive_score revise_item[sensitive_dict_key] = sensitive_info[uid] revise_item[sensitive_string_key] = "&".join( sensitive_words_dict.keys()) revise_item[ 'sensitive_day_change'] = current_sensitive_score revise_item[ 'sensitive_week_change'] = current_sensitive_score revise_item[ 'sensitive_month_change'] = current_sensitive_score revise_item['sensitive_week_ave'], revise_item[ 'sensitive_week_var'], revise_item[ 'sensitive_week_sum'] = compute_week( revise_item, now_ts) revise_item['senstiive_month_ave'], revise_item[ 'sensitive_month_var'], revise_item[ 'sensitive_month_sum'] = compute_month( revise_item, now_ts) action = {'index': {'_id': uid}} bulk_action.extend([action, revise_item]) iter_count += 1 if iter_count % 1000 == 0: es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX) bulk_action = [] print iter_count if int(scan_cursor) == 0: break if bulk_action: es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX) print iter_count #######更新尚未完成的用户 update_scan = scan(es, query={ "query": { "filtered": { "filter": { "missing": { "field": update_sensitive_key } } } } }, index=ES_SENSITIVE_INDEX, 
doc_type=DOCTYPE_SENSITIVE_INDEX) iter_count = 0 bulk_action = [] while 1: try: tmp = update_scan.next() revise_item = tmp['_source'] if del_sensitive_key in revise_item: revise_item.pop(del_sensitive_key) uid = tmp['_id'] # 新更新的敏感度 revise_item[update_sensitive_key] = 0 revise_item['last_value'] = 0 # 新更新的敏感词 revise_item[sensitive_dict_key] = json.dumps({}) # 新更新的string revise_item[sensitive_string_key] = "" # 当天和之前一天、一周和一月均值的差异 revise_item['sensitive_day_change'] = 0 - revise_item.get( former_sensitive_key, 0) revise_item['sensitive_week_change'] = 0 - revise_item.get( 'sensitive_week_ave', 0) revise_item['sensitive_month_change'] = 0 - revise_item.get( 'sensitive_month_ave', 0) # 更新后week、month的均值和方差 revise_item['sensitive_week_ave'], revise_item[ 'sensitive_week_var'], revise_item[ 'sensitive_week_sum'] = compute_week(revise_item, now_ts) revise_item['senstiive_month_ave'], revise_item[ 'sensitive_month_var'], revise_item[ 'sensitive_month_sum'] = compute_month( revise_item, now_ts) action = {'index': {'_id': uid}} bulk_action.extend([action, revise_item]) iter_count += 1 if iter_count % 1000 == 0: es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX) bulk_action = [] except StopIteration: print "all done" if bulk_action: es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX) break except Exception, r: print Exception, r
def test(ft_type): print ft_type if ft_type == 'facebook': index_name_pre = facebook_flow_text_index_name_pre index_type = facebook_flow_text_index_type user_index_name = facebook_user_index_name user_index_type = facebook_user_index_type else: index_name_pre = twitter_flow_text_index_name_pre index_type = twitter_flow_text_index_type user_index_name = twitter_user_index_name user_index_type = twitter_user_index_type # date_list = load_date_list(True) date_list = load_date_list() DFA = createWordTree() query_body = { 'post_filter': { 'missing': { 'field': 'keywords_string' } }, 'query': { 'filtered': { 'filter': { 'bool': { 'must': [{ 'range': { 'flag_ch': { 'gte': -1 } } }] } } } } } for date in date_list: count = 0 bulk_action = [] index_name = index_name_pre + date try: es_scan_results = scan(es, query=query_body, size=1000, index=index_name, doc_type=index_type) while True: try: scan_data = es_scan_results.next() item = scan_data['_source'] text = item['text_ch'] uid = item['uid'] if ft_type == 'facebook': _id = item['fid'] else: _id = item['tid'] ts = datetime2ts(date) #add sentiment field to weibo sentiment, keywords_list = triple_classifier(item) #add key words to weibo keywords_dict, keywords_string = get_weibo_keywords( keywords_list) #sensitive_words_dict sensitive_words_dict = searchWord( text.encode('utf-8', 'ignore'), DFA) if sensitive_words_dict: sensitive_words_string_data = "&".join( sensitive_words_dict.keys()) sensitive_words_dict_data = json.dumps( sensitive_words_dict) else: sensitive_words_string_data = "" sensitive_words_dict_data = json.dumps({}) #redis if sensitive_words_dict: sensitive_count_string = r_cluster.hget( 'sensitive_' + str(ts), str(uid)) if sensitive_count_string: #redis取空 sensitive_count_dict = json.loads( sensitive_count_string) for word in sensitive_words_dict.keys(): if sensitive_count_dict.has_key(word): sensitive_count_dict[ word] += sensitive_words_dict[word] else: sensitive_count_dict[ word] = sensitive_words_dict[word] 
r_cluster.hset('sensitive_' + str(ts), str(uid), json.dumps(sensitive_count_dict)) else: r_cluster.hset('sensitive_' + str(ts), str(uid), json.dumps(sensitive_words_dict)) #sensitive sensitive_score = 0 if sensitive_words_dict: for k, v in sensitive_words_dict.iteritems(): tmp_stage = r_sensitive.hget("sensitive_words", k) if tmp_stage: sensitive_score += v * sensitive_score_dict[ str(tmp_stage)] #directed_uid directed_uid_data = 0 directed_uid, directed_uname = get_root_retweet( text, uid, ft_type) if directed_uid: directed_uid_data = long(directed_uid) # hashtag hashtag = '' RE = re.compile( u'#([0-9a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)[ ," =.。: :、]' ) hashtag_list = re.findall(RE, text) if hashtag_list: hashtag = '&'.join(hashtag_list) #action action = {'update': {'_id': _id}} # action_data action_data = { 'sentiment': str(sentiment), 'keywords_dict': json.dumps(keywords_dict), 'keywords_string': keywords_string, 'sensitive_words_string': sensitive_words_string_data, 'sensitive_words_dict': sensitive_words_dict_data, 'sensitive': sensitive_score, 'directed_uid': directed_uid_data, 'directed_uname': directed_uname, 'hashtag': hashtag, } bulk_action.extend([action, {'doc': action_data}]) count += 1 if count % 1000 == 0 and count != 0: if bulk_action: es.bulk(bulk_action, index=index_name, doc_type=facebook_flow_text_index_type, timeout=600) bulk_action = [] count = 0 except StopIteration: break if bulk_action: es.bulk(bulk_action, index=index_name, doc_type=facebook_flow_text_index_type, timeout=600) except Exception, e: #es文档不存在 print e
def main(): if RUN_TYPE: now_ts = time.time()-DAY # 前一天 now_ts = datetime2ts('2016-03-24') ts = str(datetime2ts(ts2datetime(now_ts))) else: ts = str(datetime2ts('2016-03-16')) now_ts = int(ts) print now_ts sensitive_string = "sensitive_" + ts date_string = ts update_sensitive_key = "sensitive_score_" + ts # 更新的键 sensitive_dict_key = "sensitive_dict_" + ts sensitive_string_key = "sensitive_string_" + ts sensitive_day_change_key = "sensitive_" + ts +"_day_change" del_month = datetime2ts(ts2datetime(now_ts - MONTH)) del_sensitive_key = "sensitive_score_"+str(del_month) # 要删除的键 former_ts = int(ts) - DAY former_date = str(datetime2ts(ts2datetime(former_ts))) former_sensitive_key = "sensitive_score_" + former_date iter_count = 0 bulk_action = [] mappings(ES_SENSITIVE_INDEX) total_number = r.hlen(sensitive_string) scan_cursor = 0 print total_number while 1: re_scan = r.hscan(sensitive_string, scan_cursor, count=1000) scan_cursor = re_scan[0] if len(re_scan[1]) != 0: sensitive_info = re_scan[1] # 字典形式,uid:sensitive_words_dict uid_list = sensitive_info.keys() sensitive_results = es.mget(index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX, body={"ids":uid_list})['docs'] if sensitive_results: for item in sensitive_results: uid = item['_id'] sensitive_words_dict = json.loads(sensitive_info[uid]) # json.loads current_sensitive_score = 0 for k,v in sensitive_words_dict.iteritems(): tmp_stage = r_sensitive.hget("sensitive_words", k) if tmp_stage: current_sensitive_score += v*sensitive_score_dict[str(tmp_stage)] if item['found']: # 之前存在相关信息 revise_item = item["_source"] if del_sensitive_key in revise_item: item.remove(del_sensitive_key) revise_item['uid'] = uid # 新更新的敏感度 revise_item[update_sensitive_key] = current_sensitive_score # 新更新的敏感词 revise_item[sensitive_dict_key] = sensitive_info[uid] # 新更新的string revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys()) # 当天和之前一天、一周和一月均值的差异 revise_item['sensitive_day_change'] = current_sensitive_score - 
revise_item.get(former_sensitive_key, 0) revise_item['sensitive_week_change'] = current_sensitive_score - revise_item.get('sensitive_week_ave', 0) revise_item['sensitive_month_change'] = current_sensitive_score - revise_item.get('sensitive_month_ave', 0) # 更新后week、month的均值和方差 revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts) revise_item['senstiive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts) else: revise_item = dict() revise_item['uid'] = uid revise_item[update_sensitive_key] = current_sensitive_score revise_item[sensitive_dict_key] = sensitive_info[uid] revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys()) revise_item['sensitive_day_change'] = current_sensitive_score revise_item['sensitive_week_change'] = current_sensitive_score revise_item['sensitive_month_change'] = current_sensitive_score revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts) revise_item['senstiive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts) action = {'index':{'_id': uid}} bulk_action.extend([action, revise_item]) iter_count += 1 if iter_count % 1000 == 0: es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX) bulk_action = [] print iter_count if int(scan_cursor) == 0: break if bulk_action: es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX) print iter_count
def get_flow_information_v2(uid_list, all_user_keywords_dict): results = {} #results = {uid:{'hashtag_dict':{},'hashtag':'', 'keywords_dict':{}, 'keywords_string':'', 'activity_geo':'', 'activity_geo_dict':dict, 'activity_geo_aggs':''}} iter_results = { } # iter_results = {uid:{'hashtag': hashtag_dict, 'geo':geo_dict, 'keywords':keywords_dict}} now_ts = time.time() #run_type if RUN_TYPE == 1: now_date_ts = datetime2ts(ts2datetime(now_ts)) else: now_date_ts = test_ts for i in range(WEEK, 0, -1): ts = now_date_ts - DAY * i print ts uid_day_geo = {} #compute hashtag and geo hashtag_results = r_cluster.hmget('hashtag_' + str(ts), uid_list) ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list) #compute sensitive_words sensitive_results = r_cluster.hmget('sensitive_' + str(ts), uid_list) count = 0 for uid in uid_list: #init iter_results[uid] if uid not in iter_results: iter_results[uid] = { 'hashtag': {}, 'geo': {}, 'geo_track': [], 'keywords': {}, 'sensitive': {} } #compute hashtag hashtag_item = hashtag_results[count] if hashtag_item: uid_hashtag_dict = json.loads(hashtag_item) else: uid_hashtag_dict = {} for hashtag in uid_hashtag_dict: try: iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[ hashtag] except: iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[ hashtag] #compute sensitive sensitive_item = sensitive_results[count] if sensitive_item: uid_sensitive_dict = json.loads(sensitive_item) else: uid_sensitive_dict = {} for sensitive_word in uid_sensitive_dict: try: iter_results[uid]['sensitive'][ sensitive_word] += uid_sensitive_dict[sensitive_word] except: iter_results[uid]['sensitive'][ sensitive_word] = uid_sensitive_dict[sensitive_word] #compute geo uid_day_geo[uid] = {} ip_item = ip_results[count] if ip_item: uid_ip_dict = json.loads(ip_item) else: uid_ip_dict = {} for ip in uid_ip_dict: ip_count = len(uid_ip_dict[ip].split('&')) geo = ip2city(ip) if geo: try: iter_results[uid]['geo'][geo] += ip_count except: iter_results[uid]['geo'][geo] 
= ip_count try: uid_day_geo[uid][geo] += ip_count except: uid_day_geo[uid][geo] = ip_count iter_results[uid]['geo_track'].append(uid_day_geo[uid]) count += 1 #get keywords top for uid in uid_list: results[uid] = {} #hashtag hashtag_dict = iter_results[uid]['hashtag'] results[uid]['hashtag_dict'] = json.dumps(hashtag_dict) results[uid]['hashtag'] = '&'.join(hashtag_dict.keys()) #sensitive words sensitive_word_dict = iter_results[uid]['sensitive'] results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict) results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys()) sensitive_score = 0 for sensitive_item in sensitive_word_dict: k = sensitive_item v = sensitive_word_dict[sensitive_item] tmp_stage = r_sensitive.hget('sensitive_words', k) if tmp_stage: sensitive_score += v * sensitive_score_dict[str(tmp_stage)] results[uid]['sensitive'] = sensitive_score #print 'sensitive_dict:', results[uid]['sensitive_dict'] #print 'sensitive_string:', results[uid]['sensitive_string'] #print 'sensitive:', results[uid]['sensitive'] #geo geo_dict = iter_results[uid]['geo'] geo_track_list = iter_results[uid]['geo_track'] results[uid]['activity_geo_dict'] = json.dumps(geo_track_list) geo_dict_keys = geo_dict.keys() results[uid]['activity_geo'] = '&'.join( ['&'.join(item.split('\t')) for item in geo_dict_keys]) try: results[uid]['activity_geo_aggs'] = '&'.join( [item.split('\t')[-1] for item in geo_dict_keys]) except: results[uid]['activity_geo_aggs'] = '' keywords_dict = all_user_keywords_dict[uid] keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50] keywords_top50_string = '&'.join( [keyword_item[0] for keyword_item in keywords_top50]) results[uid]['keywords'] = json.dumps(keywords_top50) results[uid]['keywords_string'] = keywords_top50_string return results