def get_activity_time(uid_list):
    """Build weekly activity statistics for each uid.

    For each of the last WEEK days this reads the per-day activity hash
    (96 quarter-hour buckets keyed "0".."95") from redis (WORK_TYPE != 0)
    or from the per-day ES ``activity_<date>`` index (WORK_TYPE == 0),
    concatenates the daily bucket counts per user, and derives:
      * statusnum     -- total activity count over the window
      * activity_time -- log(1 + strongest strictly-positive FFT frequency),
                         a crude periodicity measure of the activity signal

    Parameters:
        uid_list: list of user ids (redis hash fields / ES doc ids).

    Returns:
        dict uid -> {'statusnum': <int>, 'activity_time': <float>}
    """
    results = {}
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    # run_type: live runs anchor on today, offline runs on a fixed test date
    if RUN_TYPE == 1:
        timestamp = datetime2ts(now_date)
    else:
        timestamp = datetime2ts("2013-09-08")
    activity_list_dict = {}  # {uid: [bucket counts...]}
    for day in range(1, WEEK + 1):
        ts = timestamp - DAY * day
        if WORK_TYPE != 0:
            r_result = redis_activity.hmget('activity_' + str(ts), uid_list)
        else:
            r_result = []
            index_name = "activity_" + str(ts2datetime(ts))
            exist_bool = es_cluster.indices.exists(index=index_name)
            if exist_bool:
                es_results = es_cluster.mget(index=index_name,
                                             doc_type="activity",
                                             body={"ids": uid_list})["docs"]
                for item in es_results:
                    if item['found']:
                        r_result.append(item['_source']['activity_dict'])
                    else:
                        r_result.append(json.dumps({}))
            else:
                # missing index: one empty activity dict per uid
                r_result = [json.dumps(dict())] * len(uid_list)
        if r_result:
            for j in range(0, len(uid_list)):
                uid = uid_list[j]
                if uid not in activity_list_dict:
                    # NOTE(review): the signal starts with 96 zero buckets
                    # that are never overwritten; daily counts are appended
                    # after them.  Preserved as-is -- confirm the zero prefix
                    # is intended and not a leftover of an += accumulator.
                    activity_list_dict[uid] = [0] * 96
                user_r_result = r_result[j]
                if user_r_result:
                    user_activity_dict = json.loads(user_r_result)
                    for slot in range(0, 96):
                        # FIX: bare try/except around the lookup replaced
                        # with dict.get -- only a missing key was handled.
                        activity_list_dict[uid].append(
                            user_activity_dict.get(str(slot), 0))
    for uid in uid_list:
        # FIX: guard against a uid that never received any day data
        # (avoids KeyError; an all-zero signal yields activity_time 0.0)
        activity_list = activity_list_dict.get(uid, [0] * 96)
        statusnum = sum(activity_list)
        signal = np.array(activity_list)
        # power spectrum; pick the strongest strictly-positive frequency
        fftResult = np.abs(np.fft.fft(signal)) ** 2
        freq = np.fft.fftfreq(signal.size, d=1)
        max_val = 0
        max_freq = 0
        # FIX: manual index counter replaced with enumerate
        for idx, val in enumerate(fftResult):
            if val > max_val and freq[idx] > 0:
                max_val = val
                max_freq = freq[idx]
        results[uid] = {'statusnum': statusnum,
                        'activity_time': math.log(max_freq + 1)}
    return results
def get_flow_information(uid_list):
    # Aggregates the previous seven days of data; not suitable for
    # per-day incremental updates.
    """Collect one week of per-user "flow" information for every uid.

    For each of the 7 days before the reference date this gathers, per uid:
    hashtags, sensitive hashtags and sensitive words (redis hashes keyed by
    day timestamp), IP activity converted to geo counts via ip2geo (from
    per-day ES indices when WORK_TYPE == 0, otherwise from redis_ip hashes),
    and text keywords from the per-day flow-text ES index.  Daily values are
    accumulated into iter_results and finally flattened into JSON strings
    and '&'-joined display strings.

    Parameters:
        uid_list: list of user ids (redis hash fields / ES doc ids).

    Returns:
        dict uid -> dict with hashtag_dict/hashtag_string, sensitive_hashtag_*,
        sensitive_words_*, a numeric 'sensitive' score, activity_geo* fields
        and keywords_dict/keywords_string.
    """
    lenth = len(uid_list)
    results = {}
    iter_results = {}
    result_dict = {}  # NOTE(review): never used in this function
    if RUN_TYPE:
        now_ts = time.time()
        now_date = ts2datetime(now_ts)  # date: 2013-09-01
    else:
        now_date = "2013-09-08"  # fixed test date for offline runs
    ts = datetime2ts(now_date)
    start_ts = ts - 8 * 3600 * 24  # 8 days back; loop adds 1..7 days
    for i in range(1, 8):
        ts = start_ts + i * 3600 * 24  # timestamp of day i of the window
        date = ts2datetime(ts)
        print "date:", date
        uid_day_geo = {}            # per-day geo counts, per uid
        sensitive_uid_day_geo = {}  # per-day sensitive geo counts, per uid
        flow_index_name = flow_text_index_name_pre + str(date)
        # hashtag
        print uid_list
        hashtag_results = redis_cluster.hmget('hashtag_' + str(ts), uid_list)
        sensitive_hashtag = redis_cluster.hmget('sensitive_hashtag_' + str(ts),
                                                uid_list)
        # sensitive_words
        sensitive_results = redis_cluster.hmget('sensitive_' + str(ts),
                                                uid_list)
        # ip
        if WORK_TYPE == 0:
            # ES mode: per-day ip documents; a missing index yields empty dicts
            ip_index_name = ip_index_pre + str(date)
            sensitive_ip_index_name = sen_ip_index_pre + str(date)
            #activity_index_name = act_index_pre + str(date)
            #sensitive_activity_index_name = sen_act_index_pre + str(date)
            exist_bool = es_cluster.indices.exists(index=ip_index_name)
            sensitive_exist_bool = es_cluster.indices.exists(
                index=sensitive_ip_index_name)
            #activity_exist_bool = es_cluster.indices.exists(index=activity_index_name)
            #sensitive_activity_exist_bool = es_cluster.indices.exists(index=sensitive_activity_index_name)
            if exist_bool:
                ip_results = es_cluster.mget(index=ip_index_name,
                                             doc_type="ip",
                                             body={"ids": uid_list})["docs"]
            else:
                ip_results = [dict()] * lenth
            if sensitive_exist_bool:
                sensitive_ip_results = es_cluster.mget(
                    index=sensitive_ip_index_name,
                    doc_type="sensitive_ip",
                    body={"ids": uid_list})["docs"]
            else:
                sensitive_ip_results = [dict()] * lenth
            """
            if activity_exist_bool:
                activity_results = es_cluster.mget(index=activity_index_name, doc_type="activity", body={"ids":uid_list})["docs"]
            else:
                activity_results = [dict()]*lenth
            if sensitive_activity_exist_bool:
                sensitive_activity_results = es_cluster.mget(index=sensitive_activity_index_name, doc_type="sensitive_activity",
                    body={"ids":uid_list})["docs"]
            else:
                sensitive_activity_results = [dict()]*lenth
            """
        else:
            # redis mode: hashes keyed by the day timestamp
            ip_results = redis_ip.hmget('ip_' + str(ts), uid_list)
            sensitive_ip_results = redis_ip.hmget('sensitive_ip_' + str(ts),
                                                  uid_list)
            #activity_results = redis_activity.hmget('activity_'+str(date), uid_list)
            #sensitive_activity_results = redis_activity.hmget('sensitive_activity_'+str(date), uid_list)
        for j in range(0, len(uid_list)):
            uid = uid_list[j]
            if uid not in iter_results:
                iter_results[uid] = {'hashtag':{}, 'sensitive_hashtag':{}, 'geo':{}, "sensitive_geo":{},'geo_track':[],'keywords':{}, \
                    'sensitive_words':{}, "sensitive_geo_track":[],'ip': [], 'sensitive_ip':[]}
            # sensitive words: EAFP accumulate (add to or initialize counter)
            if sensitive_results[j]:
                sensitive_words_results = json.loads(sensitive_results[j])
                for sensitive_word in sensitive_words_results:
                    try:
                        iter_results[uid]["sensitive_words"][
                            sensitive_word] += sensitive_words_results[
                                sensitive_word]
                    except:
                        iter_results[uid]["sensitive_words"][
                            sensitive_word] = sensitive_words_results[
                                sensitive_word]
                #print "sensitive_words:", iter_results[uid]["sensitive_words"]
            # hashtag counters, same accumulate pattern
            if hashtag_results[j]:
                hashtag_dict = json.loads(hashtag_results[j])
                for hashtag in hashtag_dict:
                    try:
                        iter_results[uid]['hashtag'][hashtag] += hashtag_dict[
                            hashtag]
                    except:
                        iter_results[uid]['hashtag'][hashtag] = hashtag_dict[
                            hashtag]
                #print "hashtag: ", iter_results[uid]['hashtag']
            # sensitive hashtag counters
            if sensitive_hashtag[j]:
                sensitive_hashtag_dict = json.loads(sensitive_hashtag[j])
                for hashtag in sensitive_hashtag_dict:
                    try:
                        iter_results[uid]['sensitive_hashtag'][
                            hashtag] += sensitive_hashtag_dict[hashtag]
                    except:
                        iter_results[uid]['sensitive_hashtag'][
                            hashtag] = sensitive_hashtag_dict[hashtag]
                #print "sensitive_hashtag:", iter_results[uid]['sensitive_hashtag']
            uid_day_geo[uid] = {}
            sensitive_uid_day_geo[uid] = {}
            # extract today's ip dict for this uid from whichever backend
            if WORK_TYPE == 0:  # es
                if ip_results[j]:
                    if ip_results[j]['found']:
                        detail_item = ip_results[j]['_source']
                        ip_dict = json.loads(detail_item['ip_dict'])
                    else:
                        ip_dict = {}
                else:
                    ip_dict = {}
            else:
                if ip_results[j]:
                    ip_dict = json.loads(ip_results[j])
                else:
                    ip_dict = {}
            if ip_dict:
                #iter_results[uid]['ip'].append(ip_dict)
                # convert ip counts to geo counts; accumulate into the weekly
                # totals and into today's per-uid dict
                geo_dict = ip2geo(ip_dict)
                for geo, count in geo_dict.iteritems():
                    try:
                        iter_results[uid]['geo'][geo] += count
                    except:
                        iter_results[uid]['geo'][geo] = count
                    try:
                        uid_day_geo[uid][geo] += count
                    except:
                        uid_day_geo[uid][geo] = count
            #iter_results[uid]['ip'].append(ip_dict)
            # one entry per day (possibly empty) -> daily geo track
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            #print "ip:", iter_results[uid]['ip'], iter_results[uid]['geo_track']
            # same extraction for the sensitive ip dict
            if WORK_TYPE == 0:
                if sensitive_ip_results[j]:
                    if sensitive_ip_results[j]['found']:
                        detail_item = sensitive_ip_results[j]['_source']
                        sensitive_ip_dict = json.loads(
                            detail_item['sensitive_ip_dict'])
                    else:
                        sensitive_ip_dict = dict()
                else:
                    sensitive_ip_dict = dict()
            else:
                if sensitive_ip_results[j]:
                    sensitive_ip_dict = json.loads(sensitive_ip_results[j])
                else:
                    sensitive_ip_dict = dict()
            if sensitive_ip_dict:
                sensitive_geo_dict = ip2geo(sensitive_ip_dict)
                #iter_results[uid]['sensitive_ip'].append(sensitive_ip_dict)
                for geo, count in sensitive_geo_dict.iteritems():
                    try:
                        iter_results[uid]['sensitive_geo'][geo] += count
                    except:
                        iter_results[uid]['sensitive_geo'][geo] = count
                    try:
                        sensitive_uid_day_geo[uid][geo] += count
                    except:
                        sensitive_uid_day_geo[uid][geo] = count
            #iter_results[uid]['sensitive_ip'].append(sensitive_ip_dict)
            iter_results[uid]['sensitive_geo_track'].append(
                sensitive_uid_day_geo[uid])
            #print "sensitive_ip:", iter_results[uid]['sensitive_ip'], iter_results[uid]['sensitive_geo_track']
        # compute keywords: fetch this day's keyword dicts for all uids at once
        flow_text_exist = es_flow_text.indices.exists(index=flow_index_name)
        if flow_text_exist:
            text_results = es_flow_text.search(index=flow_index_name, doc_type=flow_text_index_type,\
                body={'query':{'filtered':{'filter':{'terms':{'uid': uid_list}}}}, 'size':MAX_VALUE},_source=False, fields=['uid', 'keywords_dict'])['hits']['hits']
        else:
            text_results = {}
        for item in text_results:
            uid = item['fields']['uid'][0]
            uid_keywords_dict = json.loads(item['fields']['keywords_dict'][0])
            for keywords in uid_keywords_dict:
                try:
                    iter_results[uid]['keywords'][
                        keywords] += uid_keywords_dict[keywords]
                except:
                    iter_results[uid]['keywords'][
                        keywords] = uid_keywords_dict[keywords]
            #print "keywords:", iter_results[uid]['keywords']
    # flatten the accumulated week into serialized result fields
    for uid in uid_list:
        results[uid] = {}
        # hashtag
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag_string'] = '&'.join(hashtag_dict.keys())
        # sensitive hashtag
        sensitive_hashtag_dict = iter_results[uid]['sensitive_hashtag']
        results[uid]['sensitive_hashtag_dict'] = json.dumps(
            sensitive_hashtag_dict)
        results[uid]['sensitive_hashtag_string'] = '&'.join(
            sensitive_hashtag_dict.keys())
        # sensitive_words: weighted score via the shared sensitive-word stages
        sensitive_word_dict = iter_results[uid]['sensitive_words']
        results[uid]['sensitive_words_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_words_string'] = '&'.join(
            sensitive_word_dict.keys())
        sensitive_score = 0
        for k, v in sensitive_word_dict.iteritems():
            tmp = r_sensitive.hget('sensitive_words', k)
            if tmp:
                tmp_stage = json.loads(tmp)
                # stage -> weight lookup, scaled by occurrence count
                sensitive_score += sensitive_score_dict[str(tmp_stage[0])] * v
        results[uid]['sensitive'] = sensitive_score
        # geo: keys are '\t'-separated location paths; aggs keeps last segment
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(
            ['&'.join(item.split('\t')) for item in geo_dict_keys])
        results[uid]['activity_geo_aggs'] = '&'.join(
            [item.split('\t')[-1] for item in geo_dict_keys])
        sensitive_geo_dict = iter_results[uid]['sensitive_geo']
        sensitive_geo_track_list = iter_results[uid]['sensitive_geo_track']
        results[uid]['sensitive_activity_geo_dict'] = json.dumps(
            sensitive_geo_track_list)
        sensitive_geo_dict_keys = sensitive_geo_dict.keys()
        results[uid]['sensitive_activity_geo'] = '&'.join(
            ['&'.join(item.split('\t')) for item in sensitive_geo_dict_keys])
        results[uid]['sensitive_activity_geo_aggs'] = '&'.join(
            [item.split('\t')[-1] for item in sensitive_geo_dict_keys])
        # keywords: keep only the 50 most frequent
        keywords_dict = iter_results[uid]['keywords']
        keywords_top50 = sorted(keywords_dict.items(),
                                key=lambda x: x[1],
                                reverse=True)[:50]
        keywords_top50_string = '&'.join(
            [keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords_dict'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
    return results
def update_day_geo(uid_list, user_info_list):
    """Append today's geo activity to each user's stored geo history.

    Reads today's per-user ip dicts (ES when WORK_TYPE == 0, else redis_ip
    hashes), converts them to geo counts via ip2geo, appends the day to the
    'activity_geo_dict' / 'sensitive_activity_geo_dict' histories carried in
    user_info_list (capped at the last 30 days) and rebuilds the 7-day
    'activity_geo' / '*_aggs' display strings.

    Parameters:
        uid_list: list of uids to update.
        user_info_list: mapping uid -> previously stored user fields; must
            contain a JSON 'activity_geo_dict' (list of per-day geo dicts).

    Returns:
        dict uid -> updated geo fields.
    """
    results = {}
    now_ts = time.time()
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts('2013-09-02')  # fixed offline test date
    now_date = ts2datetime(now_date_ts)
    if WORK_TYPE == 0:
        if es_cluster.indices.exists(index="ip_" + str(now_date)):
            ip_results = es_cluster.mget(index="ip_" + str(now_date),
                                         doc_type="ip",
                                         body={"ids": uid_list})['docs']
        else:
            # FIX: was [json.dumps({})] * len(uid) -- `uid` is undefined here
            # (NameError) and string placeholders would break the ['found']
            # access below; use mget-shaped stubs instead.
            ip_results = [{"found": False}] * len(uid_list)
        if es_cluster.indices.exists(index="sensitive_ip_" + str(now_date)):
            sensitive_ip_results = es_cluster.mget(
                index="sensitive_ip_" + str(now_date),
                doc_type="sensitive_ip",
                body={"ids": uid_list})["docs"]
        else:
            # FIX: len(uid) -> len(uid_list) (undefined name)
            sensitive_ip_results = [{"found": False}] * len(uid_list)
    else:
        # FIX: was redis_ip.mget(key, uid_list); these are hash reads keyed
        # by day timestamp exactly as in get_flow_information -- hmget is the
        # correct call and returns one value per uid.
        ip_results = redis_ip.hmget("ip_" + str(now_date_ts), uid_list)
        sensitive_ip_results = redis_ip.hmget(
            'sensitive_ip_' + str(now_date_ts), uid_list)
    # FIX: the original kept a manual `count` index that was (a) never
    # incremented and (b) clobbered by the `for geo, count in ...` loops, so
    # every uid read element 0; enumerate provides a stable per-uid index.
    for idx, uid in enumerate(uid_list):
        # FIX: results[uid] was assigned twice, the second literal discarding
        # the first two keys; initialize all four keys in one dict.
        results[uid] = {'activity_geo': {}, 'activity_geo_dict': [],
                        'sensitive_activity_geo': {},
                        'sensitive_activity_geo_dict': []}
        uid_ip_results = ip_results[idx]
        uid_sensitive_ip_results = sensitive_ip_results[idx]
        if WORK_TYPE == 0:
            if uid_ip_results['found']:
                uid_ip_dict = json.loads(uid_ip_results['_source']['ip_dict'])
            else:
                uid_ip_dict = {}
            # FIX: original tested `sensitive_uid_ip_results['found']`, an
            # undefined name (NameError on this branch).
            if uid_sensitive_ip_results['found']:
                sensitive_uid_ip_dict = json.loads(
                    uid_sensitive_ip_results['_source']['sensitive_ip_dict'])
            else:
                sensitive_uid_ip_dict = {}
        else:
            if uid_ip_results:
                uid_ip_dict = json.loads(uid_ip_results)
            else:
                uid_ip_dict = {}
            if uid_sensitive_ip_results:
                sensitive_uid_ip_dict = json.loads(uid_sensitive_ip_results)
            else:
                sensitive_uid_ip_dict = {}
        # today's geo counts
        day_results = {}
        sensitive_day_results = {}
        if uid_ip_dict:
            geo_dict = ip2geo(uid_ip_dict)
            # renamed loop variable so it can no longer clobber the uid index
            for geo, geo_count in geo_dict.iteritems():
                day_results[geo] = day_results.get(geo, 0) + geo_count
        if sensitive_uid_ip_dict:
            geo_dict = ip2geo(sensitive_uid_ip_dict)
            for geo, geo_count in geo_dict.iteritems():
                sensitive_day_results[geo] = sensitive_day_results.get(
                    geo, 0) + geo_count
        # update the activity_geo_dict histories (keep at most 30 days)
        activity_geo_history_list = json.loads(
            user_info_list[uid]['activity_geo_dict'])
        # FIX: the sensitive history was loaded from 'activity_geo_dict' as
        # well; read the sensitive field, falling back to an empty history.
        sensitive_activity_geo_history_list = json.loads(
            user_info_list[uid].get('sensitive_activity_geo_dict', '[]'))
        activity_geo_history_list.append(day_results)
        sensitive_activity_geo_history_list.append(sensitive_day_results)
        results[uid]['activity_geo_dict'] = json.dumps(
            activity_geo_history_list[-30:])
        results[uid]['sensitive_activity_geo_dict'] = json.dumps(
            sensitive_activity_geo_history_list[-30:])
        # update the activity_geo strings from the last 7 days
        week_activity_geo_list = activity_geo_history_list[-7:]
        sensitive_week_activity_geo_list = \
            sensitive_activity_geo_history_list[-7:]
        week_geo_list = []
        sensitive_week_geo_list = []
        for activity_geo_item in week_activity_geo_list:
            week_geo_list.extend(activity_geo_item.keys())
        for activity_geo_item in sensitive_week_activity_geo_list:
            sensitive_week_geo_list.extend(activity_geo_item.keys())
        week_geo_list = list(set(week_geo_list))
        sensitive_week_geo_list = list(set(sensitive_week_geo_list))
        # geo keys are '\t'-separated location paths; aggs keeps last segment
        week_geo_string = '&'.join(
            ['&'.join(item.split('\t')) for item in week_geo_list])
        sensitive_week_geo_string = '&'.join(
            ['&'.join(item.split('\t')) for item in sensitive_week_geo_list])
        try:
            week_geo_aggs_string = '&'.join(
                [item.split('\t')[-1] for item in week_geo_list])
        except:
            week_geo_aggs_string = ''
        try:
            sensitive_week_geo_aggs_string = '&'.join(
                [item.split('\t')[-1] for item in sensitive_week_geo_list])
        except:
            sensitive_week_geo_aggs_string = ''
        results[uid]['activity_geo'] = week_geo_string
        results[uid]['activity_geo_aggs'] = week_geo_aggs_string
        results[uid]['sensitive_activity_geo'] = sensitive_week_geo_string
        results[uid][
            'sensitive_activity_geo_aggs'] = sensitive_week_geo_aggs_string
    #print 'update geo results:', results
    return results
def get_flow_information(uid_list):
    # Aggregates the previous seven days of data; not suitable for
    # per-day incremental updates.
    """Collect one week of per-user "flow" information for every uid.

    NOTE(review): this is the second definition of get_flow_information in
    this file and shadows the earlier one; the two appear token-identical.

    For each of the 7 days before the reference date this gathers, per uid:
    hashtags, sensitive hashtags and sensitive words (redis hashes keyed by
    day timestamp), IP activity converted to geo counts via ip2geo (from
    per-day ES indices when WORK_TYPE == 0, otherwise from redis_ip hashes),
    and text keywords from the per-day flow-text ES index.  Daily values are
    accumulated into iter_results and finally flattened into JSON strings
    and '&'-joined display strings.

    Parameters:
        uid_list: list of user ids (redis hash fields / ES doc ids).

    Returns:
        dict uid -> serialized hashtag/sensitive/geo/keyword fields plus a
        numeric 'sensitive' score.
    """
    lenth = len(uid_list)
    results = {}
    iter_results = {}
    result_dict = {}  # NOTE(review): never used in this function
    if RUN_TYPE:
        now_ts = time.time()
        now_date = ts2datetime(now_ts)  # date: 2013-09-01
    else:
        now_date = "2013-09-08"  # fixed test date for offline runs
    ts = datetime2ts(now_date)
    start_ts = ts - 8 * 3600 * 24  # 8 days back; loop adds 1..7 days
    for i in range(1, 8):
        ts = start_ts + i * 3600 * 24  # timestamp of day i of the window
        date = ts2datetime(ts)
        print "date:", date
        uid_day_geo = {}            # per-day geo counts, per uid
        sensitive_uid_day_geo = {}  # per-day sensitive geo counts, per uid
        flow_index_name = flow_text_index_name_pre + str(date)
        # hashtag
        print uid_list
        hashtag_results = redis_cluster.hmget("hashtag_" + str(ts), uid_list)
        sensitive_hashtag = redis_cluster.hmget("sensitive_hashtag_" + str(ts),
                                                uid_list)
        # sensitive_words
        sensitive_results = redis_cluster.hmget("sensitive_" + str(ts),
                                                uid_list)
        # ip
        if WORK_TYPE == 0:
            # ES mode: per-day ip documents; a missing index yields empty dicts
            ip_index_name = ip_index_pre + str(date)
            sensitive_ip_index_name = sen_ip_index_pre + str(date)
            # activity_index_name = act_index_pre + str(date)
            # sensitive_activity_index_name = sen_act_index_pre + str(date)
            exist_bool = es_cluster.indices.exists(index=ip_index_name)
            sensitive_exist_bool = es_cluster.indices.exists(
                index=sensitive_ip_index_name)
            # activity_exist_bool = es_cluster.indices.exists(index=activity_index_name)
            # sensitive_activity_exist_bool = es_cluster.indices.exists(index=sensitive_activity_index_name)
            if exist_bool:
                ip_results = es_cluster.mget(index=ip_index_name,
                                             doc_type="ip",
                                             body={"ids": uid_list})["docs"]
            else:
                ip_results = [dict()] * lenth
            if sensitive_exist_bool:
                sensitive_ip_results = es_cluster.mget(
                    index=sensitive_ip_index_name,
                    doc_type="sensitive_ip",
                    body={"ids": uid_list}
                )["docs"]
            else:
                sensitive_ip_results = [dict()] * lenth
            """
            if activity_exist_bool:
                activity_results = es_cluster.mget(index=activity_index_name, doc_type="activity", body={"ids":uid_list})["docs"]
            else:
                activity_results = [dict()]*lenth
            if sensitive_activity_exist_bool:
                sensitive_activity_results = es_cluster.mget(index=sensitive_activity_index_name, doc_type="sensitive_activity",
                    body={"ids":uid_list})["docs"]
            else:
                sensitive_activity_results = [dict()]*lenth
            """
        else:
            # redis mode: hashes keyed by the day timestamp
            ip_results = redis_ip.hmget("ip_" + str(ts), uid_list)
            sensitive_ip_results = redis_ip.hmget("sensitive_ip_" + str(ts),
                                                  uid_list)
            # activity_results = redis_activity.hmget('activity_'+str(date), uid_list)
            # sensitive_activity_results = redis_activity.hmget('sensitive_activity_'+str(date), uid_list)
        for j in range(0, len(uid_list)):
            uid = uid_list[j]
            if uid not in iter_results:
                iter_results[uid] = {
                    "hashtag": {},
                    "sensitive_hashtag": {},
                    "geo": {},
                    "sensitive_geo": {},
                    "geo_track": [],
                    "keywords": {},
                    "sensitive_words": {},
                    "sensitive_geo_track": [],
                    "ip": [],
                    "sensitive_ip": [],
                }
            # sensitive words: EAFP accumulate (add to or initialize counter)
            if sensitive_results[j]:
                sensitive_words_results = json.loads(sensitive_results[j])
                for sensitive_word in sensitive_words_results:
                    try:
                        iter_results[uid]["sensitive_words"][sensitive_word] += sensitive_words_results[sensitive_word]
                    except:
                        iter_results[uid]["sensitive_words"][sensitive_word] = sensitive_words_results[sensitive_word]
                # print "sensitive_words:", iter_results[uid]["sensitive_words"]
            # hashtag counters, same accumulate pattern
            if hashtag_results[j]:
                hashtag_dict = json.loads(hashtag_results[j])
                for hashtag in hashtag_dict:
                    try:
                        iter_results[uid]["hashtag"][hashtag] += hashtag_dict[hashtag]
                    except:
                        iter_results[uid]["hashtag"][hashtag] = hashtag_dict[hashtag]
                # print "hashtag: ", iter_results[uid]['hashtag']
            # sensitive hashtag counters
            if sensitive_hashtag[j]:
                sensitive_hashtag_dict = json.loads(sensitive_hashtag[j])
                for hashtag in sensitive_hashtag_dict:
                    try:
                        iter_results[uid]["sensitive_hashtag"][hashtag] += sensitive_hashtag_dict[hashtag]
                    except:
                        iter_results[uid]["sensitive_hashtag"][hashtag] = sensitive_hashtag_dict[hashtag]
                # print "sensitive_hashtag:", iter_results[uid]['sensitive_hashtag']
            uid_day_geo[uid] = {}
            sensitive_uid_day_geo[uid] = {}
            # extract today's ip dict for this uid from whichever backend
            if WORK_TYPE == 0:  # es
                if ip_results[j]:
                    if ip_results[j]["found"]:
                        detail_item = ip_results[j]["_source"]
                        ip_dict = json.loads(detail_item["ip_dict"])
                    else:
                        ip_dict = {}
                else:
                    ip_dict = {}
            else:
                if ip_results[j]:
                    ip_dict = json.loads(ip_results[j])
                else:
                    ip_dict = {}
            if ip_dict:
                # iter_results[uid]['ip'].append(ip_dict)
                # convert ip counts to geo counts; accumulate into the weekly
                # totals and into today's per-uid dict
                geo_dict = ip2geo(ip_dict)
                for geo, count in geo_dict.iteritems():
                    try:
                        iter_results[uid]["geo"][geo] += count
                    except:
                        iter_results[uid]["geo"][geo] = count
                    try:
                        uid_day_geo[uid][geo] += count
                    except:
                        uid_day_geo[uid][geo] = count
            # iter_results[uid]['ip'].append(ip_dict)
            # one entry per day (possibly empty) -> daily geo track
            iter_results[uid]["geo_track"].append(uid_day_geo[uid])
            # print "ip:", iter_results[uid]['ip'], iter_results[uid]['geo_track']
            # same extraction for the sensitive ip dict
            if WORK_TYPE == 0:
                if sensitive_ip_results[j]:
                    if sensitive_ip_results[j]["found"]:
                        detail_item = sensitive_ip_results[j]["_source"]
                        sensitive_ip_dict = json.loads(detail_item["sensitive_ip_dict"])
                    else:
                        sensitive_ip_dict = dict()
                else:
                    sensitive_ip_dict = dict()
            else:
                if sensitive_ip_results[j]:
                    sensitive_ip_dict = json.loads(sensitive_ip_results[j])
                else:
                    sensitive_ip_dict = dict()
            if sensitive_ip_dict:
                sensitive_geo_dict = ip2geo(sensitive_ip_dict)
                # iter_results[uid]['sensitive_ip'].append(sensitive_ip_dict)
                for geo, count in sensitive_geo_dict.iteritems():
                    try:
                        iter_results[uid]["sensitive_geo"][geo] += count
                    except:
                        iter_results[uid]["sensitive_geo"][geo] = count
                    try:
                        sensitive_uid_day_geo[uid][geo] += count
                    except:
                        sensitive_uid_day_geo[uid][geo] = count
            # iter_results[uid]['sensitive_ip'].append(sensitive_ip_dict)
            iter_results[uid]["sensitive_geo_track"].append(sensitive_uid_day_geo[uid])
            # print "sensitive_ip:", iter_results[uid]['sensitive_ip'], iter_results[uid]['sensitive_geo_track']
        # compute keywords: fetch this day's keyword dicts for all uids at once
        flow_text_exist = es_flow_text.indices.exists(index=flow_index_name)
        if flow_text_exist:
            text_results = es_flow_text.search(
                index=flow_index_name,
                doc_type=flow_text_index_type,
                body={"query": {"filtered": {"filter": {"terms": {"uid": uid_list}}}}, "size": MAX_VALUE},
                _source=False,
                fields=["uid", "keywords_dict"],
            )["hits"]["hits"]
        else:
            text_results = {}
        for item in text_results:
            uid = item["fields"]["uid"][0]
            uid_keywords_dict = json.loads(item["fields"]["keywords_dict"][0])
            for keywords in uid_keywords_dict:
                try:
                    iter_results[uid]["keywords"][keywords] += uid_keywords_dict[keywords]
                except:
                    iter_results[uid]["keywords"][keywords] = uid_keywords_dict[keywords]
            # print "keywords:", iter_results[uid]['keywords']
    # flatten the accumulated week into serialized result fields
    for uid in uid_list:
        results[uid] = {}
        # hashtag
        hashtag_dict = iter_results[uid]["hashtag"]
        results[uid]["hashtag_dict"] = json.dumps(hashtag_dict)
        results[uid]["hashtag_string"] = "&".join(hashtag_dict.keys())
        # sensitive hashtag
        sensitive_hashtag_dict = iter_results[uid]["sensitive_hashtag"]
        results[uid]["sensitive_hashtag_dict"] = json.dumps(sensitive_hashtag_dict)
        results[uid]["sensitive_hashtag_string"] = "&".join(sensitive_hashtag_dict.keys())
        # sensitive_words: weighted score via the shared sensitive-word stages
        sensitive_word_dict = iter_results[uid]["sensitive_words"]
        results[uid]["sensitive_words_dict"] = json.dumps(sensitive_word_dict)
        results[uid]["sensitive_words_string"] = "&".join(sensitive_word_dict.keys())
        sensitive_score = 0
        for k, v in sensitive_word_dict.iteritems():
            tmp = r_sensitive.hget("sensitive_words", k)
            if tmp:
                tmp_stage = json.loads(tmp)
                # stage -> weight lookup, scaled by occurrence count
                sensitive_score += sensitive_score_dict[str(tmp_stage[0])] * v
        results[uid]["sensitive"] = sensitive_score
        # geo: keys are '\t'-separated location paths; aggs keeps last segment
        geo_dict = iter_results[uid]["geo"]
        geo_track_list = iter_results[uid]["geo_track"]
        results[uid]["activity_geo_dict"] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]["activity_geo"] = "&".join(["&".join(item.split("\t")) for item in geo_dict_keys])
        results[uid]["activity_geo_aggs"] = "&".join([item.split("\t")[-1] for item in geo_dict_keys])
        sensitive_geo_dict = iter_results[uid]["sensitive_geo"]
        sensitive_geo_track_list = iter_results[uid]["sensitive_geo_track"]
        results[uid]["sensitive_activity_geo_dict"] = json.dumps(sensitive_geo_track_list)
        sensitive_geo_dict_keys = sensitive_geo_dict.keys()
        results[uid]["sensitive_activity_geo"] = "&".join(
            ["&".join(item.split("\t")) for item in sensitive_geo_dict_keys]
        )
        results[uid]["sensitive_activity_geo_aggs"] = "&".join(
            [item.split("\t")[-1] for item in sensitive_geo_dict_keys]
        )
        # keywords: keep only the 50 most frequent
        keywords_dict = iter_results[uid]["keywords"]
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50]
        keywords_top50_string = "&".join([keyword_item[0] for keyword_item in keywords_top50])
        results[uid]["keywords_dict"] = json.dumps(keywords_top50)
        results[uid]["keywords_string"] = keywords_top50_string
    return results
def update_day_geo(uid_list, user_info_list):
    """Append today's geo activity to each user's stored geo history.

    NOTE(review): this is the second definition of update_day_geo in this
    file and shadows the earlier one.

    Reads today's per-user ip dicts (ES when WORK_TYPE == 0, else redis_ip
    hashes), converts them to geo counts via ip2geo, appends the day to the
    'activity_geo_dict' / 'sensitive_activity_geo_dict' histories carried in
    user_info_list (capped at the last 30 days) and rebuilds the 7-day
    'activity_geo' / '*_aggs' display strings.

    Parameters:
        uid_list: list of uids to update.
        user_info_list: mapping uid -> previously stored user fields; must
            contain a JSON 'activity_geo_dict' (list of per-day geo dicts).

    Returns:
        dict uid -> updated geo fields.
    """
    results = {}
    now_ts = time.time()
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts("2013-09-02")  # fixed offline test date
    now_date = ts2datetime(now_date_ts)
    if WORK_TYPE == 0:
        if es_cluster.indices.exists(index="ip_" + str(now_date)):
            ip_results = es_cluster.mget(index="ip_" + str(now_date),
                                         doc_type="ip",
                                         body={"ids": uid_list})["docs"]
        else:
            # FIX: was [json.dumps({})] * len(uid) -- `uid` is undefined here
            # (NameError) and string placeholders would break the ["found"]
            # access below; use mget-shaped stubs instead.
            ip_results = [{"found": False}] * len(uid_list)
        if es_cluster.indices.exists(index="sensitive_ip_" + str(now_date)):
            sensitive_ip_results = es_cluster.mget(
                index="sensitive_ip_" + str(now_date),
                doc_type="sensitive_ip",
                body={"ids": uid_list})["docs"]
        else:
            # FIX: len(uid) -> len(uid_list) (undefined name)
            sensitive_ip_results = [{"found": False}] * len(uid_list)
    else:
        # FIX: was redis_ip.mget(key, uid_list); these are hash reads keyed
        # by day timestamp exactly as in get_flow_information -- hmget is the
        # correct call and returns one value per uid.
        ip_results = redis_ip.hmget("ip_" + str(now_date_ts), uid_list)
        sensitive_ip_results = redis_ip.hmget(
            "sensitive_ip_" + str(now_date_ts), uid_list)
    # FIX: the original kept a manual `count` index that was (a) never
    # incremented and (b) clobbered by the `for geo, count in ...` loops, so
    # every uid read element 0; enumerate provides a stable per-uid index.
    for idx, uid in enumerate(uid_list):
        # FIX: results[uid] was assigned twice, the second literal discarding
        # the first two keys; initialize all four keys in one dict.
        results[uid] = {"activity_geo": {}, "activity_geo_dict": [],
                        "sensitive_activity_geo": {},
                        "sensitive_activity_geo_dict": []}
        uid_ip_results = ip_results[idx]
        uid_sensitive_ip_results = sensitive_ip_results[idx]
        if WORK_TYPE == 0:
            if uid_ip_results["found"]:
                uid_ip_dict = json.loads(uid_ip_results["_source"]["ip_dict"])
            else:
                uid_ip_dict = {}
            # FIX: original tested `sensitive_uid_ip_results["found"]`, an
            # undefined name (NameError on this branch).
            if uid_sensitive_ip_results["found"]:
                sensitive_uid_ip_dict = json.loads(
                    uid_sensitive_ip_results["_source"]["sensitive_ip_dict"])
            else:
                sensitive_uid_ip_dict = {}
        else:
            if uid_ip_results:
                uid_ip_dict = json.loads(uid_ip_results)
            else:
                uid_ip_dict = {}
            if uid_sensitive_ip_results:
                sensitive_uid_ip_dict = json.loads(uid_sensitive_ip_results)
            else:
                sensitive_uid_ip_dict = {}
        # today's geo counts
        day_results = {}
        sensitive_day_results = {}
        if uid_ip_dict:
            geo_dict = ip2geo(uid_ip_dict)
            # renamed loop variable so it can no longer clobber the uid index
            for geo, geo_count in geo_dict.iteritems():
                day_results[geo] = day_results.get(geo, 0) + geo_count
        if sensitive_uid_ip_dict:
            geo_dict = ip2geo(sensitive_uid_ip_dict)
            for geo, geo_count in geo_dict.iteritems():
                sensitive_day_results[geo] = sensitive_day_results.get(
                    geo, 0) + geo_count
        # update the activity_geo_dict histories (keep at most 30 days)
        activity_geo_history_list = json.loads(
            user_info_list[uid]["activity_geo_dict"])
        # FIX: the sensitive history was loaded from 'activity_geo_dict' as
        # well; read the sensitive field, falling back to an empty history.
        sensitive_activity_geo_history_list = json.loads(
            user_info_list[uid].get("sensitive_activity_geo_dict", "[]"))
        activity_geo_history_list.append(day_results)
        sensitive_activity_geo_history_list.append(sensitive_day_results)
        results[uid]["activity_geo_dict"] = json.dumps(
            activity_geo_history_list[-30:])
        results[uid]["sensitive_activity_geo_dict"] = json.dumps(
            sensitive_activity_geo_history_list[-30:])
        # update the activity_geo strings from the last 7 days
        week_activity_geo_list = activity_geo_history_list[-7:]
        sensitive_week_activity_geo_list = \
            sensitive_activity_geo_history_list[-7:]
        week_geo_list = []
        sensitive_week_geo_list = []
        for activity_geo_item in week_activity_geo_list:
            week_geo_list.extend(activity_geo_item.keys())
        for activity_geo_item in sensitive_week_activity_geo_list:
            sensitive_week_geo_list.extend(activity_geo_item.keys())
        week_geo_list = list(set(week_geo_list))
        sensitive_week_geo_list = list(set(sensitive_week_geo_list))
        # geo keys are '\t'-separated location paths; aggs keeps last segment
        week_geo_string = "&".join(
            ["&".join(item.split("\t")) for item in week_geo_list])
        sensitive_week_geo_string = "&".join(
            ["&".join(item.split("\t")) for item in sensitive_week_geo_list])
        try:
            week_geo_aggs_string = "&".join(
                [item.split("\t")[-1] for item in week_geo_list])
        except:
            week_geo_aggs_string = ""
        try:
            sensitive_week_geo_aggs_string = "&".join(
                [item.split("\t")[-1] for item in sensitive_week_geo_list])
        except:
            sensitive_week_geo_aggs_string = ""
        results[uid]["activity_geo"] = week_geo_string
        results[uid]["activity_geo_aggs"] = week_geo_aggs_string
        results[uid]["sensitive_activity_geo"] = sensitive_week_geo_string
        results[uid]["sensitive_activity_geo_aggs"] = \
            sensitive_week_geo_aggs_string
    # print 'update geo results:', results
    return results