def get_attr_geo_track(uid_list):
    # date_results = [['2013-09-01', [(geo1, count1), (geo2, count2)]], ...] for the last 7 days
    date_results = []
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    # test
    now_date = '2013-09-08'
    ts = datetime2ts(now_date)
    for i in range(7, 0, -1):
        timestamp = ts - i * 24 * 3600
        ip_dict = dict()
        results = r_cluster.hmget('ip_' + str(timestamp), uid_list)
        for item in results:
            if item:
                item_dict = json.loads(item)
                for ip_item in item_dict:
                    try:
                        ip_dict[ip_item] += item_dict[ip_item]
                    except:
                        ip_dict[ip_item] = item_dict[ip_item]
        geo_dict = ip2geo(ip_dict)
        sort_geo_dict = sorted(geo_dict.items(), key=lambda x: x[1], reverse=True)
        date_key = ts2datetime(timestamp)
        date_results.append([date_key, sort_geo_dict[:2]])
    return {'geo_track': json.dumps(date_results)}
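# --- usage sketch (not part of the pipeline) ---
# A minimal illustration of the per-day IP merge that get_attr_geo_track does
# with try/except, rewritten with collections.Counter. The input dicts are
# made-up placeholders; in the function they come from the 'ip_<timestamp>'
# hashes in r_cluster.
from collections import Counter

def merge_ip_counts(per_user_ip_dicts):
    total = Counter()
    for ip_dict in per_user_ip_dicts:
        total.update(ip_dict)  # adds counts key-wise
    return dict(total)

# merge_ip_counts([{'1.2.3.4': 2}, {'1.2.3.4': 1, '5.6.7.8': 3}])
# -> {'1.2.3.4': 3, '5.6.7.8': 3}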
def update_day_hashtag(uid_list):
    results = {}
    all_results = {}
    now_ts = time.time()
    # run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        count = 0
        hashtag_results = r_cluster.hmget('hashtag_' + str(ts), uid_list)
        for uid in uid_list:
            if uid not in results:
                results[uid] = {}
            hashtag_item = hashtag_results[count]
            count += 1  # keep the hmget results aligned with uid_list
            if hashtag_item:
                hashtag_dict = json.loads(hashtag_item)
            else:
                hashtag_dict = {}
            for hashtag in hashtag_dict:
                try:
                    results[uid][hashtag] += 1
                except:
                    results[uid][hashtag] = 1
    for uid in uid_list:
        user_hashtag_dict = results[uid]
        hashtag_string = '&'.join(user_hashtag_dict.keys())
        all_results[uid] = {'hashtag': hashtag_string, 'hashtag_dict': user_hashtag_dict}
    return all_results
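# --- usage sketch (not part of the pipeline) ---
# Pairing hmget results with their uids via zip avoids the manual count/index
# bookkeeping used in update_day_hashtag. Assumes redis-py's hmget returns
# values in the order of the requested fields (it does).
def iter_user_items(uid_list, hmget_results):
    for uid, raw in zip(uid_list, hmget_results):
        yield uid, (json.loads(raw) if raw else {})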
def update_day_geo(uid_list, user_info_list):
    results = {}
    now_ts = time.time()
    # run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    ip_results = r_cluster.hmget('new_ip_' + str(now_date_ts - DAY), uid_list)
    count = 0
    for uid in uid_list:
        if uid not in results:
            results[uid] = {'activity_geo': {}, 'activity_geo_dict': []}
        uid_ip_results = ip_results[count]
        count += 1
        if uid_ip_results:
            uid_ip_dict = json.loads(uid_ip_results)
        else:
            uid_ip_dict = {}
        day_results = {}
        for ip in uid_ip_dict:
            ip_count = len(uid_ip_dict[ip].split('&'))
            geo = ip2city(ip)
            geo = geo.decode('utf-8')
            try:
                day_results[geo] += ip_count
            except:
                day_results[geo] = ip_count
        # update the activity_geo_dict
        activity_geo_history_list = json.loads(user_info_list[uid]['activity_geo_dict'])
        activity_geo_history_list.append(day_results)
        results[uid]['activity_geo_dict'] = json.dumps(activity_geo_history_list[-30:])
        # update the activity_geo
        week_activity_geo_list = activity_geo_history_list[-7:]
        week_geo_list = []
        for activity_geo_item in week_activity_geo_list:
            geo_list = activity_geo_item.keys()
            week_geo_list.extend(geo_list)
        week_geo_list = list(set(week_geo_list))
        week_geo_string = '&'.join(['&'.join((item.encode('utf-8')).split('\t'))
                                    for item in week_geo_list])
        try:
            week_geo_aggs_string = '&'.join([(item.encode('utf-8')).split('\t')[-1]
                                             for item in week_geo_list])
        except:
            week_geo_aggs_string = ''
        results[uid]['activity_geo'] = week_geo_string
        results[uid]['activity_geo_aggs'] = week_geo_aggs_string
    return results
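# --- worked example (illustrative values) ---
# A geo entry from ip2city is a '\t'-separated place chain. 'activity_geo'
# flattens every level with '&'; 'activity_geo_aggs' keeps only the last,
# most specific component.
demo_geo = u'China\tJiangsu\tNanjing'
print '&'.join(demo_geo.split('\t'))   # China&Jiangsu&Nanjing
print demo_geo.split('\t')[-1]         # Nanjing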
def get_activity_time(uid_list):
    results = {}
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    # run_type
    if RUN_TYPE == 1:
        timestamp = datetime2ts(now_date)
    else:
        timestamp = datetime2ts(RUN_TEST_TIME)
    activity_list_dict = {}  # {uid: [activity_list], ...}
    for i in range(1, WEEK + 1):
        ts = timestamp - DAY * i
        print ts
        r_result = r_cluster.hmget('activity_' + str(ts), uid_list)
        if r_result:
            for j in range(0, len(uid_list)):
                uid = uid_list[j]
                if uid not in activity_list_dict:
                    activity_list_dict[uid] = [0 for k in range(0, 96)]
                user_r_result = r_result[j]
                if user_r_result:
                    user_activity_dict = json.loads(user_r_result)
                else:
                    user_activity_dict = {}
                # append all 96 slots for every day so the lists stay aligned
                for k in range(0, 96):
                    try:
                        count = user_activity_dict[str(k)]
                    except:
                        count = 0
                    activity_list_dict[uid].append(count)
    for uid in uid_list:
        activity_list = activity_list_dict[uid]
        statusnum = sum(activity_list)
        signal = np.array(activity_list)
        fftResult = np.abs(np.fft.fft(signal)) ** 2
        n = signal.size
        freq = np.fft.fftfreq(n, d=1)
        # keep the strongest strictly-positive frequency component
        i = 0
        max_val = 0
        max_freq = 0
        for val in fftResult:
            if val > max_val and freq[i] > 0:
                max_val = val
                max_freq = freq[i]
            i += 1
        results[uid] = {'statusnum': statusnum,
                        'activity_time': math.log(max_freq + 1)}
    return results
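# --- self-contained sketch of the periodicity measure in get_activity_time ---
# Square the FFT magnitudes and keep the strongest strictly-positive frequency.
# The synthetic signal (7 repeats of a 96-slot "day") is illustrative only.
import numpy as np

demo_signal = np.tile(np.concatenate([np.zeros(64), np.ones(32)]), 7)
demo_power = np.abs(np.fft.fft(demo_signal)) ** 2
demo_freq = np.fft.fftfreq(demo_signal.size, d=1)
pos = demo_freq > 0
print demo_freq[pos][np.argmax(demo_power[pos])]  # ~1/96: one cycle per day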
def update_day_geo(uid_list, user_info_list):
    results = {}
    now_ts = time.time()
    # run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    ip_results = r_cluster.hmget('new_ip_' + str(now_date_ts - DAY), uid_list)
    count = 0
    for uid in uid_list:
        if uid not in results:
            results[uid] = {'activity_geo': {}, 'activity_geo_dict': []}
        uid_ip_results = ip_results[count]
        count += 1
        if uid_ip_results:
            uid_ip_dict = json.loads(uid_ip_results)
        else:
            uid_ip_dict = {}
        day_results = {}
        for ip in uid_ip_dict:
            ip_count = len(uid_ip_dict[ip].split('&'))
            geo, school = ip2city(ip)
            if geo:
                geo = geo.decode('utf-8')
                try:
                    day_results[geo] += ip_count
                except:
                    day_results[geo] = ip_count
        # update the activity_geo_dict
        activity_geo_history_list = json.loads(user_info_list[uid]['activity_geo_dict'])
        activity_geo_history_list.append(day_results)
        results[uid]['activity_geo_dict'] = json.dumps(activity_geo_history_list[-30:])
        # update the activity_geo
        week_activity_geo_list = activity_geo_history_list[-7:]
        week_geo_list = []
        for activity_geo_item in week_activity_geo_list:
            geo_list = activity_geo_item.keys()
            week_geo_list.extend(geo_list)
        week_geo_list = list(set(week_geo_list))
        week_geo_string = '&'.join(['&'.join(item.split('\t')) for item in week_geo_list])
        try:
            week_geo_aggs_string = '&'.join([item.split('\t')[-1] for item in week_geo_list])
        except:
            week_geo_aggs_string = ''
        results[uid]['activity_geo'] = week_geo_string
        results[uid]['activity_geo_aggs'] = week_geo_aggs_string
    return results
def get_attr_trend(uid_list):
    result = {}
    now_ts = time.time()
    date = ts2datetime(now_ts - 24 * 3600)
    timestamp = datetime2ts(date)
    # test
    timestamp = datetime2ts('2013-09-08')
    time_result = dict()
    segment_result = dict()
    for i in range(1, 8):
        ts = timestamp - i * 24 * 3600
        r_result = r_cluster.hmget('activity_' + str(ts), uid_list)
        for item in r_result:
            if item:
                item = json.loads(item)
                for segment in item:
                    try:
                        time_result[int(segment) / 16 * 15 * 60 * 16 + ts] += item[segment]
                    except:
                        time_result[int(segment) / 16 * 15 * 60 * 16 + ts] = item[segment]
                    try:
                        segment_result[int(segment) / 16 * 15 * 60 * 16] += item[segment]
                    except:
                        segment_result[int(segment) / 16 * 15 * 60 * 16] = item[segment]
    trend_list = []
    for i in range(1, 8):
        ts = timestamp - i * 24 * 3600
        for j in range(0, 6):
            time_seg = ts + j * 15 * 60 * 16
            if time_seg in time_result:
                trend_list.append((time_seg, time_result[time_seg]))
            else:
                trend_list.append((time_seg, 0))
    sort_trend_list = sorted(trend_list, key=lambda x: x[0], reverse=False)
    result['activity_trend'] = json.dumps(sort_trend_list)
    result['activity_time'] = json.dumps(segment_result)
    return result
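# --- worked example of the bucket arithmetic in get_attr_trend ---
# The 96 15-minute slots of a day fold into 6 four-hour buckets (Python 2
# integer division). Slot 37 (09:15-09:30) lands in bucket 2 (08:00-12:00):
#   37 / 16 = 2;  2 * 15 * 60 * 16 = 28800 seconds = 08:00
assert 37 / 16 * 15 * 60 * 16 == 28800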
def get_school(uid_list):
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    school_results = {}
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            if uid not in school_results:
                school_results[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                school = ip2school(ip)
                if school:
                    try:
                        school_results[uid][school] += ip_count
                    except:
                        school_results[uid][school] = ip_count
            count += 1
    results = {}
    for uid in uid_list:
        school_dict = school_results[uid]
        school_string = '&'.join(school_dict.keys())
        if school_dict != {}:
            is_school = '1'
        else:
            is_school = '0'
        results[uid] = {'is_school': is_school,
                        'school_string': school_string,
                        'school_dict': json.dumps(school_dict)}
    return results
def update_day_sensitive(uid_list):
    results = {}
    all_results = {}
    now_ts = time.time()
    # run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        count = 0
        sensitive_results = r_cluster.hmget('sensitive_' + str(ts), uid_list)
        for uid in uid_list:
            if uid not in results:
                results[uid] = {}
            sensitive_item = sensitive_results[count]
            count += 1  # keep the hmget results aligned with uid_list
            if sensitive_item:
                sensitive_dict = json.loads(sensitive_item)
            else:
                sensitive_dict = {}
            for sensitive in sensitive_dict:
                try:
                    results[uid][sensitive] += 1
                except:
                    results[uid][sensitive] = 1
    for uid in uid_list:
        user_sensitive_dict = results[uid]
        sensitive_score = 0
        for item in user_sensitive_dict:
            k = item
            v = user_sensitive_dict[k]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        sensitive_string = '&'.join(user_sensitive_dict.keys())
        all_results[uid] = {'sensitive_string': sensitive_string,
                            'sensitive_dict': user_sensitive_dict,
                            'sensitive': sensitive_score}
    return all_results
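# --- minimal sketch of the scoring rule in update_day_sensitive ---
# Each sensitive word contributes count * stage_weight, with the stage looked
# up in the 'sensitive_words' hash. Words, stages, and weights below are
# made-up placeholders.
def score_sensitive(word_counts, word_stage, stage_weight):
    score = 0
    for word, count in word_counts.items():
        stage = word_stage.get(word)
        if stage:
            score += count * stage_weight[str(stage)]
    return score

# score_sensitive({'w1': 2, 'w2': 1}, {'w1': 1, 'w2': 3},
#                 {'1': 1, '2': 2, '3': 4})  -> 2 * 1 + 1 * 4 = 6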
def get_flow_information(uid_list):
    # updated daily; only the previous day's data is computed
    result_dict = {}
    now_ts = time.time() - 3600 * 24
    now_date = ts2datetime(now_ts)  # date: 2013-09-01
    #now_date = "2013-09-08"
    user_hashtag_result = {}
    user_sensitive_hashtag = {}
    sensitive_words = {}
    user_ip_result = {}
    user_sensitive_ip = {}
    ts = datetime2ts(now_date)  # start of the 7-day window walked below
    for i in range(1, 8):
        ts = ts - 3600 * 24
        date = ts2datetime(ts).replace('-', '')
        hashtag_results = r_cluster.hmget('hashtag_' + str(date), uid_list)
        sensitive_hashtag = r_cluster.hmget('sensitive_hashtag_' + str(date), uid_list)
        ip_results = r_cluster.hmget('ip_' + str(date), uid_list)
        sensitive_ip = r_cluster.hmget('sensitive_ip_' + str(date), uid_list)
        sensitive_results = r_cluster.hmget('sensitive_' + str(date), uid_list)
        for j in range(0, len(uid_list)):
            uid = uid_list[j]
            if sensitive_results[j]:
                sensitive_words_results = json.loads(sensitive_results[j])
                if uid in sensitive_words:
                    sensitive_words[uid].update({date: sensitive_words_results})
                else:
                    sensitive_words[uid] = {date: sensitive_words_results}
            if hashtag_results[j]:
                hashtag_dict = json.loads(hashtag_results[j])
                if uid in user_hashtag_result:
                    user_hashtag_result[uid].update({date: hashtag_dict})
                else:
                    user_hashtag_result[uid] = {date: hashtag_dict}
            if sensitive_hashtag[j]:
                sensitive_hashtag_dict = json.loads(sensitive_hashtag[j])
                if uid in user_sensitive_hashtag:
                    user_sensitive_hashtag[uid].update({date: sensitive_hashtag_dict})
                else:
                    user_sensitive_hashtag[uid] = {date: sensitive_hashtag_dict}
            if ip_results[j]:
                ip_dict = json.loads(ip_results[j])
                if uid in user_ip_result:
                    user_ip_result[uid].update({date: ip_dict})
                else:
                    user_ip_result[uid] = {date: ip_dict}
            if sensitive_ip[j]:
                sensitive_ip_result = json.loads(sensitive_ip[j])
                if uid in user_sensitive_ip:
                    user_sensitive_ip[uid].update({date: sensitive_ip_result})
                else:
                    user_sensitive_ip[uid] = {date: sensitive_ip_result}
    for uid in uid_list:
        hashtag_string = ''
        sensitive_hashtag_string = ''
        ip_string = ''
        ip_all = ''
        sensitive_ip_string = ''
        hashtag_dict = {}
        sensitive_hashtag_dict = {}
        ip_dict = {}
        sensitive_ip_dict = {}
        sensitive_words_string = ''
        sensitive_words_dict = {}
        if uid in sensitive_words:
            sensitive_words_string = extract_string(sensitive_words[uid])
            sensitive_words_dict = json.dumps(sensitive_words[uid])
        if uid in user_hashtag_result:
            hashtag_string = extract_string(user_hashtag_result[uid])
            hashtag_dict = json.dumps(user_hashtag_result[uid])
        if uid in user_sensitive_hashtag:
            sensitive_hashtag_string = extract_string(user_sensitive_hashtag[uid])
            sensitive_hashtag_dict = json.dumps(user_sensitive_hashtag[uid])
        if uid in user_ip_result:
            ip_string = extract_geo(user_ip_result[uid])
            ip_dict = json.dumps(ip_to_geo(user_ip_result[uid]))
            ip_all = json.dumps(user_ip_result[uid])
        if uid in user_sensitive_ip:
            sensitive_ip_string = extract_geo(user_sensitive_ip[uid])
            sensitive_ip_dict = json.dumps(ip_to_geo(user_sensitive_ip[uid]))
        result_dict[uid] = {'hashtag_string': hashtag_string,
                            'hashtag_dict': hashtag_dict,
                            'sensitive_hashtag_string': sensitive_hashtag_string,
                            'sensitive_hashtag_dict': sensitive_hashtag_dict,
                            'geo_activity': ip_dict,
                            'geo_string': ip_string,
                            'ip': ip_all,
                            'sensitive_geo_activity': sensitive_ip_dict,
                            'sensitive_geo_string': sensitive_ip_string,
                            'sensitive_words_string': sensitive_words_string,
                            'sensitive_words_dict': sensitive_words_dict}
    return result_dict
def update_flow_information(user_info):
    # result = {uid: {'activity_geo_dict':'', 'activity_geo':'', 'hashtag_dict':'',
    #                 'hashtag':'', 'online_pattern_dict':'', 'online_pattern':''}}
    result = {}
    uid_list = user_info.keys()
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    timestamp = datetime2ts(now_date)
    # test
    timestamp = datetime2ts('2013-09-08')
    user_hashtag_dict = dict()
    user_online_dict = dict()
    user_geo_list_dict = dict()    # per-uid 30-day geo history
    user_geo_string_dict = dict()  # per-uid week activity_geo string
    new_day_ip_dict = dict()
    for i in range(7, 0, -1):
        ts = timestamp - 24 * 3600 * i
        print 'iter date:', ts2date(ts)
        hashtag_results = r_cluster.hmget('hashtag_' + str(ts), uid_list)
        online_pattern_results = r_cluster.hmget('online_' + str(ts), uid_list)
        if i == 1:  # most recent day in the window: fetch the ip-timestamp hash
            ip_result = r_cluster.hmget('ip_' + str(ts), uid_list)
        for j in range(0, len(uid_list)):
            uid = uid_list[j]
            # attr: hashtag
            if hashtag_results[j]:
                hashtag_dict = json.loads(hashtag_results[j])
                for hashtag in hashtag_dict:
                    if uid in user_hashtag_dict:
                        try:
                            user_hashtag_dict[uid][hashtag] += hashtag_dict[hashtag]
                        except:
                            user_hashtag_dict[uid][hashtag] = hashtag_dict[hashtag]
                    else:
                        user_hashtag_dict[uid] = {hashtag: hashtag_dict[hashtag]}
            '''
            #attr: online_pattern
            if online_pattern_results[j]:
                online_pattern_dict = json.loads(online_pattern_results[j])
                for online_pattern in online_pattern_dict:
                    if uid in user_online_dict:
                        try:
                            user_online_dict[uid][online_pattern] += online_pattern_dict[online_pattern]
                        except:
                            user_online_dict[uid][online_pattern] = online_pattern_dict[online_pattern]
                    else:
                        user_online_dict[uid] = {online_pattern: online_pattern_dict[online_pattern]}
            '''
            # attr: activity_geo by ip-timestamp
            if i == 1 and ip_result[j]:
                ip_timestamp_dict = json.loads(ip_result[j])
                old_flow_information = user_info[uid]
                old_day_geo_list = json.loads(old_flow_information['activity_geo_dict'])
                new_day_ip_dict[uid] = {}
                for ip in ip_timestamp_dict:
                    ip_count = len(ip_timestamp_dict[ip].split('&'))
                    new_day_ip_dict[uid][ip] = ip_count
                geo_dict = ip2city(new_day_ip_dict[uid])
                # keep at most 30 days of history
                if len(old_day_geo_list) >= 30:
                    new_day_geo_list = old_day_geo_list[1:] + [geo_dict]
                else:
                    new_day_geo_list = old_day_geo_list + [geo_dict]
                user_geo_list_dict[uid] = new_day_geo_list
                week_geo_list = []
                for day_geo_dict in new_day_geo_list[-7:]:
                    week_geo_list.extend(day_geo_dict.keys())
                week_geo_list = list(set(week_geo_list))
                new_week_geo_list = []
                for geo_string in week_geo_list:
                    day_geo_string = '&'.join(geo_string.split('\t'))
                    new_week_geo_list.append(day_geo_string)
                activity_geo_string = '&'.join(new_week_geo_list)
                user_geo_string_dict[uid] = activity_geo_string
                print 'activity_geo_string:', activity_geo_string
    for uid in uid_list:
        # attr: hashtag
        try:
            hashtag_dict = user_hashtag_dict[uid]
            hashtag_string = json.dumps(hashtag_dict)
            hashtag_list = '&'.join(hashtag_dict.keys())
        except KeyError:
            hashtag_string = ''
            hashtag_list = ''
        '''
        #attr: online_pattern
        try:
            online_dict = user_online_dict[uid]
            online_string = json.dumps(online_dict)
            online_list = '&'.join(online_dict.keys())
        except KeyError:
            online_string = ''
            online_list = ''
        '''
        online_pattern_string = ''  # online_pattern attrs are disabled above
        online_pattern_list = ''
        result[uid] = {'hashtag_dict': hashtag_string,
                       'hashtag': hashtag_list,
                       'activity_geo_dict': json.dumps(user_geo_list_dict.get(uid, [])),
                       'activity_geo': user_geo_string_dict.get(uid, ''),
                       'online_pattern_dict': online_pattern_string,
                       'online_pattern': online_pattern_list}
    return result
data = []
for ts in tss:
    ns = "hashtag_" + str(ts)
    hashtag_list = R_CLUSTER_FLOW2.hmget(ns, uidlist)
    hashtag_list = [json.loads(h) if h else None for h in hashtag_list]
    uhlist = zip(uidlist, hashtag_list)
    uhtlist = []
    for uh in uhlist:
        uh = list(uh)
        uh.append(ts)
        uhtlist.append(uh)
    data.extend(uhtlist)
with open("hashtag_0521.txt", "w") as fw:
    for d in data:
        if d[1] is not None:
            fw.write("%s\n" % json.dumps(d))

at_data = []
for ts in tss:
    ns = "at_" + str(ts)
    at_list = R_CLUSTER_FLOW2.hmget(ns, uidlist)
    at_list = [json.loads(a) if a else None for a in at_list]
    uatlist = []
    for ua in zip(uidlist, at_list):
        ua = list(ua)
        ua.append(ts)
        uatlist.append(ua)
    at_data.extend(uatlist)
with open("at_0521.txt", "w") as fw:
    for a in at_data:
        if a[1] is not None:
            fw.write("%s\n" % json.dumps(a))
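# --- hedged sketch of the inputs the dump scripts above assume ---
# tss is a list of day-aligned timestamps and uidlist the uids to export;
# the date and file name are placeholders, and DAY matches the project's
# constant (assumed to be one day in seconds).
DAY = 24 * 3600
tss = [datetime2ts('2016-05-21') - DAY * i for i in range(7, 0, -1)]
uidlist = [line.strip() for line in open('uid_list.txt')]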
def get_flow_information(uid_list):
    # results = {uid: {'hashtag_dict':{}, 'hashtag':'', 'keywords_dict':{}, 'keywords_string':'',
    #                  'activity_geo':'', 'activity_geo_dict':dict}}
    results = {}
    # iter_results = {uid: {'hashtag': hashtag_dict, 'geo': geo_dict, 'keywords': keywords_dict}}
    iter_results = {}
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    # test
    now_date_ts = test_ts
    for i in range(7, 0, -1):
        ts = now_date_ts - DAY * i
        iter_date = ts2datetime(ts)
        flow_text_index_name = flow_text_index_name_pre + iter_date
        uid_day_geo = {}
        # compute hashtag and geo
        hashtag_results = r_cluster.hmget('hashtag_' + str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        # compute sensitive_words
        sensitive_results = r_cluster.hmget('sensitive_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            # init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {'hashtag': {}, 'geo': {}, 'geo_track': [],
                                     'keywords': {}, 'filter_keywords': {}, 'sensitive': {}}
            # compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[hashtag]
            # compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][sensitive_word] = uid_sensitive_dict[sensitive_word]
            # compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo = ip2city(ip)
                if geo:
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1
        # compute keywords
        try:
            text_results = es_flow_text.search(
                index=flow_text_index_name, doc_type=flow_text_index_type,
                body={'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}},
                      'size': MAX_VALUE},
                _source=True, fields=['uid', 'keywords_dict', 'text'])['hits']['hits']
        except:
            text_results = {}
        for item in text_results:
            uid = item['fields']['uid'][0]
            uid_keywords_dict = json.loads(item['fields']['keywords_dict'][0])
            for keywords in uid_keywords_dict:
                try:
                    iter_results[uid]['keywords'][keywords] += uid_keywords_dict[keywords]
                except:
                    iter_results[uid]['keywords'][keywords] = uid_keywords_dict[keywords]
            # jln filter keyword 2016/11/08
            weibo_text = json.loads(item['fields']['text'][0])
            filter_keywords_dict = get_weibo_single(weibo_text)
            for keywords in filter_keywords_dict:
                try:
                    iter_results[uid]['filter_keywords'][keywords] += filter_keywords_dict[keywords]
                except:
                    iter_results[uid]['filter_keywords'][keywords] = filter_keywords_dict[keywords]
    # get keywords top
    for uid in uid_list:
        results[uid] = {}
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        # sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        for item in sensitive_word_dict:
            k = item
            v = sensitive_word_dict[k]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        results[uid]['sensitive'] = sensitive_score
        # geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(['&'.join(item.split('\t'))
                                                 for item in geo_dict_keys])
        # keywords
        keywords_dict = iter_results[uid]['keywords']
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50]
        keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in keywords_top50])
        filter_keywords_dict = iter_results[uid]['filter_keywords']
        f_keywords_top50 = sorted(filter_keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50]
        f_keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in f_keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
        results[uid]['filter_keywords'] = json.dumps(f_keywords_top50)
        results[uid]['filter_keywords_string'] = f_keywords_top50_string
    return results
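# --- worked example of the top-50 selection above (illustrative values) ---
kw = {'a': 3, 'b': 7, 'c': 5}
top = sorted(kw.items(), key=lambda x: x[1], reverse=True)[:50]
print top                              # [('b', 7), ('c', 5), ('a', 3)]
print '&'.join([k for k, v in top])    # b&c&a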
def get_flow_information_v2(uid_list, all_user_keywords_dict):
    # results = {uid: {'hashtag_dict':{}, 'hashtag':'', 'keywords_dict':{}, 'keywords_string':'',
    #                  'activity_geo':'', 'activity_geo_dict':dict, 'activity_geo_aggs':''}}
    results = {}
    # iter_results = {uid: {'hashtag': hashtag_dict, 'geo': geo_dict, 'keywords': keywords_dict}}
    iter_results = {}
    now_ts = time.time()
    # run_type
    today_sensitive_results = {}
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    print 'run_type:', RUN_TYPE
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        uid_day_geo = {}
        # compute hashtag and geo
        hashtag_results = r_cluster_3.hmget('hashtag_' + str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        # compute sensitive_words
        sensitive_results = r_cluster_3.hmget('sensitive_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            # init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {'hashtag': {}, 'geo': {}, 'geo_track': [],
                                     'keywords': {}, 'sensitive': {}, 'school': {},
                                     'week_ip': {0: {}, 1: {}, 2: {}, 3: {}, 4: {}, 5: {}},
                                     'ip': {}}
            if uid not in today_sensitive_results:
                today_sensitive_results[uid] = {}
            # compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[hashtag]
            # compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][sensitive_word] = uid_sensitive_dict[sensitive_word]
                if ts == now_date_ts - DAY:
                    try:
                        today_sensitive_results[uid][sensitive_word] += uid_sensitive_dict[sensitive_word]
                    except:
                        today_sensitive_results[uid][sensitive_word] = uid_sensitive_dict[sensitive_word]
            # compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo, school = ip2city(ip)
                if geo:
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
                if school:
                    try:
                        iter_results[uid]['school'][school] += ip_count
                    except:
                        iter_results[uid]['school'][school] = ip_count
                # deal ip: job_ip & home_ip & active_ip
                ip_time_list = uid_ip_dict[ip].split('&')
                try:
                    iter_results[uid]['ip'][ip] += ip_count
                except:
                    iter_results[uid]['ip'][ip] = ip_count
                for ip_time_item in ip_time_list:
                    # bucket each visit into one of six 4-hour segments of the day
                    ip_timesegment = (int(ip_time_item) - ts) / IP_TIME_SEGMENT
                    try:
                        iter_results[uid]['week_ip'][ip_timesegment][ip] += 1
                    except KeyError:
                        iter_results[uid]['week_ip'][ip_timesegment][ip] = 1
                # end deal ip
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1
    # get keywords top
    for uid in uid_list:
        results[uid] = {}
        # hashtag
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        # sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        today_sensitive_results_user = today_sensitive_results[uid]
        for sensitive_item in today_sensitive_results_user:
            k = sensitive_item
            v = today_sensitive_results_user[sensitive_item]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        results[uid]['sensitive'] = sensitive_score
        # geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(['&'.join(item.split('\t'))
                                                 for item in geo_dict_keys])
        try:
            results[uid]['activity_geo_aggs'] = '&'.join([item.split('\t')[-1]
                                                          for item in geo_dict_keys])
        except:
            results[uid]['activity_geo_aggs'] = ''
        # keywords
        keywords_dict = all_user_keywords_dict[uid]
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50]
        keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
        # school dict
        school_dict = iter_results[uid]['school']
        school_string = '&'.join(school_dict.keys())
        if school_dict != {}:
            is_school = '1'
        else:
            is_school = '0'
        results[uid]['is_school'] = is_school
        results[uid]['school_string'] = school_string
        results[uid]['school_dict'] = json.dumps(school_dict)
        # ip: job_ip & home_ip & activity_ip
        # activity_ip
        all_ip_dict = iter_results[uid]['ip']
        sort_all_ip = sorted(all_ip_dict.items(), key=lambda x: x[1], reverse=True)
        try:
            activity_ip = sort_all_ip[0][0]
        except IndexError:
            activity_ip = ''
        results[uid]['activity_ip'] = str(activity_ip)
        # job_ip & home_ip
        week_time_ip_dict = iter_results[uid]['week_ip']
        for i in range(0, 6):
            if i not in week_time_ip_dict:
                week_time_ip_dict[i] = {}
        home_ip, job_ip = get_ip_description(week_time_ip_dict)
        results[uid]['home_ip'] = str(home_ip)
        results[uid]['job_ip'] = str(job_ip)
    return results
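# --- sketch of the week_ip time buckets used above ---
# Assumes IP_TIME_SEGMENT = 4 * 3600 (the project's config constant) and that
# ts is midnight of the day being processed; a visit at 09:30 then falls in
# bucket 2 (08:00-12:00).
IP_TIME_SEGMENT = 4 * 3600    # assumption; normally imported from config
ts = 0                        # stand-in for the day's midnight timestamp
visit = 9 * 3600 + 30 * 60
print (visit - ts) / IP_TIME_SEGMENT   # -> 2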
def get_flow_information_v2(uid_list, all_user_keywords_dict):
    # results = {uid: {'hashtag_dict':{}, 'hashtag':'', 'keywords_dict':{}, 'keywords_string':'',
    #                  'activity_geo':'', 'activity_geo_dict':dict, 'activity_geo_aggs':''}}
    results = {}
    # iter_results = {uid: {'hashtag': hashtag_dict, 'geo': geo_dict, 'keywords': keywords_dict}}
    iter_results = {}
    now_ts = time.time()
    # run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        print ts
        uid_day_geo = {}
        # compute hashtag and geo
        hashtag_results = r_cluster.hmget('hashtag_' + str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        # compute sensitive_words
        sensitive_results = r_cluster.hmget('sensitive_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            # init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {'hashtag': {}, 'geo': {}, 'geo_track': [],
                                     'keywords': {}, 'sensitive': {}}
            # compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[hashtag]
            # compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][sensitive_word] = uid_sensitive_dict[sensitive_word]
            # compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo = ip2city(ip)
                if geo:
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1
    # get keywords top
    for uid in uid_list:
        results[uid] = {}
        # hashtag
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        # sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        for sensitive_item in sensitive_word_dict:
            k = sensitive_item
            v = sensitive_word_dict[sensitive_item]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        results[uid]['sensitive'] = sensitive_score
        # geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(['&'.join(item.split('\t'))
                                                 for item in geo_dict_keys])
        try:
            results[uid]['activity_geo_aggs'] = '&'.join([item.split('\t')[-1]
                                                          for item in geo_dict_keys])
        except:
            results[uid]['activity_geo_aggs'] = ''
        # keywords
        keywords_dict = all_user_keywords_dict[uid]
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50]
        keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
    return results
def get_flow_information_v2(uid_list, all_user_keywords_dict):
    # results = {uid: {'hashtag_dict':{}, 'hashtag':'', 'keywords_dict':{}, 'keywords_string':'',
    #                  'activity_geo':'', 'activity_geo_dict':dict, 'activity_geo_aggs':''}}
    results = {}
    # iter_results = {uid: {'hashtag': hashtag_dict, 'geo': geo_dict, 'keywords': keywords_dict}}
    iter_results = {}
    now_ts = time.time()
    # run_type
    today_sensitive_results = {}
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        uid_day_geo = {}
        # compute hashtag and geo
        hashtag_results = r_cluster_3.hmget('hashtag_' + str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        # compute sensitive_words
        sensitive_results = r_cluster_3.hmget('sensitive_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            # init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {'hashtag': {}, 'geo': {}, 'geo_track': [],
                                     'keywords': {}, 'sensitive': {}, 'school': {}}
            if uid not in today_sensitive_results:
                today_sensitive_results[uid] = {}
            # compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[hashtag]
            # compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][sensitive_word] = uid_sensitive_dict[sensitive_word]
                if ts == now_date_ts - DAY:
                    try:
                        today_sensitive_results[uid][sensitive_word] += uid_sensitive_dict[sensitive_word]
                    except:
                        today_sensitive_results[uid][sensitive_word] = uid_sensitive_dict[sensitive_word]
            # compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo, school = ip2city(ip)
                if geo:
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
                if school:
                    try:
                        iter_results[uid]['school'][school] += ip_count
                    except:
                        iter_results[uid]['school'][school] = ip_count
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1
    # get keywords top
    for uid in uid_list:
        results[uid] = {}
        # hashtag
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        # sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        today_sensitive_results_user = today_sensitive_results[uid]
        for sensitive_item in today_sensitive_results_user:
            k = sensitive_item
            v = today_sensitive_results_user[sensitive_item]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        results[uid]['sensitive'] = sensitive_score
        # geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(['&'.join(item.split('\t'))
                                                 for item in geo_dict_keys])
        try:
            results[uid]['activity_geo_aggs'] = '&'.join([item.split('\t')[-1]
                                                          for item in geo_dict_keys])
        except:
            results[uid]['activity_geo_aggs'] = ''
        # keywords
        keywords_dict = all_user_keywords_dict[uid]
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50]
        keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
        # school dict
        school_dict = iter_results[uid]['school']
        school_string = '&'.join(school_dict.keys())
        if school_dict != {}:
            is_school = '1'
        else:
            is_school = '0'
        results[uid]['is_school'] = is_school
        results[uid]['school_string'] = school_string
        results[uid]['school_dict'] = json.dumps(school_dict)
    return results
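# --- minimal sketch of the is_school flag above (illustrative values) ---
# Any school hit across the week marks the user as a campus user.
demo_school_dict = {'XX University': 4}
print '1' if demo_school_dict else '0'    # is_school
print '&'.join(demo_school_dict.keys())   # school_string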
def get_flow_information(uid_list):
    # results = {uid: {'hashtag_dict':{}, 'hashtag':'', 'keywords_dict':{}, 'keywords_string':'',
    #                  'activity_geo':'', 'activity_geo_dict':dict}}
    results = {}
    # iter_results = {uid: {'hashtag': hashtag_dict, 'geo': geo_dict, 'keywords': keywords_dict}}
    iter_results = {}
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    # test
    now_date_ts = test_ts
    for i in range(7, 0, -1):
        ts = now_date_ts - DAY * i
        iter_date = ts2datetime(ts)
        flow_text_index_name = flow_text_index_name_pre + iter_date
        uid_day_geo = {}
        # compute hashtag and geo
        hashtag_results = r_cluster_3.hmget('hashtag_' + str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        # compute sensitive_words
        sensitive_results = r_cluster_3.hmget('sensitive_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            # init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {'hashtag': {}, 'geo': {}, 'geo_track': [],
                                     'keywords': {}, 'sensitive': {}}
            # compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[hashtag]
            # compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][sensitive_word] = uid_sensitive_dict[sensitive_word]
            # compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo = ip2city(ip)
                if geo:
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1
        # compute keywords
        try:
            text_results = es_flow_text.search(
                index=flow_text_index_name, doc_type=flow_text_index_type,
                body={'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}},
                      'size': MAX_VALUE},
                _source=True, fields=['uid', 'keywords_dict'])['hits']['hits']
        except:
            text_results = {}
        for item in text_results:
            uid = item['fields']['uid'][0]
            uid_keywords_dict = json.loads(item['fields']['keywords_dict'][0])
            for keywords in uid_keywords_dict:
                try:
                    iter_results[uid]['keywords'][keywords] += uid_keywords_dict[keywords]
                except:
                    iter_results[uid]['keywords'][keywords] = uid_keywords_dict[keywords]
    # get keywords top
    for uid in uid_list:
        results[uid] = {}
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        # sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        for item in sensitive_word_dict:
            k = item
            v = sensitive_word_dict[k]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        results[uid]['sensitive'] = sensitive_score
        # geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(['&'.join(item.split('\t'))
                                                 for item in geo_dict_keys])
        # keywords
        keywords_dict = iter_results[uid]['keywords']
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50]
        keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
    return results
def get_flow_information_v2(uid_list, all_user_keywords_dict):
    # results = {uid: {'hashtag_dict':{}, 'hashtag':'', 'keywords_dict':{}, 'keywords_string':'',
    #                  'activity_geo':'', 'activity_geo_dict':dict, 'activity_geo_aggs':''}}
    results = {}
    # iter_results = {uid: {'hashtag': hashtag_dict, 'geo': geo_dict, 'keywords': keywords_dict}}
    iter_results = {}
    now_ts = time.time()
    # run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        uid_day_geo = {}
        # compute hashtag and geo
        hashtag_results = r_cluster.hmget('hashtag_' + str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            # init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {'hashtag': {}, 'geo': {}, 'geo_track': [], 'keywords': {}}
            # compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[hashtag]
            # compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo = ip2city(ip)
                if geo:
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1
    # get keywords top
    for uid in uid_list:
        results[uid] = {}
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(['&'.join(item.split('\t'))
                                                 for item in geo_dict_keys])
        try:
            results[uid]['activity_geo_aggs'] = '&'.join([item.split('\t')[-1]
                                                          for item in geo_dict_keys])
        except:
            results[uid]['activity_geo_aggs'] = ''
        keywords_dict = all_user_keywords_dict[uid]
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50]
        keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
    return results