def cal_text_work(item):
    uid = item['uid']
    timestamp = item['timestamp']
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    text = item['text']
    if isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
    RE = re.compile(u'#([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#', re.UNICODE)
    hashtag_list = RE.findall(text)
    if hashtag_list:
        # all hashtags here are unicode
        hashtag_dict = dict()
        for hashtag in hashtag_list:
            try:
                hashtag_dict[hashtag] += 1
            except:
                hashtag_dict[hashtag] = 1
        try:
            hashtag_count_string = r_cluster.hget('hashtag_' + str(ts), str(uid))
            hashtag_count_dict = json.loads(hashtag_count_string)
            for hashtag in hashtag_dict:
                count = hashtag_dict[hashtag]
                try:
                    hashtag_count_dict[hashtag] += count
                except:
                    hashtag_count_dict[hashtag] = count
            r_cluster.hset('hashtag_' + str(ts), str(uid), json.dumps(hashtag_count_dict))
        except:
            r_cluster.hset('hashtag_' + str(ts), str(uid), json.dumps(hashtag_dict))
def update_day_hashtag(uid_list):
    results = {}
    all_results = {}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        count = 0
        hashtag_results = r_cluster_3.hmget('hashtag_' + str(ts), uid_list)
        for uid in uid_list:
            if uid not in results:
                results[uid] = {}
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                hashtag_dict = json.loads(hashtag_item)
            else:
                hashtag_dict = {}
            for hashtag in hashtag_dict:
                try:
                    results[uid][hashtag] += 1
                except:
                    results[uid][hashtag] = 1
            count += 1
    for uid in uid_list:
        user_hashtag_dict = results[uid]
        hashtag_string = '&'.join(user_hashtag_dict.keys())
        all_results[uid] = {'hashtag': hashtag_string,
                            'hashtag_dict': json.dumps(user_hashtag_dict)}
    return all_results
def scan_offline_task():
    query = {"query": {"bool": {"must": [{"term": {"status": 0}}]}}, "size": 1000}
    results = es_user_portrait.search(index=USER_RANK_KEYWORD_TASK_INDEX,
                                      doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                                      body=query)['hits']['hits']
    if results:
        for item in results:
            task_id = item['_id']
            iter_item = item['_source']
            search_type = iter_item['search_type']
            pre = iter_item['pre']
            during = iter_item['during']
            start_time = iter_item['start_time']
            keyword = json.loads(iter_item['keyword'])
            search_key = iter_item['user_ts']
            number = iter_item['number']
            sort_norm = iter_item['sort_norm']
            sort_scope = iter_item['sort_scope']
            time = iter_item['time']
            isall = iter_item['isall']
            redis_task.lpush("task_user_rank",
                             json.dumps([task_id, search_type, pre, during, start_time,
                                         keyword, search_key, sort_norm, sort_scope,
                                         time, isall, number]))
            iter_item['status'] = -1
            task_id = item['_id']
            #print item
            es_user_portrait.index(index=USER_RANK_KEYWORD_TASK_INDEX,
                                   doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                                   id=task_id, body=iter_item)
def update_day_sensitive(uid_list):
    results = {}
    all_results = {}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    today_sensitive_dict = {}
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        count = 0
        sensitive_results = r_cluster_3.hmget('sensitive_' + str(ts), uid_list)
        #print 'sensitive_results:', sensitive_results
        for uid in uid_list:
            if uid not in results:
                results[uid] = {}
            sensitive_item = sensitive_results[count]
            if uid not in today_sensitive_dict:
                today_sensitive_dict[uid] = {}
            if sensitive_item:
                sensitive_dict = json.loads(sensitive_item)
            else:
                sensitive_dict = {}
            for sensitive in sensitive_dict:
                try:
                    results[uid][sensitive] += 1
                except:
                    results[uid][sensitive] = 1
                if ts == now_date_ts - DAY:
                    try:
                        today_sensitive_dict[uid][sensitive] += 1
                    except:
                        today_sensitive_dict[uid][sensitive] = 1
            count += 1
    #print 'results:', results
    for uid in uid_list:
        user_sensitive_dict = results[uid]
        #print 'uid,sensitive:', uid, user_sensitive_dict
        sensitive_score = 0
        today_sensitive_dict_user = today_sensitive_dict[uid]
        for item in today_sensitive_dict_user:
            k = item
            v = today_sensitive_dict_user[k]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        #print 'sensitive_score:', sensitive_score
        sensitive_string = '&'.join(user_sensitive_dict.keys())
        #print 'uid, sensitive:', uid, sensitive_string, sensitive_score
        all_results[uid] = {'sensitive_string': sensitive_string,
                            'sensitive_dict': json.dumps(user_sensitive_dict),
                            'sensitive': sensitive_score}
    #print 'all_results:', all_results
    return all_results
def cron_task(data):
    key_words_search(data[0], data[1], data[2], data[3], data[4], data[5], data[6],
                     data[7], data[8], data[9], data[10], data[11])

if __name__ == "__main__":
    scan_offline_task()
    while 1:
        data = redis_task.rpop("task_user_rank")
        print data
        if data:
            try:
                cron_task(json.loads(data))
            except Exception, e:
                print e, '&error&', ts2date(time.time())
        else:
            break
            statusnum = 0
    else:
        uname = uid
        location = ''
        try:
            fansnum = bci_history_dict['fields']['user_fansnum'][0]
        except:
            fansnum = 0
        try:
            statusnum = bci_history_dict['fields']['weibo_month_sum'][0]
        except:
            statusnum = 0
    if status == 'show_in':
        if user_type == "sensitive":
            tmp_ts = datetime2ts(date) - DAY
            tmp_data = r_cluster.hget("sensitive_" + str(tmp_ts), uid)
            if tmp_data:
                sensitive_dict = json.loads(tmp_data)
                sensitive_words = sensitive_dict.keys()
            else:
                sensitive_words = []
            if sensitive_history_dict.get('fields', 0):
                #print sensitive_history_dict['fields'][sensitive_string][0]
                #print top_sensitive
                sensitive_value = math.log(sensitive_history_dict['fields'][sensitive_string][0] / float(top_sensitive) * 9 + 1, 10) * 100
                #print "sensitive_value", sensitive_value
            else:
                sensitive_value = 0
            results.append([uid, uname, location, fansnum, statusnum, influence, sensitive_words, sensitive_value])
        else:
            results.append([uid, uname, location, fansnum, statusnum, influence])
def main():
    if RUN_TYPE:
        now_ts = time.time() - DAY  # the previous day
        now_ts = datetime2ts('2016-03-27')
        ts = str(datetime2ts(ts2datetime(now_ts)))
    else:
        ts = str(datetime2ts('2016-03-16'))
        now_ts = int(ts)
    print now_ts
    sensitive_string = "sensitive_" + ts
    date_string = ts
    update_sensitive_key = "sensitive_score_" + ts           # key to update
    sensitive_dict_key = "sensitive_dict_" + ts
    sensitive_string_key = "sensitive_string_" + ts
    sensitive_day_change_key = "sensitive_" + ts + "_day_change"
    del_month = datetime2ts(ts2datetime(now_ts - MONTH))
    del_sensitive_key = "sensitive_score_" + str(del_month)  # key to delete
    former_ts = int(ts) - DAY
    former_date = str(datetime2ts(ts2datetime(former_ts)))
    former_sensitive_key = "sensitive_score_" + former_date
    iter_count = 0
    bulk_action = []
    mappings(ES_SENSITIVE_INDEX)
    total_number = r.hlen(sensitive_string)
    scan_cursor = 0
    print total_number
    # the first pass below is disabled; only the catch-up scan further down runs
    """
    while 1:
        re_scan = r.hscan(sensitive_string, scan_cursor, count=1000)
        scan_cursor = re_scan[0]
        if len(re_scan[1]) != 0:
            sensitive_info = re_scan[1]  # dict of uid: sensitive_words_dict
            uid_list = sensitive_info.keys()
            sensitive_results = es.mget(index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX, body={"ids": uid_list})['docs']
            if sensitive_results:
                for item in sensitive_results:
                    uid = item['_id']
                    sensitive_words_dict = json.loads(sensitive_info[uid])
                    current_sensitive_score = 0
                    for k, v in sensitive_words_dict.iteritems():
                        tmp_stage = r_sensitive.hget("sensitive_words", k)
                        if tmp_stage:
                            current_sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
                    if item['found']:  # the user already has a record
                        revise_item = item["_source"]
                        if del_sensitive_key in revise_item:
                            revise_item.pop(del_sensitive_key)
                        revise_item['uid'] = uid
                        # newly computed sensitive score
                        revise_item[update_sensitive_key] = current_sensitive_score
                        # newly computed sensitive words
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        # newly computed sensitive string
                        revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys())
                        # difference against the previous day, the weekly average and the monthly average
                        revise_item['sensitive_day_change'] = current_sensitive_score - revise_item.get(former_sensitive_key, 0)
                        revise_item['sensitive_week_change'] = current_sensitive_score - revise_item.get('sensitive_week_ave', 0)
                        revise_item['sensitive_month_change'] = current_sensitive_score - revise_item.get('sensitive_month_ave', 0)
                        # updated weekly/monthly average and variance
                        revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
                        revise_item['sensitive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)
                    else:
                        revise_item = dict()
                        revise_item['uid'] = uid
                        revise_item[update_sensitive_key] = current_sensitive_score
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys())
                        revise_item['sensitive_day_change'] = current_sensitive_score
                        revise_item['sensitive_week_change'] = current_sensitive_score
                        revise_item['sensitive_month_change'] = current_sensitive_score
                        revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
                        revise_item['sensitive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)
                    action = {'index': {'_id': uid}}
                    bulk_action.extend([action, revise_item])
                    iter_count += 1
                    if iter_count % 1000 == 0:
                        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
                        bulk_action = []
                        print iter_count
        if int(scan_cursor) == 0:
            break
    if bulk_action:
        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
    print iter_count
    """
    ####### update the users whose records have not been refreshed yet
    update_scan = scan(es,
                       query={"query": {"filtered": {"filter": {"missing": {"field": update_sensitive_key}}}}},
                       index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
    iter_count = 0
    bulk_action = []
    while 1:
        try:
            tmp = update_scan.next()
            revise_item = tmp['_source']
            if del_sensitive_key in revise_item:
                revise_item.pop(del_sensitive_key)
            uid = revise_item['uid']
            # newly computed sensitive score
            revise_item[update_sensitive_key] = 0
            # newly computed sensitive words
            revise_item[sensitive_dict_key] = json.dumps({})
            # newly computed sensitive string
            revise_item[sensitive_string_key] = ""
            # difference against the previous day, the weekly average and the monthly average
            revise_item['sensitive_day_change'] = 0 - revise_item.get(former_sensitive_key, 0)
            revise_item['sensitive_week_change'] = 0 - revise_item.get('sensitive_week_ave', 0)
            revise_item['sensitive_month_change'] = 0 - revise_item.get('sensitive_month_ave', 0)
            # updated weekly/monthly average and variance
            revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
            revise_item['sensitive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)
            action = {'index': {'_id': uid}}
            bulk_action.extend([action, revise_item])
            iter_count += 1
            if iter_count % 1000 == 0:
                es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
                bulk_action = []
                print iter_count
        except StopIteration:
            print "all done"
            if bulk_action:
                es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
            break
        except Exception, r:
            print Exception, r
    if bulk_action:
        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
def get_flow_information(uid_list):
    results = {}
    #results = {uid:{'hashtag_dict':{}, 'hashtag':'', 'keywords_dict':{}, 'keywords_string':'', 'activity_geo':'', 'activity_geo_dict':dict}}
    iter_results = {}
    #iter_results = {uid:{'hashtag': hashtag_dict, 'geo': geo_dict, 'keywords': keywords_dict}}
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    #test
    now_date_ts = test_ts
    for i in range(7, 0, -1):
        ts = now_date_ts - DAY * i
        iter_date = ts2datetime(ts)
        flow_text_index_name = flow_text_index_name_pre + iter_date
        uid_day_geo = {}
        #compute hashtag and geo
        hashtag_results = r_cluster_3.hmget('hashtag_' + str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        #compute sensitive_words
        sensitive_results = r_cluster_3.hmget('sensitive_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            #init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {'hashtag': {}, 'geo': {}, 'geo_track': [], 'keywords': {}, 'sensitive': {}}
            #compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[hashtag]
            #compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][sensitive_word] = uid_sensitive_dict[sensitive_word]
            #compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo = ip2city(ip)
                if geo:
                    #print 'geo:', geo
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1
        #compute keywords
        try:
            text_results = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                               body={'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}}, 'size': MAX_VALUE},
                                               _source=True, fields=['uid', 'keywords_dict'])['hits']['hits']
        except:
            text_results = {}
        for item in text_results:
            #print 'keywords item:', item
            uid = item['fields']['uid'][0]
            uid_keywords_dict = json.loads(item['fields']['keywords_dict'][0])
            for keywords in uid_keywords_dict:
                try:
                    iter_results[uid]['keywords'][keywords] += uid_keywords_dict[keywords]
                except:
                    iter_results[uid]['keywords'][keywords] = uid_keywords_dict[keywords]
    #get keywords top
    for uid in uid_list:
        results[uid] = {}
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        #sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        for item in sensitive_word_dict:
            k = item
            v = sensitive_word_dict[k]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        results[uid]['sensitive'] = sensitive_score
        #print 'sensitive_dict:', results[uid]['sensitive_dict']
        #print 'sensitive_string:', results[uid]['sensitive_string']
        #print 'sensitive:', results[uid]['sensitive']
        #geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        #print 'geo_dict_keys:', geo_dict_keys
        results[uid]['activity_geo'] = '&'.join(['&'.join(item.split('\t')) for item in geo_dict_keys])
        #print 'activity_geo:', results[uid]['activity_geo']
        keywords_dict = iter_results[uid]['keywords']
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50]
        keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
    return results
def get_flow_information_v2(uid_list, all_user_keywords_dict):
    results = {}
    #results = {uid:{'hashtag_dict':{}, 'hashtag':'', 'keywords_dict':{}, 'keywords_string':'', 'activity_geo':'', 'activity_geo_dict':dict, 'activity_geo_aggs':''}}
    iter_results = {}
    #iter_results = {uid:{'hashtag': hashtag_dict, 'geo': geo_dict, 'keywords': keywords_dict}}
    now_ts = time.time()
    #run_type
    today_sensitive_results = {}
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        uid_day_geo = {}
        #compute hashtag and geo
        hashtag_results = r_cluster_3.hmget('hashtag_' + str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        #compute sensitive_words
        sensitive_results = r_cluster_3.hmget('sensitive_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            #init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {'hashtag': {}, 'geo': {}, 'geo_track': [], 'keywords': {}, 'sensitive': {}, 'school': {}}
            if uid not in today_sensitive_results:
                today_sensitive_results[uid] = {}
            #compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[hashtag]
            #compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][sensitive_word] = uid_sensitive_dict[sensitive_word]
                if ts == now_date_ts - DAY:
                    try:
                        today_sensitive_results[uid][sensitive_word] += uid_sensitive_dict[sensitive_word]
                    except:
                        today_sensitive_results[uid][sensitive_word] = uid_sensitive_dict[sensitive_word]
            #compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo, school = ip2city(ip)
                if geo:
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
                if school:
                    try:
                        iter_results[uid]['school'][school] += ip_count
                    except:
                        iter_results[uid]['school'][school] = ip_count
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1
    #get keywords top
    for uid in uid_list:
        results[uid] = {}
        #hashtag
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        #sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        today_sensitive_results_user = today_sensitive_results[uid]
        for sensitive_item in today_sensitive_results_user:
            k = sensitive_item
            v = today_sensitive_results_user[sensitive_item]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        results[uid]['sensitive'] = sensitive_score
        #print 'sensitive_dict:', results[uid]['sensitive_dict']
        #print 'sensitive_string:', results[uid]['sensitive_string']
        #print 'sensitive:', results[uid]['sensitive']
        #geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(['&'.join(item.split('\t')) for item in geo_dict_keys])
        try:
            results[uid]['activity_geo_aggs'] = '&'.join([item.split('\t')[-1] for item in geo_dict_keys])
        except:
            results[uid]['activity_geo_aggs'] = ''
        #keywords
        keywords_dict = all_user_keywords_dict[uid]
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50]
        keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
        #school dict
        school_dict = iter_results[uid]['school']
        school_string = '&'.join(school_dict.keys())
        if school_dict != {}:
            is_school = '1'
        else:
            is_school = '0'
        results[uid]['is_school'] = is_school
        results[uid]['school_string'] = school_string
        results[uid]['school_dict'] = json.dumps(school_dict)
    return results
    item['keywords_string'] = keywords_string  # used for search
    sensitive_words_dict = searchWord(text.encode('utf-8', 'ignore'), DFA)
    if sensitive_words_dict:
        item['sensitive_words_string'] = "&".join(sensitive_words_dict.keys())
        item['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
    else:
        item['sensitive_words_string'] = ""
        item['sensitive_words_dict'] = json.dumps({})
    timestamp = item['timestamp']
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    if sensitive_words_dict:
        print sensitive_words_dict.keys()[0]
        sensitive_count_string = r_cluster.hget('sensitive_' + str(ts), str(uid))
        if sensitive_count_string:  # redis already holds counts for this user and day
            sensitive_count_dict = json.loads(sensitive_count_string)
            for word in sensitive_words_dict.keys():
                if word in sensitive_count_dict:
                    sensitive_count_dict[word] += sensitive_words_dict[word]
                else:
                    sensitive_count_dict[word] = sensitive_words_dict[word]
            r_cluster.hset('sensitive_' + str(ts), str(uid), json.dumps(sensitive_count_dict))
        else:
            r_cluster.hset('sensitive_' + str(ts), str(uid), json.dumps(sensitive_words_dict))
    # decide whether this weibo should be written to a new ES index
    weibo_timestamp = item['timestamp']
    should_index_name_date = ts2datetime(weibo_timestamp)
    if should_index_name_date != now_index_name_date:
import json
from time_utils import datetime2ts, ts2datetime
from global_utils import R_CLUSTER_FLOW3, R_CLUSTER_FLOW2, R_CLUSTER_FLOW1

uidlist = []
f = open("uid_list_0520.txt")
for line in f:
    uid = line.strip()
    uidlist.append(uid)
f.close()

data = []
dates = ["2016-05-14", "2016-05-15", "2016-05-16", "2016-05-17",
         "2016-05-18", "2016-05-19", "2016-05-20"]
tss = [datetime2ts(d) for d in dates]
for ts in tss:
    ns = "hashtag_" + str(ts)
    hashtag_list = R_CLUSTER_FLOW3.hmget(ns, uidlist)
    hashtag_list = [json.loads(h) if h else None for h in hashtag_list]
    uhlist = zip(uidlist, hashtag_list)
    uhtlist = []
    for uh in uhlist:
        uh = list(uh)
        uh.append(ts)
        uhtlist.append(uh)
    data.extend(uhtlist)

with open("hashtag_0521.txt", "w") as fw:
    for d in data:
        if d[1] is not None:
            fw.write("%s\n" % json.dumps(d))

at_data = []