import time
import json

# redis_ip, es_cluster, datetime2ts, ts2datetime, mapping and the RUN_TYPE / DAY /
# pre_ip / ip_index_pre / ip_doc_type names are expected to come from the project's
# configuration and utility modules (their imports are not shown here).


def scan_reducer():
    # Drain JSON-encoded uid batches from Redis, look up each uid's ip dict in the
    # day's hash and bulk-index the results into the day's ES ip index.
    if RUN_TYPE:
        ts = datetime2ts(ts2datetime(time.time() - DAY))
    else:
        ts = datetime2ts('2016-05-14')
    date = ts2datetime(ts)
    ts = str(ts)
    hash_name = pre_ip + ts
    #sen_hash_name = sen_pre_ip + ts
    index_name = ip_index_pre + date
    #sen_index_name = sen_ip_index_pre + date
    mapping(index_name, ip_doc_type)
    count = 0
    bulk_action = []
    tb = time.time()
    while 1:
        tmp_list = redis_ip.rpop('ip_uid_list_0520')
        if tmp_list:
            uid_list = json.loads(tmp_list)
            ip_dict = redis_ip.hmget(hash_name, uid_list)
            for i in range(len(uid_list)):
                save_dict = dict()
                uid = uid_list[i]
                save_dict['uid'] = uid_list[i]
                save_dict['ip_dict'] = ip_dict[i]
                bulk_action.extend([{'index': {'_id': uid}}, save_dict])
            es_cluster.bulk(bulk_action, index=index_name, doc_type=ip_doc_type)
            bulk_action = []
            count += len(uid_list)
            te = time.time()
            if RUN_TYPE == 0:
                print '%s sec scan %s count user' % (te - tb, count)
                tb = te
        else:
            print count
            break
def scan_reducer():
    # Same reducer as above, but reading the 'ip_uid_list' queue and using the
    # 2013-09-01 test date when RUN_TYPE is 0.
    if RUN_TYPE:
        ts = datetime2ts(ts2datetime(time.time() - DAY))
        date = ts2datetime(time.time() - DAY)
    else:
        ts = datetime2ts('2013-09-01')
        date = '2013-09-01'
    ts = str(ts)
    hash_name = pre_ip + ts
    #sen_hash_name = sen_pre_ip + ts
    index_name = ip_index_pre + date
    #sen_index_name = sen_ip_index_pre + date
    mapping(index_name, ip_doc_type)
    count = 0
    bulk_action = []
    tb = time.time()
    while 1:
        tmp_list = redis_ip.rpop('ip_uid_list')
        if tmp_list:
            uid_list = json.loads(tmp_list)
            ip_dict = redis_ip.hmget(hash_name, uid_list)
            for i in range(len(uid_list)):
                save_dict = dict()
                uid = uid_list[i]
                save_dict['uid'] = uid_list[i]
                save_dict['ip_dict'] = ip_dict[i]
                bulk_action.extend([{'index': {'_id': uid}}, save_dict])
            es_cluster.bulk(bulk_action, index=index_name, doc_type=ip_doc_type)
            bulk_action = []
            count += len(uid_list)
            te = time.time()
            if RUN_TYPE == 0:
                print '%s sec scan %s count user' % (te - tb, count)
                tb = te
        else:
            print count
            break
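# A minimal sketch (not part of the original source) of the producer side that
# scan_reducer assumes: something must fill the 'ip_uid_list' Redis list with
# JSON-encoded uid batches whose uids also appear in the day's pre_ip + ts hash.
# The hscan_iter walk and the hypothetical_batch_size parameter are assumptions,
# not the project's actual feeder.
def scan_mapper_sketch(hypothetical_batch_size=1000):
    if RUN_TYPE:
        ts = datetime2ts(ts2datetime(time.time() - DAY))
    else:
        ts = datetime2ts('2013-09-01')
    hash_name = pre_ip + str(ts)
    batch = []
    # Walk every uid stored in the day's ip hash and push uid batches for the reducer.
    for uid, _ in redis_ip.hscan_iter(hash_name):
        batch.append(uid)
        if len(batch) >= hypothetical_batch_size:
            redis_ip.lpush('ip_uid_list', json.dumps(batch))
            batch = []
    if batch:
        redis_ip.lpush('ip_uid_list', json.dumps(batch))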
def get_school(uid_list):
    # Look back WEEK days of per-day 'ip_<ts>' hashes, map each ip to a school via
    # ip2school and accumulate the number of '&'-separated records seen per school.
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    school_results = {}
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        ip_results = redis_ip.hmget('ip_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            if uid not in school_results:
                school_results[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                school = ip2school(ip)
                if school:
                    try:
                        school_results[uid][school] += ip_count
                    except:
                        school_results[uid][school] = ip_count
            count += 1
    results = {}
    for uid in uid_list:
        school_dict = school_results[uid]
        school_string = '&'.join(school_dict.keys())
        if school_dict != {}:
            is_school = '1'
        else:
            is_school = '0'
        results[uid] = {'is_school': is_school, 'school_string': school_string,
                        'school_dict': json.dumps(school_dict)}
    return results
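# A small usage sketch for get_school (an assumed caller, not in the original source):
# per uid it returns an 'is_school' flag, an '&'-joined school string and a
# JSON-dumped {school: count} dict. The uids below are hypothetical.
def print_school_info_example():
    example_uid_list = ['1234567890', '2345678901']  # hypothetical uids
    school_info = get_school(example_uid_list)
    for example_uid, item in school_info.iteritems():
        print example_uid, item['is_school'], item['school_string']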
def get_flow_information(uid_list):
    # Aggregate the previous seven days of data; not suitable for daily incremental updates.
    length = len(uid_list)
    results = {}
    iter_results = {}
    result_dict = {}
    if RUN_TYPE:
        now_ts = time.time()
        now_date = ts2datetime(now_ts)  # date: 2013-09-01
    else:
        now_date = "2013-09-08"
    ts = datetime2ts(now_date)
    start_ts = ts - 8 * 3600 * 24
    for i in range(1, 8):
        ts = start_ts + i * 3600 * 24
        date = ts2datetime(ts)
        print "date:", date
        uid_day_geo = {}
        sensitive_uid_day_geo = {}
        flow_index_name = flow_text_index_name_pre + str(date)
        # hashtag
        print uid_list
        hashtag_results = redis_cluster.hmget('hashtag_' + str(ts), uid_list)
        sensitive_hashtag = redis_cluster.hmget('sensitive_hashtag_' + str(ts), uid_list)
        # sensitive_words
        sensitive_results = redis_cluster.hmget('sensitive_' + str(ts), uid_list)
        # ip
        if WORK_TYPE == 0:
            ip_index_name = ip_index_pre + str(date)
            sensitive_ip_index_name = sen_ip_index_pre + str(date)
            #activity_index_name = act_index_pre + str(date)
            #sensitive_activity_index_name = sen_act_index_pre + str(date)
            exist_bool = es_cluster.indices.exists(index=ip_index_name)
            sensitive_exist_bool = es_cluster.indices.exists(index=sensitive_ip_index_name)
            #activity_exist_bool = es_cluster.indices.exists(index=activity_index_name)
            #sensitive_activity_exist_bool = es_cluster.indices.exists(index=sensitive_activity_index_name)
            if exist_bool:
                ip_results = es_cluster.mget(index=ip_index_name, doc_type="ip",
                                             body={"ids": uid_list})["docs"]
            else:
                ip_results = [dict()] * length
            if sensitive_exist_bool:
                sensitive_ip_results = es_cluster.mget(index=sensitive_ip_index_name,
                                                       doc_type="sensitive_ip",
                                                       body={"ids": uid_list})["docs"]
            else:
                sensitive_ip_results = [dict()] * length
            """
            if activity_exist_bool:
                activity_results = es_cluster.mget(index=activity_index_name, doc_type="activity", body={"ids": uid_list})["docs"]
            else:
                activity_results = [dict()] * length
            if sensitive_activity_exist_bool:
                sensitive_activity_results = es_cluster.mget(index=sensitive_activity_index_name, doc_type="sensitive_activity", body={"ids": uid_list})["docs"]
            else:
                sensitive_activity_results = [dict()] * length
            """
        else:
            ip_results = redis_ip.hmget('ip_' + str(ts), uid_list)
            sensitive_ip_results = redis_ip.hmget('sensitive_ip_' + str(ts), uid_list)
            #activity_results = redis_activity.hmget('activity_'+str(date), uid_list)
            #sensitive_activity_results = redis_activity.hmget('sensitive_activity_'+str(date), uid_list)
        for j in range(0, len(uid_list)):
            uid = uid_list[j]
            if uid not in iter_results:
                iter_results[uid] = {'hashtag': {}, 'sensitive_hashtag': {}, 'geo': {},
                                     'sensitive_geo': {}, 'geo_track': [], 'keywords': {},
                                     'sensitive_words': {}, 'sensitive_geo_track': [],
                                     'ip': [], 'sensitive_ip': []}
            # sensitive words
            if sensitive_results[j]:
                sensitive_words_results = json.loads(sensitive_results[j])
                for sensitive_word in sensitive_words_results:
                    try:
                        iter_results[uid]['sensitive_words'][sensitive_word] += sensitive_words_results[sensitive_word]
                    except:
                        iter_results[uid]['sensitive_words'][sensitive_word] = sensitive_words_results[sensitive_word]
                #print "sensitive_words:", iter_results[uid]['sensitive_words']
            # hashtag
            if hashtag_results[j]:
                hashtag_dict = json.loads(hashtag_results[j])
                for hashtag in hashtag_dict:
                    try:
                        iter_results[uid]['hashtag'][hashtag] += hashtag_dict[hashtag]
                    except:
                        iter_results[uid]['hashtag'][hashtag] = hashtag_dict[hashtag]
                #print "hashtag: ", iter_results[uid]['hashtag']
            # sensitive hashtag
            if sensitive_hashtag[j]:
                sensitive_hashtag_dict = json.loads(sensitive_hashtag[j])
                for hashtag in sensitive_hashtag_dict:
                    try:
                        iter_results[uid]['sensitive_hashtag'][hashtag] += sensitive_hashtag_dict[hashtag]
                    except:
                        iter_results[uid]['sensitive_hashtag'][hashtag] = sensitive_hashtag_dict[hashtag]
                #print "sensitive_hashtag:", iter_results[uid]['sensitive_hashtag']
            # ip / geo
            uid_day_geo[uid] = {}
            sensitive_uid_day_geo[uid] = {}
            if WORK_TYPE == 0:  # es
                if ip_results[j]:
                    if ip_results[j]['found']:
                        detail_item = ip_results[j]['_source']
                        ip_dict = json.loads(detail_item['ip_dict'])
                    else:
                        ip_dict = {}
                else:
                    ip_dict = {}
            else:
                if ip_results[j]:
                    ip_dict = json.loads(ip_results[j])
                else:
                    ip_dict = {}
            if ip_dict:
                #iter_results[uid]['ip'].append(ip_dict)
                geo_dict = ip2geo(ip_dict)
                for geo, count in geo_dict.iteritems():
                    try:
                        iter_results[uid]['geo'][geo] += count
                    except:
                        iter_results[uid]['geo'][geo] = count
                    try:
                        uid_day_geo[uid][geo] += count
                    except:
                        uid_day_geo[uid][geo] = count
            #iter_results[uid]['ip'].append(ip_dict)
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            #print "ip:", iter_results[uid]['ip'], iter_results[uid]['geo_track']
            if WORK_TYPE == 0:
                if sensitive_ip_results[j]:
                    if sensitive_ip_results[j]['found']:
                        detail_item = sensitive_ip_results[j]['_source']
                        sensitive_ip_dict = json.loads(detail_item['sensitive_ip_dict'])
                    else:
                        sensitive_ip_dict = dict()
                else:
                    sensitive_ip_dict = dict()
            else:
                if sensitive_ip_results[j]:
                    sensitive_ip_dict = json.loads(sensitive_ip_results[j])
                else:
                    sensitive_ip_dict = dict()
            if sensitive_ip_dict:
                sensitive_geo_dict = ip2geo(sensitive_ip_dict)
                #iter_results[uid]['sensitive_ip'].append(sensitive_ip_dict)
                for geo, count in sensitive_geo_dict.iteritems():
                    try:
                        iter_results[uid]['sensitive_geo'][geo] += count
                    except:
                        iter_results[uid]['sensitive_geo'][geo] = count
                    try:
                        sensitive_uid_day_geo[uid][geo] += count
                    except:
                        sensitive_uid_day_geo[uid][geo] = count
            #iter_results[uid]['sensitive_ip'].append(sensitive_ip_dict)
            iter_results[uid]['sensitive_geo_track'].append(sensitive_uid_day_geo[uid])
            #print "sensitive_ip:", iter_results[uid]['sensitive_ip'], iter_results[uid]['sensitive_geo_track']

        # compute keywords
        flow_text_exist = es_flow_text.indices.exists(index=flow_index_name)
        if flow_text_exist:
            text_results = es_flow_text.search(index=flow_index_name, doc_type=flow_text_index_type,
                                               body={'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}}, 'size': MAX_VALUE},
                                               _source=False, fields=['uid', 'keywords_dict'])['hits']['hits']
        else:
            text_results = {}
        for item in text_results:
            uid = item['fields']['uid'][0]
            uid_keywords_dict = json.loads(item['fields']['keywords_dict'][0])
            for keywords in uid_keywords_dict:
                try:
                    iter_results[uid]['keywords'][keywords] += uid_keywords_dict[keywords]
                except:
                    iter_results[uid]['keywords'][keywords] = uid_keywords_dict[keywords]
            #print "keywords:", iter_results[uid]['keywords']

    # flatten the accumulated per-uid counters into the result fields
    for uid in uid_list:
        results[uid] = {}
        # hashtag
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag_string'] = '&'.join(hashtag_dict.keys())
        # sensitive hashtag
        sensitive_hashtag_dict = iter_results[uid]['sensitive_hashtag']
        results[uid]['sensitive_hashtag_dict'] = json.dumps(sensitive_hashtag_dict)
        results[uid]['sensitive_hashtag_string'] = '&'.join(sensitive_hashtag_dict.keys())
        # sensitive_words
        sensitive_word_dict = iter_results[uid]['sensitive_words']
        results[uid]['sensitive_words_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_words_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        for k, v in sensitive_word_dict.iteritems():
            tmp = r_sensitive.hget('sensitive_words', k)
            if tmp:
                tmp_stage = json.loads(tmp)
                sensitive_score += sensitive_score_dict[str(tmp_stage[0])] * v
        results[uid]['sensitive'] = sensitive_score
        # geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(['&'.join(item.split('\t')) for item in geo_dict_keys])
        results[uid]['activity_geo_aggs'] = '&'.join([item.split('\t')[-1] for item in geo_dict_keys])
        sensitive_geo_dict = iter_results[uid]['sensitive_geo']
        sensitive_geo_track_list = iter_results[uid]['sensitive_geo_track']
        results[uid]['sensitive_activity_geo_dict'] = json.dumps(sensitive_geo_track_list)
        sensitive_geo_dict_keys = sensitive_geo_dict.keys()
        results[uid]['sensitive_activity_geo'] = '&'.join(['&'.join(item.split('\t')) for item in sensitive_geo_dict_keys])
        results[uid]['sensitive_activity_geo_aggs'] = '&'.join([item.split('\t')[-1] for item in sensitive_geo_dict_keys])
        # keywords: keep only the top 50 by count
        keywords_dict = iter_results[uid]['keywords']
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50]
        keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords_dict'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
    return results
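# A minimal sketch (not in the original source) of how get_flow_information's output
# could be bulk-written to ES, mirroring the bulk pattern used in scan_reducer above.
# The writer itself and the hypothetical_portrait_index / hypothetical_portrait_type
# names are assumptions, not the project's actual portrait index.
def save_flow_information_sketch(uid_list,
                                 hypothetical_portrait_index='user_portrait_example',
                                 hypothetical_portrait_type='user'):
    flow_results = get_flow_information(uid_list)
    bulk_action = []
    for uid, flow_item in flow_results.iteritems():
        # Each action pair is a routing header plus the per-uid field dict.
        bulk_action.extend([{'index': {'_id': uid}}, flow_item])
    if bulk_action:
        es_cluster.bulk(bulk_action, index=hypothetical_portrait_index,
                        doc_type=hypothetical_portrait_type)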