def get_top_user(): results = dict() domain_results = dict() topic_results = dict() for item in domain_list: search_query_body = query_body("domain", item) search_results = es.search(index="sensitive_user_portrait", doc_type="user", body=search_query_body, _source=False, fields=['uname', 'photo_url'])['hits']['hits'] uid_list = [] for iter_item in search_results: uid_list.append([ iter_item['_id'], iter_item['fields']['uname'][0], iter_item['fields']['photo_url'][0] ]) domain_results[item] = uid_list for item in topic_list: search_query_body = query_body("topic_string", item) search_results = es.search(index="sensitive_user_portrait", doc_type="user", body=search_query_body, _source=False, fields=['uname', 'photo_url'])['hits']['hits'] uid_list = [] for iter_item in search_results: uid_list.append([ iter_item['_id'], iter_item['fields']['uname'][0], iter_item['fields']['photo_url'][0] ]) topic_results[item] = uid_list results['domain_rank'] = domain_results results['topic_rank'] = topic_results return results
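
# NOTE: get_top_user() relies on a query_body(category, detail) helper that
# is not defined in this section. A minimal sketch, assuming it builds an
# ES 1.x filtered term query sorted by influence; the sort key and size are
# assumptions, not confirmed by the original code:
def query_body(category, detail):
    return {
        "query": {
            "filtered": {
                "filter": {
                    "term": {category: detail}
                }
            }
        },
        "sort": {"influence": {"order": "desc"}},
        "size": 10
    }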
def search_important(category, detail): query_body={ "query":{ "filtered":{ "filter":{ "term": {category: detail} } } }, "sort": {"sensitive": {"order": "desc"}}, "size": 20 } results = es.search(index="sensitive_user_portrait", doc_type="user", body=query_body, _source=False, fields=['uname'])['hits']['hits'] uid_list = [] for item in results: uid_list.append([item['_id'], item['fields']['uname'][0]]) return uid_list
def get_evaluate_max(): max_result = {} evaluate_index = ['activeness', 'importance', 'influence', 'sensitive'] for evaluate in evaluate_index: query_body = { "query":{ 'match_all':{} }, "size":1, 'sort':[{evaluate: {'order': 'desc'}}] } result = es.search(index=portrait_index_name, doc_type=portrait_index_type, body=query_body)['hits']['hits'] max_evaluate = result[0]['_source'][evaluate] if max_evaluate != 0: max_result[evaluate] = max_evaluate else: max_result[evaluate] = 99999 return max_result
def search_in_portrait(category, max_result): query_body = { "query": { "match_all": {} }, "sort": { category: { "order": "desc" } } } results = es.search(index="sensitive_user_portrait", doc_type="user", body=query_body)['hits']['hits'] uid_list = [] for item in results: uid_list.append([ item['_source']['uid'], item['_source']['uname'], normalize_index(item['_source'][category], max_result[category]) ]) return uid_list
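
# NOTE: normalize_index() is not defined in this section. A minimal sketch,
# assuming it scales a raw evaluate score into [0, 1] against the maximum
# returned by get_evaluate_max() (the rounding precision is an assumption):
def normalize_index(value, max_value):
    if not max_value:
        return 0
    return round(float(value) / max_value, 4)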
def compute_mid_result_one(task_name, task_user, start_ts):
    result = []
    #step1: count the sensitive or not weibo count
    #step2: count the sensitive or not weibo geo count
    #step3: sentiment in sensitive / unsensitive
    #step4: compute hashtag
    #step5: compute sensitive_word
    #save mid_result
    query_body = []
    #query user
    query_body.append({'term': {'uid': task_user[0]}})
    #query time_segment (900-second window)
    query_body.append(
        {'range': {
            'timestamp': {
                'from': start_ts,
                'to': start_ts + 900
            }
        }})
    try:
        task_user_weibo = es.search(index=text_index_name, doc_type=text_index_type,\
                body={'query':{'bool':{'must': query_body}}, 'size':100000})['hits']['hits']
    except Exception:
        #re-raise without binding, so the original traceback is preserved
        raise
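
# Illustrative driver (an assumption, not part of the original pipeline):
# walk a task's time range in 15-minute segments, matching the 900-second
# window queried above.
def iter_segments(start_ts, end_ts, step=900):
    ts = start_ts
    while ts < end_ts:
        yield ts
        ts += step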
def get_attr(date):
    results = dict()
    total_number = es.count(index="sensitive_user_portrait", doc_type="user")['count']
    results['total_number'] = total_number
    max_result = get_evaluate_max()

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "term": {"sensitive": 0}
                }
            }
        }
    }
    influence_number = es.count(index="sensitive_user_portrait", doc_type="user", body=query_body)['count']
    results['sensitive_number'] = total_number - influence_number
    results['influence_number'] = influence_number

    # political tendency statistics
    query_body = query_body_module('politics')
    politic_array = es.search(
        index='sensitive_user_portrait', doc_type='user',
        body=query_body)['aggregations']['all_interests']['buckets']
    politic_dict = dict()
    for item in politic_array:
        politic_dict[item['key']] = item['doc_count']
    results['politics'] = politic_dict

    # number of users recommended into the portrait library
    recommend_in_sensitive = r.hlen("recomment_" + date + 'sensitive')
    recommend_in_influence = r.hlen("recomment_" + date + "_influence")
    results['recommend_in'] = recommend_in_influence + recommend_in_sensitive

    # group analysis tasks
    results['monitor_number'] = [4, 83]  # test
    query_body = {
        "query": {
            "bool": {
                "must": [
                    {"term": {'task_type': "detect"}},
                    {"term": {"state": 0}}
                ]
            }
        }
    }
    group_detect_number = es.count(index=group_index_name, doc_type=group_index_type, body=query_body)["count"]
    query_body = {
        "query": {
            "bool": {
                "must": [
                    {"term": {'task_type': "analysis"}},
                    {"term": {"state": 0}}
                ]
            }
        }
    }
    group_analysis_number = es.count(index=group_index_name, doc_type=group_index_type, body=query_body)["count"]
    results["group_detect_number"] = group_detect_number
    results["group_analysis_number"] = group_analysis_number

    # sensitive words
    query_body = query_body_module('sensitive_words_string')
    sw_list = es.search(
        index='sensitive_user_portrait', doc_type='user',
        body=query_body)['aggregations']['all_interests']['buckets']
    sensitive_words = []
    for item in sw_list:
        sensitive_words.append([item['key'], item['doc_count']])
    results['sensitive_words'] = sensitive_words

    query_body = query_body_module('keywords_string')
    keywords_list = es.search(
        index='sensitive_user_portrait', doc_type='user',
        body=query_body)['aggregations']['all_interests']['buckets']
    keywords = []
    for item in keywords_list:
        keywords.append([item['key'], item['doc_count']])
    results['keywords_string'] = keywords

    query_body = query_body_module('sensitive_hashtag_string')
    sh_list = es.search(
        index='sensitive_user_portrait', doc_type='user',
        body=query_body)['aggregations']['all_interests']['buckets']
    sensitive_hashtag = []
    for item in sh_list:
        sensitive_hashtag.append([item['key'], item['doc_count']])
    results['sensitive_hashtag'] = sensitive_hashtag

    query_body = query_body_module('sensitive_activity_geo_aggs')
    sg_list = es.search(
        index='sensitive_user_portrait', doc_type='user',
        body=query_body)['aggregations']['all_interests']['buckets']
    sensitive_geo = []
    for item in sg_list:
        sensitive_geo.append([item['key'], item['doc_count']])
    results['sensitive_geo'] = sensitive_geo
    '''
    query_body = query_body_module('domain_string')
    sd_list = es.search(index='sensitive_user_portrait', doc_type='user', body=query_body)['aggregations']['all_interests']['buckets']
    domain = []
    for item in sd_list:
        temp = []
        temp.append(item['key'])
        temp.append(item['doc_count'])
        domain.append(temp)
    results['domain'] = domain
    '''

    # tendency distribution
    # domain and topic
    rank_results = get_top_user()
    results.update(rank_results)
    # rank
    results['importance'] = search_in_portrait('importance', max_result)
    results['sensitive'] = search_in_portrait('sensitive', max_result)
    results['influence'] = search_in_portrait('influence', max_result)
    results['activeness'] = search_in_portrait('activeness', max_result)

    # retweet and comment counts of sensitive weibo
    mid_list = get_top_mid()
    sensitive_hot_retweet = sort_retweet_sensitive_weibo(mid_list)
    sensitive_hot_comment = sort_comment_sensitive_weibo(mid_list)
    sensitive_weibo_text = get_weibo_detail(mid_list)
    results['sensitive_hot_retweet'] = sensitive_hot_retweet
    results['sensitive_hot_comment'] = sensitive_hot_comment
    results['sensitive_weibo_text'] = sensitive_weibo_text

    r.set('overview', json.dumps(results))
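
# NOTE: query_body_module() is called throughout get_attr() but defined
# elsewhere. A plausible sketch, assuming it builds a terms aggregation
# named "all_interests" (the name the callers read back) over the given
# field; the bucket size is an assumption:
def query_body_module(field_name):
    return {
        "query": {"match_all": {}},
        "size": 0,
        "aggs": {
            "all_interests": {
                "terms": {"field": field_name, "size": 20}
            }
        }
    }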
def get_user_portrait_byidname(uid, isuid=True, specify_field=None):
    uid_list = [uid]
    results = []
    # default field names must match the stored fields requested below
    # ('hashtag_string' and 'keywords_dict', not 'hashtag'/'keywords_string')
    fields_list = [
        'uname', 'domain', 'topic_string', 'politics', 'fansnum', 'statusnum',
        'friendsnum', 'location', 'hashtag_string', 'activity_geo', 'keywords_dict'
    ]
    if specify_field:
        fields_list = specify_field
    if isuid:
        search_results = es.mget(index=portrait_index_name, doc_type=portrait_index_type, body={"ids": uid_list}, _source=False, \
            fields=['uname','domain','topic_string','politics','fansnum','statusnum', 'hashtag_string', 'activity_geo', 'friendsnum','location','activeness','importance','influence','sensitive', 'keywords_dict'])["docs"]
        for item in search_results:
            iter_result = []
            iter_result.append(item['_id'])
            if item['found']:
                for iter_field in fields_list:
                    if iter_field == "topic_string":
                        iter_result.append(item['fields'][iter_field][0])
                        #iter_result.append(item['fields'][iter_field][0].split('&'))
                    elif iter_field == "keywords_dict":
                        iter_result.append(json.loads(item['fields'][iter_field][0]))
                    else:
                        iter_result.append(item['fields'][iter_field][0])
            else:
                iter_result = None
            results.append(iter_result)
    else:
        query_body = {
            "query": {
                "bool": {
                    "should": [
                        {"term": {"uname": uid}}
                    ]
                }
            },
            "size": 1
        }
        search_results = es.search(index=portrait_index_name, doc_type=portrait_index_type, body=query_body, \
            fields=['uname','domain','topic_string','politics','fansnum','statusnum', 'hashtag_string', 'activity_geo', 'friendsnum','location','activeness','importance','influence','sensitive', 'keywords_dict'])['hits']['hits']
        if len(search_results) == 0:
            results.append(None)
        for item in search_results:
            iter_result = []
            iter_result.append(item['_id'])
            for iter_field in fields_list:
                if iter_field == "topic_string":
                    iter_result.append(item['fields'][iter_field][0])
                    #iter_result.append(item['fields'][iter_field][0].split('&'))
                elif iter_field == "keywords_dict":
                    iter_result.append(json.loads(item['fields'][iter_field][0]))
                else:
                    iter_result.append(item['fields'][iter_field][0])
            results.append(iter_result)
    return results
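
# Illustrative usage of get_user_portrait_byidname (ids are placeholders):
#   get_user_portrait_byidname('1234567890')                  # look up by uid
#   get_user_portrait_byidname('example_uname', isuid=False)  # look up by uname
#   get_user_portrait_byidname('1234567890', specify_field=['uname', 'fansnum'])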
def compute_mid_result_group(task_name, task_user, start_ts):
    result = []
    #step1: count the sensitive or not weibo count
    #step2: count the geo weibo count
    #step3: count the sentiment weibo count
    #step4: compute hashtag
    #step5: compute sensitive_word
    #step6: compute the social
    #save mid result
    sensitive_weibo_dict = {}
    #sentiment_weibo_dict = {'0':{}, '1':{}}
    #geo_weibo_dict = {'0':{}, '1':{}}
    #hashtag_weibo_dict = {'0':{}, '1':{}}
    sentiment_weibo_dict = {}
    geo_weibo_dict = {}
    hashtag_weibo_dict = {}
    sensitive_word_dict = {}
    search_count = 0
    for uid in task_user:
        query_body = []
        query_body.append({'term': {'uid': str(uid)}})
        query_body.append(
            {'range': {
                'timestamp': {
                    'from': start_ts,
                    'to': start_ts + 900
                }
            }})
        try:
            user_weibo = es.search(index=text_index_name, doc_type=text_index_type, \
                body={'query':{'bool':{'must':query_body}}, 'size':100000})['hits']['hits']
        except Exception:
            #re-raise without binding, so the original traceback is preserved
            raise
        print 'user_weibo:', len(user_weibo)
        search_count += len(user_weibo)
        if user_weibo:
            for weibo_item in user_weibo:
                weibo_dict = weibo_item['_source']
                #count sensitive and unsensitive weibo in this time segment
                sensitive = weibo_dict['sensitive']
                try:
                    sensitive_weibo_dict[str(sensitive)] += 1
                except:
                    sensitive_weibo_dict[str(sensitive)] = 1
                #compute geo_weibo_count
                geo = weibo_dict['geo']
                if str(sensitive) in geo_weibo_dict:
                    try:
                        geo_weibo_dict[str(sensitive)][geo] += 1
                    except:
                        geo_weibo_dict[str(sensitive)][geo] = 1
                else:
                    geo_weibo_dict[str(sensitive)] = {geo: 1}
                #compute sentiment_weibo_count
                sentiment = weibo_dict['sentiment']
                if str(sensitive) in sentiment_weibo_dict:
                    try:
                        sentiment_weibo_dict[str(sensitive)][sentiment] += 1
                    except:
                        sentiment_weibo_dict[str(sensitive)][sentiment] = 1
                else:
                    sentiment_weibo_dict[str(sensitive)] = {sentiment: 1}
                #compute hashtag_weibo_dict
                try:
                    hashtag_list = weibo_dict['hashtag'].split('&')
                except:
                    hashtag_list = None
                if hashtag_list:
                    for hashtag in hashtag_list:
                        if str(sensitive) in hashtag_weibo_dict:
                            try:
                                hashtag_weibo_dict[str(sensitive)][hashtag] += 1
                            except:
                                hashtag_weibo_dict[str(sensitive)][hashtag] = 1
                        else:
                            hashtag_weibo_dict[str(sensitive)] = {hashtag: 1}
                #compute sensitive_word_dict
                try:
                    sensitive_word_list = weibo_dict['sensitive_word'].split('&')
                except:
                    sensitive_word_list = None
                if sensitive_word_list:
                    for sensitive_word in sensitive_word_list:
                        try:
                            sensitive_word_dict[sensitive_word] += 1
                        except:
                            sensitive_word_dict[sensitive_word] = 1
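
# NOTE: the try/except increment pattern above is the codebase convention;
# the same tally can be written with collections.defaultdict. A sketch for
# the sensitive-word count (the helper name is illustrative):
from collections import defaultdict

def count_sensitive_words(weibo_list):
    #weibo_list: iterable of weibo _source dicts as fetched above
    counts = defaultdict(int)
    for weibo_dict in weibo_list:
        for word in (weibo_dict.get('sensitive_word') or '').split('&'):
            if word:
                counts[word] += 1
    return dict(counts)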