def query_vary_top_k(index_name, doctype, top_k, sort_index="vary"):
    """Return the top_k users ranked by `sort_index` (descending),
    joined with weibo profile data and a portrait membership flag.

    Row layout: [rank, photo_url, uid, nick_name, vary, '1'/'0'].
    """
    body = {
        "query": {"match_all": {}},
        "size": top_k,
        "sort": [{sort_index: {"order": "desc"}}],
    }
    hits = es.search(index=index_name, doc_type=doctype, body=body)['hits']['hits']
    uid_list = [hit['_id'] for hit in hits]
    portrait_docs = es_portrait.mget(index="user_portrait", doc_type="user",
                                     body={"ids": uid_list}, _source=True)['docs']
    profile_docs = es_profile.mget(index="weibo_user", doc_type="user",
                                   body={"ids": uid_list}, _source=True)['docs']
    return_list = []
    for idx, hit in enumerate(hits):
        info = ['', '', '', '', '']
        info[0] = idx + 1
        if profile_docs[idx]['found']:
            info[1] = profile_docs[idx]['_source'].get('photo_url', '')
            info[3] = profile_docs[idx]['_source'].get('nick_name', '')
        info[2] = hit.get('_id', '')
        info[4] = hit['_source']['vary']
        info.append('1' if portrait_docs[idx]['found'] else '0')
        return_list.append(info)
    return return_list
def get_vary_detail_info(vary_detail_dict, uid_list):
    """Resolve uids in vary_detail_dict to unames and readable dates.

    vary_detail_dict maps a vary pattern to [uid, start_ts, end_ts]
    items; the result maps the same patterns to
    [uid, uname, start_date, end_date] items.
    """
    results = {}
    # Step 1: uid -> uname from user portrait; not-found uids map to
    # themselves.
    try:
        portrait_docs = es_user_portrait.mget(index=portrait_index_name,
                                              doc_type=portrait_index_type,
                                              body={'ids': uid_list})['docs']
    except:
        portrait_docs = []
    uname_dict = {}
    for doc in portrait_docs:
        uid = doc['_id']
        uname_dict[uid] = doc['_source']['uname'] if doc['found'] else uid
    # Step 2: rebuild the per-pattern lists with names and dates.
    for pattern in vary_detail_dict:
        rebuilt = []
        for record in vary_detail_dict[pattern]:
            uid = record[0]
            rebuilt.append([uid, uname_dict[uid],
                            ts2datetime(int(record[1])),
                            ts2datetime(int(record[2]))])
        results[pattern] = rebuilt
    return results
def get_group_list(task_name, submit_user):
    """Return portrait rows for every member of a group analysis task.

    Each row: [uid, uname, gender, location, importance, influence,
    activeness, sensitive] with the four metrics normalized onto a
    0-100 log scale; users whose portrait is incomplete yield blanks.
    """
    results = []
    task_id = submit_user + '-' + task_name
    # BUGFIX: the original assigned group_index_name inside the function,
    # making it local everywhere; when RUN_TYPE != 0 the later read raised
    # UnboundLocalError, which the bare except swallowed, so the function
    # always returned []. Use a separate local with a global fallback.
    if RUN_TYPE == 0:
        index_name = 'test_group_result'
    else:
        index_name = group_index_name
    try:
        es_results = es_group_result.get(index=index_name,
                                         doc_type=group_index_type,
                                         id=task_id)['_source']
    except:
        return results
    uid_list = es_results['uid_list']
    user_portrait_attribute = es_user_portrait.mget(index=portrait_index_name,
                                                    doc_type=portrait_index_type,
                                                    body={'ids': uid_list})['docs']
    evaluate_max = get_evaluate_max()
    for item in user_portrait_attribute:
        uid = item['_id']
        try:
            source = item['_source']
            uname = source['uname']
            gender = source['gender']
            location = source['location']
            # Normalize each evaluation metric onto a 0-100 log10 scale.
            normal_importance = math.log(source['importance'] / evaluate_max['importance'] * 9 + 1, 10) * 100
            normal_influence = math.log(source['influence'] / evaluate_max['influence'] * 9 + 1, 10) * 100
            normal_activeness = math.log(source['activeness'] / evaluate_max['activeness'] * 9 + 1, 10) * 100
            normal_sensitive = math.log(source['sensitive'] / evaluate_max['sensitive'] * 9 + 1, 10) * 100
            results.append([uid, uname, gender, location, normal_importance,
                            normal_influence, normal_activeness, normal_sensitive])
        except:
            results.append([uid, '', '', '', '', '', '', ''])
    return results
def get_group_member_name(task_name, submit_user): results = [] task_id = submit_user + '-' + task_name #print es_group_result,group_index_name,group_index_type try: group_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,\ id=task_id)['_source'] except: return results uid_list = group_result['uid_list'] print len(uid_list) try: user_portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type ,\ body={'ids':uid_list})['docs'] except: return results print len(user_portrait_result) for item in user_portrait_result: uid = item['_id'] if item['found'] == True: source = item['_source'] uname = source['uname'] else: uname = 'unknown' #results[uid] = uname dic = {} dic['ID'] = uid dic['name'] = uname results.append(dic) return results
def ajax_get_group_detail():
    """Return JSON portrait details for every member of a managed group."""
    task_name = request.args.get('task_name', '')
    user = request.args.get('user', '')
    _id = user + '-' + task_name
    portrait_detail = []
    top_activeness = get_top_influence("activeness")
    top_influence = get_top_influence("influence")
    top_importance = get_top_influence("importance")
    search_result = es.get(index=index_group_manage, doc_type=doc_type_group,
                           id=_id).get('_source', {})
    if search_result:
        # uid_list may be stored as a JSON string or as a plain list.
        try:
            uid_list = json.loads(search_result['uid_list'])
        except:
            uid_list = search_result['uid_list']
        if uid_list:
            docs = es.mget(index=portrait_index_name,
                           doc_type=portrait_index_type,
                           body={"ids": uid_list},
                           fields=SOCIAL_SENSOR_INFO)['docs']
            for doc in docs:
                if not doc['found']:
                    continue
                row = []
                for field in SOCIAL_SENSOR_INFO:
                    if field == "topic_string":
                        # NOTE(review): the split topic list is appended
                        # twice, matching the original — confirm callers
                        # expect the duplicate column.
                        row.append(doc["fields"][field][0].split('&'))
                        row.append(doc["fields"][field][0].split('&'))
                    elif field == "activeness":
                        row.append(math.log(doc['fields']['activeness'][0] / float(top_activeness) * 9 + 1, 10) * 100)
                    elif field == "importance":
                        row.append(math.log(doc['fields']['importance'][0] / float(top_importance) * 9 + 1, 10) * 100)
                    elif field == "influence":
                        row.append(math.log(doc['fields']['influence'][0] / float(top_influence) * 9 + 1, 10) * 100)
                    else:
                        row.append(doc["fields"][field][0])
                portrait_detail.append(row)
    return json.dumps(portrait_detail)
def get_group_member_name(task_name, submit_user): results = [] task_id = submit_user + '-' + task_name #print es_group_result,group_index_name,group_index_type try: group_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,\ id=task_id)['_source'] except: return results uid_list = group_result['uid_list'] print len(uid_list) try: user_portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type ,\ body={'ids':uid_list})['docs'] except: return results print len(user_portrait_result) for item in user_portrait_result: uid = item['_id'] if item['found'] == True: source = item['_source'] uname = source['uname'] else: uname = 'unknown' #results[uid] = uname dic = {} dic['ID'] = uid dic['name'] = uname results.append(dic) return results
def get_vary_detail_info(vary_detail_dict, uid_list):
    """Attach unames and human-readable dates to each vary-pattern record.

    Input items are [uid, start_ts, end_ts]; output items are
    [uid, uname, start_date, end_date] keyed by the same patterns.
    """
    results = {}
    # uid -> uname lookup; users missing a portrait fall back to uid.
    try:
        docs = es_user_portrait.mget(index=portrait_index_name,
                                     doc_type=portrait_index_type,
                                     body={'ids': uid_list})['docs']
    except:
        docs = []
    uname_dict = {}
    for doc in docs:
        if doc['found'] == True:
            uname_dict[doc['_id']] = doc['_source']['uname']
        else:
            uname_dict[doc['_id']] = doc['_id']
    # Rewrite each pattern's records with names and formatted dates.
    for pattern, user_items in vary_detail_dict.items():
        results[pattern] = [
            [rec[0], uname_dict[rec[0]],
             ts2datetime(int(rec[1])), ts2datetime(int(rec[2]))]
            for rec in user_items
        ]
    return results
def search_portrait_user(es, number, active_index, active_type, portrait_index, portrait_type, field="user_index"):
    """Page through the activity index and return up to `number` users
    that also exist in the portrait index, joined with profile data.

    Each row: [rank, photo_url, uid, nick_name, field value, "1"].
    Returns the error string "no active_index exist" when the activity
    index is missing.
    """
    return_list = []
    if not es.indices.exists(index=active_index):
        return "no active_index exist"
    count_c = 0
    start = 0
    rank = 1
    while 1:
        user_list = search_k(es, active_index, active_type, start, field, 100)
        start += 100
        # ROBUSTNESS: stop when the index is exhausted; the original
        # looped forever (mget on an empty ids list) when fewer than
        # `number` portrait users exist.
        if not user_list:
            return return_list
        search_list = []
        for item in user_list:
            # "vary" documents key the uid under 'uid'; others under 'user'.
            if field == "vary":
                uid = item.get('uid', '0')
            else:
                uid = item.get('user', '0')
            search_list.append(uid)
        search_result = es_portrait.mget(index=portrait_index, doc_type=portrait_type,
                                         body={"ids": search_list}, _source=True)["docs"]
        profile_result = es_profile.mget(index="weibo_user", doc_type="user",
                                         body={"ids": search_list}, _source=True)["docs"]
        # BUGFIX: positional enumerate instead of list.index(item), which
        # is O(n) per hit and returns the FIRST equal document, mispairing
        # profile/activity rows when duplicate documents occur.
        for index, item in enumerate(search_result):
            if item["found"]:
                info = ['', '', '', '', '', '']
                info[0] = rank
                if profile_result[index]['found']:
                    info[1] = profile_result[index]['_source'].get('photo_url', '')
                    info[3] = profile_result[index]['_source'].get('nick_name', '')
                info[2] = item.get('_id', '')
                info[4] = user_list[index][field]
                info[5] = "1"
                return_list.append(info)
                rank += 1
                count_c += 1
                if count_c >= int(number):
                    return return_list
def search_portrait_user_in_activity(es, number, active_index, active_type, portrait_index, portrait_type, field="user_index"):
    """Return up to `number` portrait users from the activity index with
    their full retweet/comment activity statistics.

    Base row: [rank, photo_url, uid, nick_name, user_index, "1"]; when
    `field` is one of the two '*_brust_average' sort fields, the row is
    extended with that value followed by every metric in key_list.
    """
    return_list = []
    if not es.indices.exists(index=active_index):
        return "no active_index exist"
    count_c = 0
    start = 0
    rank = 1
    # Hoisted loop-invariant metric list.
    # NOTE(review): 'origin_weibo_retweeted_brust_average' appears twice
    # and the comment-side brust averages never do — looks like copy/paste
    # slips in the original; preserved as-is so output columns match.
    key_list = ["origin_weibo_retweeted_total_number", "origin_weibo_retweeted_average_number",
                "origin_weibo_retweeted_top_number", "origin_weibo_retweeted_brust_average",
                "origin_weibo_comment_total_number", "origin_weibo_comment_average_number",
                "origin_weibo_comment_top_number", "origin_weibo_retweeted_brust_average",
                "retweeted_weibo_retweeted_total_number", "retweeted_weibo_retweeted_average_number",
                "retweeted_weibo_retweeted_top_number", "retweeted_weibo_retweeted_brust_average",
                "retweeted_weibo_comment_total_number", "retweeted_weibo_comment_average_number",
                "retweeted_weibo_comment_top_number", "retweeted_weibo_retweeted_brust_average"]
    while 1:
        user_list = search_k(es, active_index, active_type, start, field, 100)
        start += 100
        # ROBUSTNESS: stop on an empty page; the original looped forever
        # when the activity index was exhausted.
        if not user_list:
            return return_list
        search_list = []
        for item in user_list:
            # "vary" documents key the uid under 'uid'; others under 'user'.
            if field == "vary":
                uid = item.get('uid', '0')
            else:
                uid = item.get('user', '0')
            search_list.append(uid)
        search_result = es_portrait.mget(index=portrait_index, doc_type=portrait_type,
                                         body={"ids": search_list}, _source=True)["docs"]
        profile_result = es_profile.mget(index="weibo_user", doc_type="user",
                                         body={"ids": search_list}, _source=True)["docs"]
        # BUGFIX: positional enumerate instead of list.index(item), which
        # mispairs rows when equal documents repeat and is O(n) per hit.
        for index, item in enumerate(search_result):
            if item["found"]:
                info = ['', '', '', '', '', '']
                info[0] = rank
                if profile_result[index]['found']:
                    info[1] = profile_result[index]['_source'].get('photo_url', '')
                    info[3] = profile_result[index]['_source'].get('nick_name', '')
                info[2] = item.get('_id', '')
                info[4] = user_list[index]['user_index']
                info[5] = "1"
                if field == 'origin_weibo_retweeted_brust_average':
                    info.append(user_list[index]['origin_weibo_retweeted_brust_average'])
                    for key in key_list:
                        info.append(user_list[index][key])
                elif field == 'origin_weibo_comment_brust_average':
                    info.append(user_list[index]['origin_weibo_comment_brust_average'])
                    for key in key_list:
                        info.append(user_list[index][key])
                return_list.append(info)
                rank += 1
                count_c += 1
                if count_c >= int(number):
                    return return_list
def search_top_index(index_name, top_k=1, index_type="bci", top=False, sort_order="user_index"):
    """Query the top_k documents ordered by `sort_order` (descending).

    With top=True, return only the highest `sort_order` value; otherwise
    return ranked rows joined with profile data, sort-specific columns
    and a portrait membership flag.
    """
    query_body = {
        "query": {"match_all": {}},
        "size": top_k,
        "sort": [{sort_order: {"order": "desc"}}],
    }
    if top:
        hits = es.search(index=index_name, doc_type=index_type,
                         body=query_body)['hits']['hits']
        return hits[0]['_source'][sort_order]
    hits = es.search(index=index_name, doc_type=index_type,
                     body=query_body)['hits']['hits']
    uid_list = [hit['_id'] for hit in hits]
    profile_docs = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                   body={"ids": uid_list}, _source=True)['docs']
    portrait_docs = es_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                     body={"ids": uid_list}, _source=True)['docs']
    result = []
    for i, hit in enumerate(hits):
        info = ['', '', '', '']
        info[0] = i + 1
        if profile_docs[i]['found']:
            info[1] = profile_docs[i]['_source'].get('photo_url', '')
            info[3] = profile_docs[i]['_source'].get('nick_name', '')
        info[2] = hit.get('_id', '')
        if sort_order in ["user_index", "origin_weibo_retweeted_brust_average", "origin_weibo_comment_brust_average"]:
            info.append(hit['_source'][sort_order])
            info.append("1" if portrait_docs[i]['found'] else "0")
        elif sort_order == "origin_weibo_retweeted_top_number":
            info.append(hit['_source']['origin_weibo_retweeted_top_number'])
            mid = hit['_source']['origin_weibo_top_retweeted_id']
            info.append(weiboinfo2url(info[2], mid))
            info.append("1" if portrait_docs[i]['found'] else "0")
        elif sort_order == "origin_weibo_comment_top_number":
            info.append(hit['_source']['origin_weibo_comment_top_number'])
            mid = hit['_source']['origin_weibo_top_comment_id']
            info.append(weiboinfo2url(info[2], mid))
            info.append("1" if portrait_docs[i]['found'] else "0")
        result.append(info)
    return result
def get_social_inter_content(uid1, uid2, type_mark):
    """Collect the last week of interaction weibos between uid1 and uid2.

    type_mark 'out' also includes the reverse direction (uid2 -> uid1).
    Returns a time-ordered list of weibo dicts with resolved unames.
    """
    weibo_list = []
    now_ts = int(time.time())
    # RUN_TYPE 1 uses the real clock; otherwise dates are pinned to the
    # fixed RUN_TEST_TIME.
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts(RUN_TEST_TIME)
    # Resolve both unames once up front; fall back to 'unknown'.
    uid2uname = {}
    try:
        portrait_docs = es_user_portrait.mget(index=portrait_index_name,
                                              doc_type=portrait_index_type,
                                              body={'ids': [uid1, uid2]},
                                              _source=False,
                                              fields=['uid', 'uname'])['docs']
    except:
        portrait_docs = []
    for doc in portrait_docs:
        if doc['found'] == True:
            uid2uname[doc['_id']] = doc['fields']['uname'][0]
        else:
            uid2uname[doc['_id']] = 'unknown'
    # Walk the previous seven daily flow-text indices, oldest first.
    for day_offset in range(7, 0, -1):
        iter_date = ts2datetime(now_date_ts - day_offset * DAY)
        index_name = flow_text_index_name_pre + str(iter_date)
        query = [{'bool': {'must': [{'term': {'uid': uid1}},
                                    {'term': {'directed_uid': int(uid2)}}]}}]
        if type_mark == 'out':
            query.append({'bool': {'must': [{'term': {'uid': uid2}},
                                            {'term': {'directed_uid': int(uid1)}}]}})
        try:
            hits = es_flow_text.search(index=index_name,
                                       doc_type=flow_text_index_type,
                                       body={'query': {'bool': {'should': query}},
                                             'sort': [{'timestamp': {'order': 'asc'}}],
                                             'size': MAX_VALUE})['hits']['hits']
        except:
            hits = []
        for hit in hits:
            source = hit['_source']
            weibo = {
                'timestamp': source['timestamp'],
                'ip': source['ip'],
                'geo': source['geo'],
                'text': '\t'.join(source['text'].split('&')),
                'uid': source['uid'],
                'directed_uid': str(source['directed_uid']),
            }
            weibo['uname'] = uid2uname[weibo['uid']]
            weibo['directed_uname'] = uid2uname[str(source['directed_uid'])]
            weibo_list.append(weibo)
    return weibo_list
def search_max_single_field(field, index_name, doctype, top_k=3):
    """Return the top_k users ranked by `field`, joined with profile data
    and a URL to the corresponding top weibo.

    field is "origin_weibo_retweeted_top_number" or
    "origin_weibo_comment_top_number".
    Each row: [rank, photo_url, uid, nick_name, count, weibo_url, '1'].
    """
    # (Dropped an unused query_body dict and count_c counter that were
    # never used by the original.)
    return_list = []
    rank = 1
    start = 0
    while 1:
        user_list = search_k(es, index_name, doctype, start, field, 100)
        start += 100
        # ROBUSTNESS: stop when the index is exhausted instead of looping
        # forever / passing an empty ids list to mget.
        if not user_list:
            return return_list
        search_list = [item.get('user', '0') for item in user_list]
        search_result = es_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                         body={"ids": search_list}, _source=True)["docs"]
        profile_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                         body={"ids": search_list}, _source=True)["docs"]
        for i in range(len(search_result)):
            if search_result[i]['found']:
                info = ['', '', '', '', '', '', '1']
                info[0] = rank
                info[2] = search_result[i].get('_id', '')
                if profile_result[i]['found']:
                    info[1] = profile_result[i]['_source'].get('photo_url', '')
                    info[3] = profile_result[i]['_source'].get('nick_name', '')
                if 'retweeted' in field:
                    temp_mid = user_list[i]['origin_weibo_top_retweeted_id']
                    info[5] = weiboinfo2url(info[2], temp_mid)
                    info[4] = user_list[i]['origin_weibo_retweeted_top_number']
                else:
                    temp_mid = user_list[i]['origin_weibo_top_comment_id']
                    info[5] = weiboinfo2url(info[2], temp_mid)
                    info[4] = user_list[i]['origin_weibo_comment_top_number']
                rank += 1
                return_list.append(info)
                if rank >= int(top_k) + 1:
                    return return_list
def get_activity_weibo(task_name, start_ts, submit_user):
    """Return weibos posted by a task's members within the four-hour
    window starting at start_ts, or an error string for bad tasks."""
    results = []
    task_id = submit_user + '-' + task_name
    # Step 1: resolve the task's uid list.
    try:
        group_result = es_group_result.get(index=group_index_name,
                                           doc_type=group_index_type,
                                           id=task_id, _source=False,
                                           fields=['uid_list'])
    except:
        group_result = {}
    if group_result == {}:
        return 'task name invalid'
    try:
        uid_list = group_result['fields']['uid_list']
    except:
        uid_list = []
    if uid_list == []:
        return 'task uid list null'
    # Step 2: uid -> uname map (kept for parity with the original; the
    # mapping is built but not referenced by the loop below).
    uid2uname = {}
    try:
        portrait_docs = es_user_portrait.mget(index=portrait_index_name,
                                              doc_type=portrait_index_type,
                                              body={'ids': uid_list},
                                              _source=False,
                                              fields=['uname'])['docs']
    except:
        portrait_docs = []
    for doc in portrait_docs:
        if doc['found'] == True:
            uid2uname[doc['_id']] = doc['fields']['uname'][0]
    # Step 3: fetch the window's weibos from the daily flow-text index.
    end_ts = start_ts + FOUR_HOUR
    text_index = flow_text_index_name_pre + ts2datetime(start_ts)
    query = [
        {'terms': {'uid': uid_list}},
        {'range': {'timestamp': {'gte': start_ts, 'lt': end_ts}}},
    ]
    try:
        hits = es_flow_text.search(index=text_index,
                                   doc_type=flow_text_index_type,
                                   body={'query': {'bool': {'must': query}},
                                         'sort': 'timestamp',
                                         'size': MAX_VALUE})['hits']['hits']
    except:
        hits = []
    for hit in hits:
        source = hit['_source']
        results.append({
            'timestamp': ts2date(source['timestamp']),
            'ip': source['ip'],
            'text': source['text'],
            'geo': '\t'.join(source['geo']),
        })
    return results
def search_tag(es, number, active_index, active_type, portrait_index, portrait_type, tag):
    """Return up to `number` active users whose portrait 'domain' contains
    `tag`, joined with profile data.

    Each row: [rank, photo_url, uid, nick_name, user_index, activeness,
    importance]. Gives up after scanning more than 100000 candidates.
    """
    return_list = []
    count_s = 0
    start = 0
    rank = 1
    while 1:
        user_list = search_k(es, active_index, active_type, start, "user_index", 10000)
        start += 10000
        search_list = [item.get('user', '0') for item in user_list]
        search_result = es_portrait.mget(index=portrait_index, doc_type=portrait_type,
                                         body={"ids": search_list}, _source=True)["docs"]
        profile_result = es_profile.mget(index="weibo_user", doc_type="user",
                                         body={"ids": search_list}, _source=True)["docs"]
        # BUGFIX: positional enumerate instead of list.index(item), which
        # is O(n) per hit and mispairs rows when equal documents repeat.
        for index, item in enumerate(search_result):
            count_s += 1
            if item['found'] and tag in item['_source']['domain']:
                info = ['', '', '', '', '', '', '']
                info[0] = rank
                if profile_result[index]['found']:
                    info[1] = profile_result[index]['_source'].get('photo_url', '')
                    info[3] = profile_result[index]['_source'].get('nick_name', '')
                info[2] = item.get('_id', '')
                info[4] = user_list[index]['user_index']
                info[5] = item['_source'].get('activeness', '')
                info[6] = item['_source'].get('importance', '')
                rank += 1
                return_list.append(info)
                if rank >= int(number) + 1:
                    return return_list
        if count_s > 100000:
            return return_list
def get_activity_weibo(task_name, start_ts, submit_user):
    """Fetch a task's member weibos for the four-hour segment beginning
    at start_ts; returns an error string when the task is invalid."""
    results = []
    task_id = submit_user + '-' + task_name
    #step1: look up the task's uid list
    try:
        group_result = es_group_result.get(index=group_index_name,
                                           doc_type=group_index_type,
                                           id=task_id, _source=False,
                                           fields=['uid_list'])
    except:
        group_result = {}
    if group_result == {}:
        return 'task name invalid'
    try:
        uid_list = group_result['fields']['uid_list']
    except:
        uid_list = []
    if uid_list == []:
        return 'task uid list null'
    #step2: uid -> uname (built for parity with the original; unused below)
    uid2uname = {}
    try:
        user_portrait_result = es_user_portrait.mget(index=portrait_index_name,
                                                     doc_type=portrait_index_type,
                                                     body={'ids': uid_list},
                                                     _source=False,
                                                     fields=['uname'])['docs']
    except:
        user_portrait_result = []
    for item in user_portrait_result:
        if item['found'] == True:
            uid2uname[item['_id']] = item['fields']['uname'][0]
    #step3: query the daily flow-text index for the window
    time_segment = FOUR_HOUR
    end_ts = start_ts + time_segment
    flow_text_index_name = flow_text_index_name_pre + ts2datetime(start_ts)
    must_query = [{'terms': {'uid': uid_list}},
                  {'range': {'timestamp': {'gte': start_ts, 'lt': end_ts}}}]
    try:
        flow_text_es_result = es_flow_text.search(index=flow_text_index_name,
                                                  doc_type=flow_text_index_type,
                                                  body={'query': {'bool': {'must': must_query}},
                                                        'sort': 'timestamp',
                                                        'size': MAX_VALUE})['hits']['hits']
    except:
        flow_text_es_result = []
    for item in flow_text_es_result:
        source = item['_source']
        weibo = {}
        weibo['timestamp'] = ts2date(source['timestamp'])
        weibo['ip'] = source['ip']
        weibo['text'] = source['text']
        weibo['geo'] = '\t'.join(source['geo'])
        results.append(weibo)
    return results
def delete_group_results(task_name, submit_user): task_id = submit_user + '-' + task_name #step1: get group uid list try: group_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,\ id=task_id)['_source'] except: return False uid_list = group_result['uid_list'] #step2: update group_tag in user_portrait query_body = {'query': {'term': {'group': task_id}}} try: user_portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,\ body={'ids': uid_list})['docs'] except: user_portrait_result = [] bulk_action = [] for item in user_portrait_result: uid = item['_id'] if item['found'] == True: try: source = item['_source'] except: source = {} try: group_tag = source['group'] except: group_tag = '' if group_tag != '': new_group_tag_list = [] group_tag_list = group_tag.split('&') for group_tag_item in group_tag_list: if group_tag_item != task_id and group_tag_item != '[email protected]': new_group_tag_list.append(group_tag_item) new_group_tag = '&'.join(new_group_tag_list) else: new_group_tag = '' action = {'update': {'_id': uid}} bulk_action.extend([action, {'doc': {'group': new_group_tag}}]) if bulk_action: # print 'bulk_action:', bulk_action es_user_portrait.bulk(bulk_action, index=portrait_index_name, doc_type=portrait_index_type) #step3: delete group results in group_manage try: print 'yes delete' result = es.delete(index=index_name, doc_type=index_type, id=task_id) except: return False return True
def ajax_get_group_detail():
    """Serve the portrait details of a managed group as a JSON list."""
    task_name = request.args.get('task_name', '')
    user = request.args.get('user', '')
    group_id = user + '-' + task_name
    portrait_detail = []
    top_activeness = get_top_influence("activeness")
    top_influence = get_top_influence("influence")
    top_importance = get_top_influence("importance")
    group_doc = es.get(index=index_group_manage, doc_type=doc_type_group,
                       id=group_id).get('_source', {})
    if group_doc:
        # The stored uid_list is either a JSON string or already a list.
        try:
            uid_list = json.loads(group_doc['uid_list'])
        except:
            uid_list = group_doc['uid_list']
        if uid_list:
            docs = es.mget(index=portrait_index_name,
                           doc_type=portrait_index_type,
                           body={"ids": uid_list},
                           fields=SOCIAL_SENSOR_INFO)['docs']
            top_of = {"activeness": top_activeness,
                      "importance": top_importance,
                      "influence": top_influence}
            for doc in docs:
                if doc['found']:
                    fields = doc['fields']
                    temp = []
                    for col in SOCIAL_SENSOR_INFO:
                        if col == "topic_string":
                            # Appended twice, as in the original layout.
                            temp.append(fields[col][0].split('&'))
                            temp.append(fields[col][0].split('&'))
                        elif col in ("activeness", "importance", "influence"):
                            temp.append(math.log(fields[col][0] / float(top_of[col]) * 9 + 1, 10) * 100)
                        else:
                            temp.append(fields[col][0])
                    portrait_detail.append(temp)
    return json.dumps(portrait_detail)
def delete_group_results(task_name, submit_user): task_id = submit_user + '-' + task_name #step1: get group uid list try: group_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,\ id=task_id)['_source'] except: return False uid_list = group_result['uid_list'] #step2: update group_tag in user_portrait query_body = {'query':{'term':{'group': task_id}}} try: user_portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,\ body={'ids': uid_list})['docs'] except: user_portrait_result = [] bulk_action = [] for item in user_portrait_result: uid = item['_id'] if item['found'] == True: try: source = item['_source'] except: source = {} try: group_tag = source['group'] except: group_tag = '' if group_tag != '': new_group_tag_list = [] group_tag_list = group_tag.split('&') for group_tag_item in group_tag_list: if group_tag_item != task_id and group_tag_item != '[email protected]': new_group_tag_list.append(group_tag_item) new_group_tag = '&'.join(new_group_tag_list) else: new_group_tag = '' action = {'update':{'_id': uid}} bulk_action.extend([action, {'doc': {'group': new_group_tag}}]) if bulk_action: print 'bulk_action:', bulk_action es_user_portrait.bulk(bulk_action, index=portrait_index_name, doc_type=portrait_index_type) #step3: delete group results in group_manage try: print 'yes delete' result = es.delete(index=index_name, doc_type=index_type, id=task_id) except: return False return True
def show_vary_detail(task_name, submit_user, vary_pattern):
    """Return [uid, uname, start_date, end_date] rows for one geo vary
    pattern ('-' separated) of a completed group task; error strings
    describe invalid/incomplete tasks.
    """
    results = []
    task_id = submit_user + '-' + task_name
    #identify the task_id exist
    try:
        source = es_group_result.get(index=group_index_name,
                                     doc_type=group_index_type,
                                     id=task_id)['_source']
    except:
        return 'group task is not exist'
    #identify the task status=1
    status = source['status']
    if status != 1:
        return 'group task is not completed'
    #get vary detail geo
    try:
        vary_detail_geo = json.loads(source['vary_detail_geo'])
    except:
        vary_detail_geo = {}
    if vary_detail_geo == {}:
        return 'vary detail geo none'
    #get vary_detail
    vary_pattern_list = vary_pattern.split('-')
    vary_pattern_key = '&'.join(vary_pattern_list)
    # BUGFIX: the lookup used the undefined name 'vary_pattern_dict'
    # (NameError); the key just built is 'vary_pattern_key'.
    uid_ts_list = vary_detail_geo[vary_pattern_key]
    uid_list = [item[0] for item in uid_ts_list]
    #get user name
    try:
        user_portrait_result = es_user_portrait.mget(index=portrait_index_name,
                                                     doc_type=portrait_index_type,
                                                     body={'ids': uid_list})['docs']
    except:
        user_portrait_result = []
    uname_dict = {}
    for portrait_item in user_portrait_result:
        uid = portrait_item['_id']
        if portrait_item['found'] == True:
            uname_dict[uid] = portrait_item['_source']['uname']
        else:
            uname_dict[uid] = uid
    #get vary detail rows
    new_detail = []
    for vary_item in uid_ts_list:
        uname = uname_dict[vary_item[0]]
        start_date = ts2datetime(vary_item[1])
        end_date = ts2datetime(vary_item[2])
        new_detail.append([vary_item[0], uname, start_date, end_date])
    return new_detail
def ajax_get_task_detail_info():
    """Return JSON details for a sensing task, with its history status
    sorted newest-first and member portrait rows sorted by column 5."""
    task_name = request.args.get('task_name', '')
    user = request.args.get('user', 'admin')
    _id = user + "-" + task_name
    task_detail = es.get(index=index_manage_sensing_task,
                         doc_type=task_doc_type, id=_id)['_source']
    task_detail["social_sensors"] = json.loads(task_detail["social_sensors"])
    history_status = json.loads(task_detail['history_status'])
    if history_status:
        # Most recent status entries first.
        task_detail['history_status'] = sorted(history_status,
                                               key=lambda x: x,
                                               reverse=True)
    else:
        task_detail['history_status'] = []
    task_detail['social_sensors_portrait'] = []
    portrait_detail = []
    if task_detail["social_sensors"]:
        docs = es.mget(index=portrait_index_name,
                       doc_type=portrait_index_type,
                       body={"ids": task_detail["social_sensors"]})['docs']
        if docs:
            for doc in docs:
                if not doc['found']:
                    continue
                row = []
                for field in SOCIAL_SENSOR_INFO:
                    if field == "topic_string":
                        row.append(doc["_source"][field].split('&'))
                    else:
                        row.append(doc["_source"][field])
                portrait_detail.append(row)
    if portrait_detail:
        # Column 5 is presumably influence — confirm SOCIAL_SENSOR_INFO order.
        portrait_detail = sorted(portrait_detail,
                                 key=lambda x: x[5], reverse=True)
        task_detail['social_sensors_portrait'] = portrait_detail
    return json.dumps(task_detail)
def show_vary_detail(task_name, submit_user, vary_pattern):
    """List [uid, uname, start_date, end_date] rows for one geo vary
    pattern ('-' separated) of a completed group task, or an error
    string for invalid/incomplete tasks.
    """
    results = []
    task_id = submit_user + '-' + task_name
    #identify the task_id exist
    try:
        source = es_group_result.get(index=group_index_name,
                                     doc_type=group_index_type,
                                     id=task_id)['_source']
    except:
        return 'group task is not exist'
    #identify the task status=1
    if source['status'] != 1:
        return 'group task is not completed'
    #get vary detail geo
    try:
        vary_detail_geo = json.loads(source['vary_detail_geo'])
    except:
        vary_detail_geo = {}
    if vary_detail_geo == {}:
        return 'vary detail geo none'
    #get vary_detail
    vary_pattern_key = '&'.join(vary_pattern.split('-'))
    # BUGFIX: the original indexed with the undefined name
    # 'vary_pattern_dict', raising NameError; use vary_pattern_key.
    uid_ts_list = vary_detail_geo[vary_pattern_key]
    uid_list = [item[0] for item in uid_ts_list]
    #get user name
    try:
        user_portrait_result = es_user_portrait.mget(index=portrait_index_name,
                                                     doc_type=portrait_index_type,
                                                     body={'ids': uid_list})['docs']
    except:
        user_portrait_result = []
    uname_dict = {}
    for portrait_item in user_portrait_result:
        uid = portrait_item['_id']
        if portrait_item['found'] == True:
            uname_dict[uid] = portrait_item['_source']['uname']
        else:
            uname_dict[uid] = uid
    #assemble the detail rows
    new_detail = []
    for vary_item in uid_ts_list:
        new_detail.append([vary_item[0],
                           uname_dict[vary_item[0]],
                           ts2datetime(vary_item[1]),
                           ts2datetime(vary_item[2])])
    return new_detail
def get_group_list(task_name, submit_user):
    """Return normalized portrait rows for a group task's members.

    Each row: [uid, uname, gender, location, importance, influence,
    activeness, sensitive] with metrics on a 0-100 log10 scale;
    incomplete portraits yield blank columns.
    """
    results = []
    task_id = submit_user + '-' + task_name
    # BUGFIX: the conditional assignment to group_index_name made it a
    # function-local everywhere, so when RUN_TYPE != 0 the read raised
    # UnboundLocalError (swallowed by the bare except, always returning
    # []). Compute a separate local with a global fallback instead.
    if RUN_TYPE == 0:
        index_name = 'test_group_result'
    else:
        index_name = group_index_name
    try:
        es_results = es_group_result.get(index=index_name,
                                         doc_type=group_index_type,
                                         id=task_id)['_source']
    except:
        return results
    uid_list = es_results['uid_list']
    user_portrait_attribute = es_user_portrait.mget(
        index=portrait_index_name, doc_type=portrait_index_type,
        body={'ids': uid_list})['docs']
    evaluate_max = get_evaluate_max()
    for item in user_portrait_attribute:
        uid = item['_id']
        try:
            source = item['_source']
            uname = source['uname']
            gender = source['gender']
            location = source['location']
            # Normalize each metric onto a 0-100 log10 scale.
            normal_importance = math.log(
                source['importance'] / evaluate_max['importance'] * 9 + 1, 10) * 100
            normal_influence = math.log(
                source['influence'] / evaluate_max['influence'] * 9 + 1, 10) * 100
            normal_activeness = math.log(
                source['activeness'] / evaluate_max['activeness'] * 9 + 1, 10) * 100
            normal_sensitive = math.log(
                source['sensitive'] / evaluate_max['sensitive'] * 9 + 1, 10) * 100
            results.append([uid, uname, gender, location,
                            normal_importance, normal_influence,
                            normal_activeness, normal_sensitive])
        except:
            results.append([uid, '', '', '', '', '', '', ''])
    return results
def query_vary_top_k(index_name, doctype, top_k, sort_index="vary"):
    """Fetch the top_k docs sorted by `sort_index` descending and join
    each with weibo profile data and a portrait membership flag.

    Row layout: [rank, photo_url, uid, nick_name, vary, '1'/'0'].
    """
    search_body = {
        "query": {"match_all": {}},
        "size": top_k,
        "sort": [{sort_index: {"order": "desc"}}],
    }
    hits = es.search(index=index_name, doc_type=doctype,
                     body=search_body)['hits']['hits']
    ids = [h['_id'] for h in hits]
    portrait_docs = es_portrait.mget(index="user_portrait", doc_type="user",
                                     body={"ids": ids}, _source=True)['docs']
    profile_docs = es_profile.mget(index="weibo_user", doc_type="user",
                                   body={"ids": ids}, _source=True)['docs']
    rows = []
    rank = 1
    for hit, profile, portrait in zip(hits, profile_docs, portrait_docs):
        row = [rank, '', '', '', '']
        if profile['found']:
            row[1] = profile['_source'].get('photo_url', '')
            row[3] = profile['_source'].get('nick_name', '')
        row[2] = hit.get('_id', '')
        row[4] = hit['_source']['vary']
        row.append('1' if portrait['found'] else '0')
        rows.append(row)
        rank += 1
    return rows
def portrait_user_vary(es, number, active_index, active_type, portrait_index, portrait_type, field="vary"): return_list = [] index_exist = es.indices.exists(index=active_index) if not index_exist: return "no active_index exist" sys.exit(0) count_s = 0 count_c = 0 start = 0 rank = 1 try: while 1: search_list = [] user_list = search_k(es, active_index, active_type, start, field, 100) start += 100 for item in user_list: uid = item.get('uid', '0') # obtain uid, notice "uid" or "user" search_list.append(uid) # uid list search_result = es_portrait.mget(index="user_portrait", doc_type="user", body={"ids": search_list}, _source=True)["docs"] profile_result = es_profile.mget(index="weibo_user", doc_type="user", body={"ids": search_list}, _source=True)["docs"] for item in search_result: count_c += 1 if item["found"]: info = ['','','','','','1'] info[0] = rank index = search_result.index(item) if profile_result[index]['found']: info[1] = profile_result[index]['_source'].get('photo_url','') info[3] = profile_result[index]['_source'].get('nick_name','') info[2] = search_result[index].get('_id','') info[4] = user_list[index]['vary'] return_list.append(info) rank += 1 if rank == int(number)+1: return return_list if count_c > 10000: break except RequestError: print "timeout" return return_list
def filter_in_uid(input_dict):
    """Filter the uids in input_dict down to those that have a user_portrait doc.

    The uids are looked up in batches of FILTER_ITER_COUNT; every found uid
    yields a row [uid, uname, photo_url, input_dict[uid]].  Batches whose
    lookup fails are silently skipped (best-effort).
    """
    uid_list = input_dict.keys()
    total = len(uid_list)
    in_portrait_result = []
    offset = 0
    # walk the uid list one FILTER_ITER_COUNT-sized slice at a time
    while offset < total:
        batch = uid_list[offset: offset + FILTER_ITER_COUNT]
        try:
            docs = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,\
                    body={'ids': batch}, _source=False, fields=['photo_url', 'uname'])['docs']
        except:
            docs = []
        for doc in docs:
            if doc['found'] == True:
                in_portrait_result.append([doc['_id'],
                                           doc['fields']['uname'][0],
                                           doc['fields']['photo_url'][0],
                                           input_dict[doc['_id']]])
        offset += FILTER_ITER_COUNT
    return in_portrait_result
def search_tag(es, number, active_index, active_type, portrait_index, portrait_type, tag):
    """Scan active_index pages for users whose portrait 'domain' contains tag.

    Returns up to `number` rows:
    [rank, photo_url, uid, nick_name, user_index, activeness, importance].
    The scan stops once 100000 candidates have been examined or the index
    is exhausted.
    """
    return_list = []
    count_s = 0
    start = 0
    rank = 1
    while 1:
        user_list = search_k(es, active_index, active_type, start, "user_index", 10000)
        start += 10000
        if not user_list:
            # fix: stop when the scan is exhausted; the original looped
            # forever (or crashed on an empty mget) when pages ran out
            return return_list
        search_list = [item.get('user', '0') for item in user_list]  # uid list
        search_result = es_portrait.mget(index=portrait_index, doc_type=portrait_type,
                                         body={"ids": search_list}, _source=True)["docs"]
        profile_result = es_profile.mget(index="weibo_user", doc_type="user",
                                         body={"ids": search_list}, _source=True)["docs"]
        # fix: enumerate replaces search_result.index(item) — O(n) per row and
        # wrong for duplicate documents
        for index, item in enumerate(search_result):
            count_s += 1
            if item['found'] and tag in item['_source']['domain']:
                info = ['', '', '', '', '', '', '']
                info[0] = rank
                if profile_result[index]['found']:
                    info[1] = profile_result[index]['_source'].get('photo_url', '')
                    info[3] = profile_result[index]['_source'].get('nick_name', '')
                info[2] = item.get('_id', '')
                info[4] = user_list[index]['user_index']
                info[5] = item['_source'].get('activeness', '')
                info[6] = item['_source'].get('importance', '')
                rank += 1
                return_list.append(info)
                if rank >= int(number) + 1:
                    return return_list
        if count_s > 100000:
            return return_list
def ajax_get_task_detail_info():
    """Return (as a JSON string) the stored detail of one sensing task:
    its social sensors, newest-first history status, and the portrait rows
    of each sensor sorted by the 6th portrait field."""
    task_name = request.args.get('task_name', '')  # task_name
    user = request.args.get('user', 'admin')
    _id = user + "-" + task_name
    task_detail = es.get(index=index_manage_sensing_task, doc_type=task_doc_type, id=_id)['_source']
    task_detail["social_sensors"] = json.loads(task_detail["social_sensors"])
    history_status = json.loads(task_detail['history_status'])
    # newest entries first; an empty history stays an empty list
    if history_status:
        task_detail['history_status'] = sorted(history_status, reverse=True)
    else:
        task_detail['history_status'] = []
    task_detail['social_sensors_portrait'] = []
    portrait_detail = []
    if task_detail["social_sensors"]:
        search_results = es.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                 body={"ids": task_detail["social_sensors"]})['docs']
        for item in search_results:
            if not item['found']:
                continue
            temp = []
            for iter_item in SOCIAL_SENSOR_INFO:
                value = item["_source"][iter_item]
                # topic_string is stored '&'-joined; expose it as a list
                temp.append(value.split('&') if iter_item == "topic_string" else value)
            portrait_detail.append(temp)
    if portrait_detail:
        portrait_detail = sorted(portrait_detail, key=lambda x: x[5], reverse=True)
    task_detail['social_sensors_portrait'] = portrait_detail
    return json.dumps(task_detail)
def get_group_member_name(task_name, submit_user):
    """Map every uid in the given group-analysis task to its portrait uname.

    Returns {uid: uname}; uids without a portrait document map to 'unknown'.
    Returns {} when the task or the portrait lookup is unavailable.
    """
    results = {}
    task_id = submit_user + '-' + task_name
    try:
        group_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,\
                id=task_id)['_source']
    except:
        return results
    uid_list = group_result['uid_list']
    try:
        user_portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,\
                body={'ids': uid_list})['docs']
    except:
        return results
    for item in user_portrait_result:
        uid = item['_id']
        if item['found'] == True:
            uname = item['_source']['uname']
        else:
            # fix: was misspelled 'unkown'; 'unknown' matches the rest of the file
            uname = 'unknown'
        results[uid] = uname
    return results
def get_group_member_name(task_name, submit_user):
    """Map every uid in the given group-analysis task to its portrait uname.

    Returns {uid: uname}; uids without a portrait document map to 'unknown'.
    Returns {} when the task or the portrait lookup is unavailable.

    NOTE(review): this definition duplicates an earlier one in the file;
    being later, it is the one that takes effect at import time.
    """
    results = {}
    task_id = submit_user + '-' + task_name
    try:
        group_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,\
                id=task_id)['_source']
    except:
        return results
    uid_list = group_result['uid_list']
    try:
        user_portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,\
                body={'ids': uid_list})['docs']
    except:
        return results
    for item in user_portrait_result:
        uid = item['_id']
        if item['found'] == True:
            uname = item['_source']['uname']
        else:
            # fix: was misspelled 'unkown'; 'unknown' matches the rest of the file
            uname = 'unknown'
        results[uid] = uname
    return results
def group_user_weibo(task_name, submit_user, sort_type):
    """Collect the last 7 days of weibos posted by members of a group task.

    Returns rows [mid, uid, uname, text, ip, city, timestamp, date,
    retweet_count, comment_count, sensitive_score, weibo_url] sorted by
    sort_type ('timestamp' / 'retweet' / 'comment' / 'sensitive'), or the
    string 'group no exist' when the task cannot be found.
    """
    weibo_list = []
    now_date = ts2datetime(time.time())
    if sort_type == 'retweet':
        sort_type = 'retweeted'
    # run_type: test mode pins the date and sorts by time only
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'
    # step1: get group user
    task_id = submit_user + '-' + task_name
    try:
        group_exist_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,\
                id=task_id)['_source']
    except:
        group_exist_result = {}
    if not group_exist_result:
        return 'group no exist'
    # step2: get the members' weibos for each of the last 7 days
    uid_list = group_exist_result['uid_list']
    for i in range(6, -1, -1):
        iter_date = ts2datetime(datetime2ts(now_date) - i * DAY)
        index_name = flow_text_index_name_pre + iter_date
        try:
            weibo_result = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,\
                    body={'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}},
                          'sort': [{sort_type: {'order': 'desc'}}], 'size': 100})['hits']['hits']
        except:
            weibo_result = []
        if weibo_result:
            weibo_list.extend(weibo_result)
    sort_weibo_list = weibo_list
    # step3: resolve uid -> uname through user_portrait
    try:
        portrait_exist_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, \
                body={'ids': uid_list})['docs']
    except:
        portrait_exist_result = []
    uid2uname_dict = {}
    for portrait_item in portrait_exist_result:
        uid = portrait_item['_id']
        if portrait_item['found'] == True:
            uname = portrait_item['_source']['uname']
        else:
            uname = 'unknown'
        uid2uname_dict[uid] = uname
    weibo_list = []
    for weibo_item in sort_weibo_list:
        source = weibo_item['_source']
        mid = source['mid']
        uid = source['uid']
        # fix: fall back to 'unknown' instead of KeyError when the portrait
        # lookup failed above and uid2uname_dict is empty
        uname = uid2uname_dict.get(uid, 'unknown')
        text = source['text']
        ip = source['geo']
        timestamp = source['timestamp']
        date = ts2date(timestamp)
        weibo_url = weiboinfo2url(uid, mid)
        # run_type: counts are only stored in production mode
        if RUN_TYPE == 1:
            try:
                retweet_count = source['retweeted']
            except:
                retweet_count = 0
            try:
                comment_count = source['comment']
            except:
                comment_count = 0
            try:
                sensitive_score = source['sensitive']
            except:
                sensitive_score = 0
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        city = ip2city(ip)
        weibo_list.append([mid, uid, uname, text, ip, city, timestamp, date,
                           retweet_count, comment_count, sensitive_score, weibo_url])
    if sort_type == 'timestamp':
        new_weibo_list = sorted(weibo_list, key=lambda x: x[6], reverse=True)
    elif sort_type == 'retweeted':
        new_weibo_list = sorted(weibo_list, key=lambda x: x[8], reverse=True)
    elif sort_type == 'comment':
        new_weibo_list = sorted(weibo_list, key=lambda x: x[9], reverse=True)
    elif sort_type == 'sensitive':
        new_weibo_list = sorted(weibo_list, key=lambda x: x[10], reverse=True)
    else:
        # fix: an unrecognized sort_type used to raise NameError here
        new_weibo_list = weibo_list
    return new_weibo_list
def get_temporal_rank(task_type, sort="retweeted", number=100):
    """Read the top `number` uids from the redis influence ranking for
    task_type (0 = cumulative, 1-4 = period buckets) sorted by `sort`
    ('retweeted' or 'comment'), then join profile, bci-history and
    portrait information.

    Each row: [uid, nick_name, weibo_count, location, fans_count,
    <sorted counter>, <other counter>, in_portrait_flag] — the counter
    sorted on always sits before the other one; statusnum/fansnum are
    overridden by bci_history values when available.
    """
    number = int(number) - 1
    if int(task_type) == 0:
        # cumulative ranking so far
        sort_list = r.zrange("influence_%s" % sort, 0, number, withscores=True, desc=True)
    elif int(task_type) == 1:
        sort_list = r.zrange("influence_%s_1" % sort, 0, number, withscores=True, desc=True)
    elif int(task_type) == 2:
        sort_list = r.zrange("influence_%s_2" % sort, 0, number, withscores=True, desc=True)
    elif int(task_type) == 3:
        sort_list = r.zrange("influence_%s_3" % sort, 0, number, withscores=True, desc=True)
    else:
        sort_list = r.zrange("influence_%s_4" % sort, 0, number, withscores=True, desc=True)
    uid_list = [item[0] for item in sort_list]
    # the counter we are not sorting on is looked up per-uid below
    if sort == "retweeted":
        other = "comment"
    else:
        other = "retweeted"
    results = []
    if uid_list:
        # background information for each ranked uid
        profile_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                              body={"ids": uid_list})["docs"]
        bci_result = es_user_profile.mget(index="bci_history", doc_type="bci",
                                          body={"ids": uid_list}, _source=False,
                                          fields=['user_fansnum', "weibo_month_sum"])["docs"]
        # fix: enumerate replaces profile_result.index(item), which is O(n)
        # per row and returns the wrong position for duplicate documents;
        # it also subsumes the separate hand-maintained `count` variable
        for count, item in enumerate(profile_result):
            _id = item['_id']
            tmp = [_id]
            if item['found']:
                src = item['_source']
                tmp.append(src['nick_name'])
                tmp.append(src['statusnum'])
                tmp.append(src['user_location'])
                tmp.append(src['fansnum'])
            else:
                tmp.extend(['', 0, '', 0])
            # prefer bci_history numbers when present
            try:
                tmp[4] = bci_result[count]['fields']['user_fansnum'][0]
            except:
                pass
            try:
                tmp[2] = bci_result[count]['fields']["weibo_month_sum"][0]
            except:
                pass
            count_1 = int(sort_list[count][1])
            if int(task_type) == 0:
                tmp_count = r.zscore("influence_%s" % other, _id)
            else:
                tmp_count = r.zscore("influence_%s_%s" % (other, task_type), _id)
            count_2 = int(tmp_count) if tmp_count else 0
            if sort == "retweeted":
                tmp.append(count_1)
                tmp.append(count_2)
            else:
                tmp.append(count_2)
                tmp.append(count_1)
            results.append(tmp)
        # flag whether each uid already has a user portrait
        portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                                body={"ids": uid_list})["docs"]
        for count, item in enumerate(portrait_result):
            results[count].append("1" if item['found'] else "0")
    return results
def search_group_results(task_name, module, submit_user): result = {} if RUN_TYPE == 0: #jln #task_id = '媒体' #group_index_type='text' task_id = submit_user + '-' + task_name group_index_type = 'group' else: task_id = submit_user + '-' + task_name #print es_group_result,group_index_name,group_index_type,task_id #step1:identify the task_name exist try: source = es_group_result.get(index=group_index_name, doc_type=group_index_type, \ id=task_id)['_source'] print source except: return 'group task is not exist' #step2: identify the task status=1(analysis completed) status = source['status'] if status != 1: return 'group task is not completed' #step3:get module result if module == 'overview': result['task_name'] = source['task_name'] result['submit_date'] = ts2datetime(source['submit_date']) result['state'] = source['state'] result['submit_user'] = source['submit_user'] result['density_star'] = source['density_star'] result['activeness_star'] = source['activeness_star'] result['influence_star'] = source['influence_star'] result['importance_star'] = source['importance_star'] #need to delete result['tag_vector'] = json.loads(source['tag_vector']) elif module == 'basic': result['gender'] = json.loads(source['gender']) result['verified'] = json.loads(source['verified']) result['user_tag'] = json.loads(source['user_tag']) result['count'] = source['count'] result['domain'] = json.loads(source['domain']) result['topic'] = json.loads(source['topic']) elif module == 'activity': result['activity_trend'] = json.loads(source['activity_trend']) result['activity_time'] = json.loads(source['activity_time']) result['activity_geo_disribution'] = json.loads( source['activity_geo_distribution']) result['activiy_geo_vary'] = json.loads(source['activity_geo_vary']) result['activeness_trend'] = json.loads(source['activeness']) result['activeness_his'] = json.loads(source['activeness_his']) result['activeness_description'] = source['activeness_description'] result['online_pattern'] = 
json.loads(source['online_pattern']) #yuanhuiru uid_list = source['uid_list'] user_photo_result = es_user_portrait.mget(index='user_portrait_1222', doc_type='user', body={'ids': uid_list}, fields=['photo_url'])['docs'] influ_value_result = es_user_portrait.mget(index='user_portrait_1222', doc_type='user', body={'ids': uid_list}, fields=['influence' ])['docs'] result['photo_url'] = [] result['influence'] = [] for item in user_photo_result: #uid = item['_id'] if item['found'] == True: source = item['fields'] photo_url = source['photo_url'] else: photo_url = 'unknown' result['photo_url'].append(photo_url) #print 'user_photo', result['photo_url'] for item in influ_value_result: #uid = item['_id'] if item['found'] == True: source = item['fields'] influence = source['influence'] else: influence = 'unknown' result['influence'].append(influence) #print 'influence', result['influence'] new_geo = {} for uid, geos in result['activity_geo_disribution'].iteritems(): for geo, count in geos.iteritems(): geo = geo.split('\t') if geo[0] == u'中国': if len(geo) == 1: geo.append(u'未知', u'未知') elif len(geo) == 2: geo.append(u'未知') try: new_geo[geo[1]]['total'] += count except: new_geo[geo[1]] = {'total': count} try: new_geo[geo[1]][geo[2]] += count except: new_geo[geo[1]][geo[2]] = count result['new_geo'] = new_geo try: vary_detail_geo_dict = json.loads(source['vary_detail_geo']) except: vary_detail_geo_dict = {} #uid_list = source['uid_list'] if vary_detail_geo_dict != {}: result['vary_detail_geo'] = get_vary_detail_info( vary_detail_geo_dict, uid_list) else: result['vary_detail_geo'] = {} try: main_start_geo_dict = json.loads(source['main_start_geo']) except: main_start_geo_dict = {} result['main_start_geo'] = sorted(main_start_geo_dict.items(), key=lambda x: x[1], reverse=True) try: main_end_geo_dict = json.loads(source['main_end_geo']) except: main_end_geo_dict = {} result['main_end_geo'] = sorted(main_end_geo_dict.items(), key=lambda x: x[1], reverse=True) #all_geo_list = 
list(set(main_start_geo_dict.keys()) | set(main_end_geo_dict.keys())) #result['geo_lat_lng'] = get_lat_lng(all_geo_list) print 'result!!!!!!', result elif module == 'preference': try: result['keywords'] = json.loads(source['filter_keyword']) except: f_keyword = json.loads(source['keywords']) key_str = ','.join([key[0] for key in f_keyword]) filter_dict = get_weibo_single(key_str, n_count=100) result['keywords'] = sorted(filter_dict.iteritems(), key=lambda x: x[1], reverse=True) ''' keyword_list = json.loads(source['keywords']) keyword_dict = dict() for item in keyword_list: keyword_dict[item[0]] = item[1] filter_keyword_dict = keyword_filter(keyword_dict) sort_keyword = sorted(filter_keyword_dict.items(), key=lambda x:x[1], reverse=True) result['keywords'] = sort_keyword ''' result['hashtag'] = json.loads(source['hashtag']) result['sentiment_word'] = json.loads(source['sentiment_word']) try: result['topic_model'] = json.loads(source['topic_model']) except: result['topic_model'] = [] #need to delete result['domain'] = json.loads(source['domain']) result['topic'] = json.loads(source['topic']) elif module == 'influence': result['influence_his'] = json.loads(source['influence_his']) result['influence_trend'] = json.loads(source['influence']) result['influence_in_user'] = json.loads(source['influence_in_user']) result['influence_out_user'] = json.loads(source['influence_out_user']) elif module == 'social': result['in_density'] = source['in_density'] result['in_inter_user_ratio'] = source['in_inter_user_ratio'] result['in_inter_weibo_ratio'] = source['in_inter_weibo_ratio'] result['social_in_record'] = json.loads(source['social_in_record']) result['out_inter_user_ratio'] = source['out_inter_user_ratio'] result['out_inter_weibo_ratio'] = source['out_inter_weibo_ratio'] result['social_out_record'] = json.loads(source['social_out_record']) result['density_description'] = source['density_description'] result['mention'] = source['mention'] elif module == 'think': 
result['sentiment_trend'] = json.loads(source['sentiment_trend']) result['sentiment_pie'] = json.loads(source['sentiment_pie']) result['character'] = json.loads(source['character']) return result
def portrait_user_vary(es, number, active_index, active_type, portrait_index, portrait_type, field="vary"): return_list = [] index_exist = es.indices.exists(index=active_index) if not index_exist: return "no active_index exist" sys.exit(0) count_s = 0 count_c = 0 start = 0 rank = 1 try: while 1: search_list = [] user_list = search_k(es, active_index, active_type, start, field, 100) start += 100 for item in user_list: uid = item.get('uid', '0') # obtain uid, notice "uid" or "user" search_list.append(uid) # uid list search_result = es_portrait.mget(index="user_portrait", doc_type="user", body={"ids": search_list}, _source=True)["docs"] profile_result = es_profile.mget(index="weibo_user", doc_type="user", body={"ids": search_list}, _source=True)["docs"] for item in search_result: count_c += 1 if item["found"]: info = ['', '', '', '', '', '1'] info[0] = rank index = search_result.index(item) if profile_result[index]['found']: info[1] = profile_result[index]['_source'].get( 'photo_url', '') info[3] = profile_result[index]['_source'].get( 'nick_name', '') info[2] = search_result[index].get('_id', '') info[4] = user_list[index]['vary'] return_list.append(info) rank += 1 if rank == int(number) + 1: return return_list if count_c > 10000: break except RequestError: print "timeout" return return_list
def search_group_results(task_name, module, submit_user): result = {} if RUN_TYPE == 0: #jln #task_id = '媒体' #group_index_type='text' task_id = submit_user + '-' + task_name group_index_type = 'group' else: task_id = submit_user + '-' + task_name #print es_group_result,group_index_name,group_index_type,task_id #step1:identify the task_name exist try: source = es_group_result.get(index=group_index_name, doc_type=group_index_type, \ id=task_id)['_source'] print source except: return 'group task is not exist' #step2: identify the task status=1(analysis completed) status = source['status'] if status != 1: return 'group task is not completed' #step3:get module result if module == 'overview': result['task_name'] = source['task_name'] result['submit_date'] = ts2datetime(source['submit_date']) result['state'] = source['state'] result['submit_user'] = source['submit_user'] result['density_star'] = source['density_star'] result['activeness_star'] = source['activeness_star'] result['influence_star'] = source['influence_star'] result['importance_star'] = source['importance_star'] #need to delete result['tag_vector'] = json.loads(source['tag_vector']) elif module == 'basic': result['gender'] = json.loads(source['gender']) result['verified'] = json.loads(source['verified']) result['user_tag'] = json.loads(source['user_tag']) result['count'] = source['count'] result['domain'] = json.loads(source['domain']) result['topic'] = json.loads(source['topic']) elif module == 'activity': result['activity_trend'] = json.loads(source['activity_trend']) result['activity_time'] = json.loads(source['activity_time']) result['activity_geo_disribution'] = json.loads(source['activity_geo_distribution']) result['activiy_geo_vary'] = json.loads(source['activity_geo_vary']) result['activeness_trend'] = json.loads(source['activeness']) result['activeness_his'] = json.loads(source['activeness_his']) result['activeness_description'] = source['activeness_description'] result['online_pattern'] = 
json.loads(source['online_pattern']) #yuanhuiru uid_list = source['uid_list'] user_photo_result= es_user_portrait.mget(index='user_portrait_1222', doc_type='user', body={'ids':uid_list}, fields=['photo_url'])['docs'] influ_value_result= es_user_portrait.mget(index='user_portrait_1222', doc_type='user', body={'ids':uid_list}, fields=['influence'])['docs'] result['photo_url']=[] result['influence']=[] for item in user_photo_result: #uid = item['_id'] if item['found']==True: source = item['fields'] photo_url = source['photo_url'] else: photo_url = 'unknown' result['photo_url'].append(photo_url) #print 'user_photo', result['photo_url'] for item in influ_value_result: #uid = item['_id'] if item['found']==True: source = item['fields'] influence = source['influence'] else: influence = 'unknown' result['influence'].append(influence) #print 'influence', result['influence'] new_geo = {} for uid,geos in result['activity_geo_disribution'].iteritems(): for geo,count in geos.iteritems(): geo = geo.split('\t') if geo[0] == u'中国': if len(geo) == 1: geo.append(u'未知',u'未知') elif len(geo) == 2: geo.append(u'未知') try: new_geo[geo[1]]['total'] += count except: new_geo[geo[1]] = {'total':count} try: new_geo[geo[1]][geo[2]] += count except: new_geo[geo[1]][geo[2]] = count result['new_geo'] = new_geo try: vary_detail_geo_dict = json.loads(source['vary_detail_geo']) except: vary_detail_geo_dict = {} #uid_list = source['uid_list'] if vary_detail_geo_dict != {}: result['vary_detail_geo'] = get_vary_detail_info(vary_detail_geo_dict, uid_list) else: result['vary_detail_geo'] = {} try: main_start_geo_dict = json.loads(source['main_start_geo']) except: main_start_geo_dict = {} result['main_start_geo'] = sorted(main_start_geo_dict.items(), key=lambda x:x[1], reverse=True) try: main_end_geo_dict = json.loads(source['main_end_geo']) except: main_end_geo_dict = {} result['main_end_geo'] = sorted(main_end_geo_dict.items(), key=lambda x:x[1], reverse=True) #all_geo_list = 
list(set(main_start_geo_dict.keys()) | set(main_end_geo_dict.keys())) #result['geo_lat_lng'] = get_lat_lng(all_geo_list) print 'result!!!!!!',result elif module == 'preference': try: result['keywords'] = json.loads(source['filter_keyword']) except: f_keyword = json.loads(source['keywords']) key_str = ','.join([key[0] for key in f_keyword]) filter_dict = get_weibo_single(key_str,n_count=100) result['keywords'] = sorted(filter_dict.iteritems(),key=lambda x:x[1],reverse= True) ''' keyword_list = json.loads(source['keywords']) keyword_dict = dict() for item in keyword_list: keyword_dict[item[0]] = item[1] filter_keyword_dict = keyword_filter(keyword_dict) sort_keyword = sorted(filter_keyword_dict.items(), key=lambda x:x[1], reverse=True) result['keywords'] = sort_keyword ''' result['hashtag'] = json.loads(source['hashtag']) result['sentiment_word'] = json.loads(source['sentiment_word']) try: result['topic_model'] = json.loads(source['topic_model']) except: result['topic_model'] = [] #need to delete result['domain'] = json.loads(source['domain']) result['topic'] = json.loads(source['topic']) elif module == 'influence': result['influence_his'] = json.loads(source['influence_his']) result['influence_trend'] = json.loads(source['influence']) result['influence_in_user'] = json.loads(source['influence_in_user']) result['influence_out_user'] = json.loads(source['influence_out_user']) elif module == 'social': result['in_density'] = source['in_density'] result['in_inter_user_ratio'] = source['in_inter_user_ratio'] result['in_inter_weibo_ratio'] = source['in_inter_weibo_ratio'] result['social_in_record'] = json.loads(source['social_in_record']) result['out_inter_user_ratio'] = source['out_inter_user_ratio'] result['out_inter_weibo_ratio'] = source['out_inter_weibo_ratio'] result['social_out_record'] = json.loads(source['social_out_record']) result['density_description'] = source['density_description'] result['mention'] = source['mention'] elif module == 'think': 
result['sentiment_trend'] = json.loads(source['sentiment_trend']) result['sentiment_pie'] = json.loads(source['sentiment_pie']) result['character'] = json.loads(source['character']) return result
def influenced_people(uid, mid, influence_style, date, default_number=20):
    """Find the users influenced by one weibo and summarize their portraits.

    uid   -- author of the weibo
    mid   -- the weibo; for a retweet, its root_mid is resolved first
    influence_style -- 0: retweeters, 1: commenters
    date  -- 'YYYY-MM-DD' day to query (also selects the bci_* index)
    default_number  -- how many user urls to return per portrait bucket

    Returns {'influence_users': [in_portrait_urls, out_portrait_urls],
             'influence_distribution': {...domain/topic/geo/influence stats...}}.
    """
    date1 = ts2datetime(datetime2ts(date)).replace('-', '')
    index_name = pre_index + date1
    index_flow_text = pre_text_index + date
    text_result = es.get(index=index_flow_text, doc_type=flow_text_index_type, id=mid)["_source"]
    temp_mid = text_result.get("root_mid",'')
    # decide whether the weibo is original: an original weibo has no root_mid
    if temp_mid:
        mid_type = 1  # non-original weibo (retweet/comment)
    else:
        mid_type = 0  # original weibo
    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must":[
                        ]
                    }
                }
            }
        },
        "size": 30000
    }
    # message_type 3 = retweet, 2 = comment (per the terms used below)
    if mid_type == 0:
        if int(influence_style) == 0:
            # origin weibo, all retweeted people
            query_body["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term": {"root_uid": uid}}, {"term": {"message_type": 3}}, {"term": {"root_mid": mid}}])
        else:
            # commented people
            query_body["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term": {"directed_uid": uid}}, {"term": {"message_type": 2}}, {"term": {"root_mid": mid}}])
    else:
        if int(influence_style) == 0:
            # origin weibo, all retweeted people
            query_body["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term": {"directed_uid": uid}}, {"term": {"message_type": 3}}, {"term": {"root_mid": temp_mid}}])
        else:
            # commented people
            query_body["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term": {"directed_uid": uid}}, {"term": {"message_type": 2}}, {"term": {"root_mid": temp_mid}}])
    search_results = es.search(index=index_flow_text, doc_type=flow_text_index_type, body=query_body, fields=["uid"], timeout=30)["hits"]["hits"]
    results = []  # deduplicated uid_list, excluding the author himself
    if search_results:
        for item in search_results:
            if int(item["fields"]["uid"][0]) == int(uid):
                pass
            else:
                results.append(item["fields"]["uid"][0])
        results = list(set(results))
    else:
        results = []
    bci_index = "bci_" + date.replace('-','')
    if results:
        portrait_results = es_user_portrait.mget(index=user_portrait, doc_type=portrait_index_type, body={"ids": results}, fields=["domain", "topic_string", "activity_geo_dict","importance", "influence"])["docs"]
        bci_results = es_user_portrait.mget(index=bci_index, doc_type='bci', body={"ids":results}, fields=['user_index'])['docs']
    else:
        portrait_results = {}
        bci_results = {}
    in_portrait = []       # [uid, importance] rows for uids with a portrait
    out_portrait = []      # uids without a portrait
    in_portrait_info = []  # NOTE(review): never populated or read
    retweeted_domain = {}
    retweeted_topic = {}
    retweeted_geo = {}
    average_influence = 0
    total_influence = 0
    count = 0
    if bci_results:
        # average bci user_index over all influenced uids
        total_influence = 0
        for item in bci_results:
            if item['found']:
                total_influence += item['fields']['user_index'][0]
        try:
            average_influence = total_influence/len(results)
        except:
            average_influence = 0
    if portrait_results:
        for item in portrait_results:
            if item["found"]:
                temp = []
                count += 1
                temp.append(item['_id'])
                temp.append(item["fields"]["importance"][0])
                in_portrait.append(temp)
                # '&'-joined fields; geo uses the most recent period's keys
                temp_domain = item["fields"]["domain"][0].split('&')
                temp_topic = item["fields"]["topic_string"][0].split('&')
                temp_geo = json.loads(item["fields"]["activity_geo_dict"][0])[-1].keys()
                retweeted_domain = aggregation(temp_domain, retweeted_domain)
                retweeted_topic = aggregation(temp_topic, retweeted_topic)
                retweeted_geo = aggregation(temp_geo, retweeted_geo)
            else:
                out_portrait.append(item['_id'])
        retweeted_domain = proportion(retweeted_domain)
        retweeted_topic = proportion(retweeted_topic)
        retweeted_geo = proportion(retweeted_geo)
    sorted_retweeted_domain = sorted(retweeted_domain.items(),key=lambda x:x[1], reverse=True)
    sorted_retweeted_topic = sorted(retweeted_topic.items(),key=lambda x:x[1], reverse=True)
    sorted_retweeted_geo = sorted(retweeted_geo.items(), key=lambda x:x[1], reverse=True)
    retweeted_results = dict()
    # NOTE(review): 'domian' is a typo but is kept — downstream consumers
    # (see influenced_user_detail) read the same misspelled key
    retweeted_results["domian"] = sorted_retweeted_domain[:5]
    retweeted_results["topic"] = sorted_retweeted_topic[:5]
    retweeted_results["geo"] = sorted_retweeted_geo[:5]
    retweeted_results["influence"] = average_influence
    in_portrait = sorted(in_portrait, key=lambda x:x[1], reverse=True)
    temp_list = []
    for item in in_portrait:
        temp_list.append(item[0])
    retweeted_results['in_portrait_number'] = len(temp_list)
    retweeted_results['out_portrait_number'] = len(out_portrait)
    in_portrait_url = get_user_url(temp_list[:default_number])
    out_portrait_url = get_user_url(out_portrait[:default_number])
    return_results = dict()
    return_results["influence_users"] = [in_portrait_url, out_portrait_url]
    return_results["influence_distribution"] = retweeted_results
    return return_results
def influenced_user_detail(uid, date, origin_retweeted_mid, retweeted_retweeted_mid, message_type, default_number=20):
    """Detail the users influenced by a set of weibos of one author.

    uid  -- the author
    date -- 'YYYY-MM-DD' day to query (also selects the bci_* index)
    origin_retweeted_mid    -- mids of original weibos (matched via root_uid)
    retweeted_retweeted_mid -- mids of retweeted weibos (matched via directed_uid)
    message_type            -- ES message_type to filter on
    default_number          -- how many user urls to return per bucket

    Returns a dict with domain/topic/geo distributions, average bci influence,
    and in/out-portrait user url lists.
    """
    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must": [
                        ]
                    }
                }
            }
        },
        "size":100000,
        "sort":{"user_fansnum":{"order":"desc"}}
    }
    # the users influenced, in detail
    date1 = str(date).replace('-', '')
    index_name = pre_index + date1
    index_flow_text = pre_text_index + date
    origin_retweeted_uid = []  # influenced user uid_list
    retweeted_retweeted_uid = []
    origin_comment_uid = []    # NOTE(review): never populated or read
    retweeted_comment_uid = [] # NOTE(review): never populated or read
    query_origin = copy.deepcopy(query_body)
    query_retweeted = copy.deepcopy(query_body)
    if origin_retweeted_mid:
        # all users who reacted to the original weibo(s)
        query_origin["query"]["filtered"]["filter"]["bool"]["must"].append({"terms": {"root_mid": origin_retweeted_mid}})
        query_origin["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term":{"message_type": message_type}}, {"term":{"root_uid": uid}}])
        origin_retweeted_result = es.search(index=index_flow_text, doc_type=flow_text_index_type, body=query_origin, fields=["uid"])["hits"]["hits"]
        if origin_retweeted_result:
            for item in origin_retweeted_result:
                origin_retweeted_uid.append(item["fields"]["uid"][0])
    if retweeted_retweeted_mid:
        # all users who reacted to the retweeted weibo(s)
        query_retweeted["query"]["filtered"]["filter"]["bool"]["must"].append({"terms": {"root_mid": retweeted_retweeted_mid}})
        query_retweeted["query"]["filtered"]["filter"]["bool"]["must"].extend([{"term":{"message_type": message_type}},{"term": {"directed_uid": uid}}])
        retweeted_retweeted_result = es.search(index=index_flow_text, doc_type=flow_text_index_type, body=query_retweeted, fields=["uid"])["hits"]["hits"]
        if retweeted_retweeted_result:
            for item in retweeted_retweeted_result:
                retweeted_retweeted_uid.append(item["fields"]["uid"][0])
    retweeted_uid_list = []  # all retweeted user list
    retweeted_results = {}   # statistics of all retweeted uid information
    retweeted_domain = {}
    retweeted_topic = {}
    retweeted_geo = {}
    bci_results = {}
    in_portrait = []
    out_portrait = []
    average_influence = 0
    total_influence = 0
    count = 0
    all_uid_set = set(origin_retweeted_uid) | set(retweeted_retweeted_uid)  # NOTE(review): computed but never used
    retweeted_uid_list.extend(origin_retweeted_uid)
    retweeted_uid_list.extend(retweeted_retweeted_uid)
    retweeted_uid_list = list(set(retweeted_uid_list) - set([uid]))  # dedupe and drop the author
    if retweeted_uid_list:
        user_portrait_result = es_user_portrait.mget(index=user_portrait, doc_type=portrait_index_type, body={"ids": retweeted_uid_list}, fields=["domain", "topic_string", "activity_geo_dict","importance", "influence"])["docs"]
        bci_index = "bci_" + date.replace('-', '')
        bci_results = es_cluster.mget(index=bci_index, doc_type="bci", body={"ids":retweeted_uid_list}, fields=['user_index'])["docs"]
        for item in user_portrait_result:
            if item["found"]:
                temp = []
                count += 1
                temp.append(item['_id'])
                temp.append(item["fields"]["importance"][0])
                in_portrait.append(temp)
                # '&'-joined fields; geo uses the most recent period's keys
                temp_domain = item["fields"]["domain"][0].split('&')
                temp_topic = item["fields"]["topic_string"][0].split('&')
                temp_geo = json.loads(item["fields"]["activity_geo_dict"][0])[-1].keys()
                retweeted_domain = aggregation(temp_domain, retweeted_domain)
                retweeted_topic = aggregation(temp_topic, retweeted_topic)
                retweeted_geo = aggregation(temp_geo, retweeted_geo)
            else:
                out_portrait.append(item['_id'])
        retweeted_domain = proportion(retweeted_domain)
        retweeted_topic = proportion(retweeted_topic)
        retweeted_geo = proportion(retweeted_geo)
    if bci_results:
        # average bci user_index over all influenced uids
        total_influence = 0
        for item in bci_results:
            if item['found']:
                total_influence += item['fields']['user_index'][0]
        try:
            average_influence = total_influence/len(retweeted_uid_list)
        except:
            average_influence = 0
    sorted_retweeted_domain = sorted(retweeted_domain.items(),key=lambda x:x[1], reverse=True)
    sorted_retweeted_topic = sorted(retweeted_topic.items(),key=lambda x:x[1], reverse=True)
    sorted_retweeted_geo = sorted(retweeted_geo.items(), key=lambda x:x[1], reverse=True)
    # NOTE(review): 'domian' is a typo but is kept — it matches the key
    # emitted by influenced_people and consumed downstream
    retweeted_results["domian"] = sorted_retweeted_domain[:5]
    retweeted_results["topic"] = sorted_retweeted_topic[:5]
    retweeted_results["geo"] = sorted_retweeted_geo[:5]
    retweeted_results["influence"] = average_influence
    in_portrait = sorted(in_portrait, key=lambda x:x[1], reverse=True)
    temp_list = []
    for item in in_portrait:
        temp_list.append(item[0])
    retweeted_results['in_portrait_number'] = len(temp_list)
    retweeted_results['out_portrait_number'] = len(out_portrait)
    in_portrait_url = get_user_url(temp_list[:default_number])
    out_portrait_url = get_user_url(out_portrait[:default_number])
    retweeted_results["in_portrait"] = in_portrait_url
    retweeted_results["out_portrait"] = out_portrait_url
    retweeted_results["total_number"] = len(temp_list) + len(out_portrait)
    return retweeted_results
def group_user_weibo(task_name, submit_user, sort_type):
    """Search the last 7 days of weibo posted by the users of a group task.

    task_name/submit_user identify the group task ("submit_user-task_name" is
    the es document id). sort_type selects the ranking field: 'retweet' (alias
    of 'retweeted'), 'timestamp', 'comment' or 'sensitive'.
    Returns the string 'group no exist' when the task document is missing,
    otherwise a list of rows
    [mid, uid, uname, text, ip, city, timestamp, date, retweet_count,
     comment_count, sensitive_score, weibo_url] sorted descending.

    Fix in this revision: an unrecognized sort_type previously left
    new_weibo_list unbound and raised UnboundLocalError at the return; it now
    falls back to the unsorted list.
    """
    weibo_list = []
    now_date = ts2datetime(time.time())
    if sort_type == 'retweet':
        # the flow_text index stores the retweet count under 'retweeted'
        sort_type = 'retweeted'
    #run_type: test mode pins the date and sorts by time
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'
    #step1: get group user
    task_id = submit_user + '-' + task_name
    try:
        group_exist_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,\
                id=task_id)['_source']
    except:
        group_exist_result = {}
    if not group_exist_result:
        return 'group no exist'
    #step2: get user weibo list (one flow_text index per day, last 7 days)
    uid_list = group_exist_result['uid_list']
    for i in range(6,-1,-1):
        iter_date = ts2datetime(datetime2ts(now_date) - i * DAY)
        index_name = flow_text_index_name_pre + iter_date
        try:
            weibo_result = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,\
                    body={'query':{'filtered':{'filter':{'terms':{'uid': uid_list}}}}, 'sort':[{sort_type: {'order': 'desc'}}], 'size':100})['hits']['hits']
        except:
            weibo_result = []
        if weibo_result:
            weibo_list.extend(weibo_result)
    #sort_weibo_list = sorted(weibo_list, key=lambda x:x['_source'][sort_type], reverse=True)[:100]
    sort_weibo_list = weibo_list
    #step3: get user name (uid -> uname, 'unknown' when not in portrait)
    try:
        portrait_exist_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, \
                body={'ids':uid_list})['docs']
    except:
        portrait_exist_result = []
    uid2uname_dict = {}
    for portrait_item in portrait_exist_result:
        uid = portrait_item['_id']
        if portrait_item['found'] == True:
            source = portrait_item['_source']
            uname = source['uname']
        else:
            uname = 'unknown'
        uid2uname_dict[uid] = uname
    weibo_list = []
    for weibo_item in sort_weibo_list:
        source = weibo_item['_source']
        mid = source['mid']
        uid = source['uid']
        uname = uid2uname_dict[uid]
        text = source['text']
        ip = source['geo']  # NOTE(review): 'geo' field is fed to ip2city as an IP — confirm
        timestamp = source['timestamp']
        date = ts2date(timestamp)
        sentiment = source['sentiment']  # currently unused in the output row
        weibo_url = weiboinfo2url(uid, mid)
        #run_type: the interaction counts only exist on the production index
        if RUN_TYPE == 1:
            try:
                retweet_count = source['retweeted']
            except:
                retweet_count = 0
            try:
                comment_count = source['comment']
            except:
                comment_count = 0
            try:
                sensitive_score = source['sensitive']
            except:
                sensitive_score = 0
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        city = ip2city(ip)
        weibo_list.append([mid, uid, uname, text, ip, city, timestamp, date, retweet_count, comment_count, sensitive_score, weibo_url])
    # rank by the requested column of the row layout built above
    if sort_type == 'timestamp':
        new_weibo_list = sorted(weibo_list, key=lambda x:x[6], reverse=True)
    elif sort_type == 'retweeted':
        new_weibo_list = sorted(weibo_list, key=lambda x:x[8], reverse=True)
    elif sort_type == 'comment':
        new_weibo_list = sorted(weibo_list, key=lambda x:x[9], reverse=True)
    elif sort_type == 'sensitive':
        new_weibo_list = sorted(weibo_list, key=lambda x:x[10], reverse=True)
    else:
        # fix: unknown sort_type used to raise UnboundLocalError here
        new_weibo_list = weibo_list
    return new_weibo_list
def search_group_sentiment_weibo(task_name, start_ts, sentiment, submit_user):
    """Search one day of weibo with a given sentiment posted by a group's users.

    task_name/submit_user identify the group task ("submit_user-task_name" is
    the es document id). start_ts is the day-start timestamp; the search range
    is [start_ts, start_ts + DAY). sentiment is a string code — '2' expands to
    the SENTIMENT_SECOND code set, any other value is matched exactly.
    Returns 'task name invalid' / 'task uid list null' on lookup failure,
    otherwise a list of weibo dicts sorted by timestamp ascending.

    NOTE(review): a second definition of this function appears later in this
    file and shadows this one at import time — the two should be unified.
    """
    weibo_list = []
    task_id = submit_user + '-' + task_name
    #print es_group_result,group_index_name,group_index_type
    #step1:get task_name uid
    try:
        group_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,\
                id=task_id, _source=False, fields=['uid_list'])
    except:
        group_result = {}
    if group_result == {}:
        return 'task name invalid'
    try:
        uid_list = group_result['fields']['uid_list']
    except:
        uid_list = []
    if uid_list == []:
        return 'task uid list null'
    #step3: get ui2uname (uid -> display name, 'unknown' when not in portrait)
    uid2uname = {}
    try:
        user_portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,\
                body={'ids':uid_list}, _source=False, fields=['uname'])['docs']
    except:
        user_portrait_result = []
    for item in user_portrait_result:
        uid = item['_id']
        if item['found']==True:
            uname = item['fields']['uname'][0]
            uid2uname[uid] = uname
        else:
            uid2uname[uid] = 'unknown'
    #step4:iter date to search weibo
    weibo_list = []
    iter_date = ts2datetime(start_ts)
    flow_text_index_name = flow_text_index_name_pre + str(iter_date)
    #step4: get query_body
    if sentiment != '2':
        query_body = [{'terms': {'uid': uid_list}}, {'term':{'sentiment': sentiment}}, \
                {'range':{'timestamp':{'gte':start_ts, 'lt': start_ts+DAY}}}]
    else:
        # '2' stands for the whole second-level sentiment group
        query_body = [{'terms':{'uid':uid_list}}, {'terms':{'sentiment': SENTIMENT_SECOND}},\
                {'range':{'timestamp':{'gte':start_ts, 'lt':start_ts+DAY}}}]
    try:
        flow_text_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\
                body={'query':{'bool':{'must': query_body}}, 'sort': [{'timestamp':{'order':'asc'}}], 'size': MAX_VALUE})['hits']['hits']
    except:
        flow_text_result = []
    for flow_text_item in flow_text_result:
        source = flow_text_item['_source']
        weibo = {}
        weibo['uid'] = source['uid']
        weibo['uname'] = uid2uname[weibo['uid']]
        weibo['ip'] = source['ip']
        try:
            # geo is stored '&'-separated; render it tab-separated
            weibo['geo'] = '\t'.join(source['geo'].split('&'))
        except:
            weibo['geo'] = ''
        weibo['text'] = source['text']
        weibo['timestamp'] = source['timestamp']
        weibo['sentiment'] = source['sentiment']
        weibo_list.append(weibo)
    return weibo_list
def get_task_detail_2(task_name, ts, user):
    """Assemble the detail view of a social-sensing task.

    Loads the task document (id "user-task_name") from the sensing-task
    management index, then builds:
      - social_sensors_detail: portrait rows for the task's sensor users
      - time_series plus per-point origin/retweeted/all weibo counts
      - important_user_detail: portrait info for important users seen so far
      - out_portrait_user_detail: profile/bci info for users not in portrait
    Returns the aggregated results dict.

    Fixes in this revision:
      - out_user_detail_info is now initialized (its initialization had been
        commented out, so every call raised NameError);
      - removed a large commented-out burst-analysis block, debug prints and
        dead assignments (see version control history to revive the former).
    """
    results = dict()
    _id = user + "-" + task_name
    task_detail = es.get(index=index_manage_sensing_task, doc_type=task_doc_type, id=_id)["_source"]
    task_name = task_detail['task_name']
    social_sensors = json.loads(task_detail['social_sensors'])
    history_status = json.loads(task_detail['history_status'])
    start_time = task_detail['create_at']   # currently unused; kept so a missing field still fails loudly
    create_by = task_detail['create_by']    # currently unused
    stop_time = task_detail['stop_time']    # currently unused
    remark = task_detail.get('remark', '')  # currently unused (was the warning conclusion)
    portrait_detail = []
    count = 0  # counter over history points
    # Normalization bases for the log-scaled 0-100 scores computed below.
    top_influence = get_top_influence("influence")
    top_activeness = get_top_influence("activeness")
    top_importance = get_top_influence("importance")
    if social_sensors:
        search_results = es.mget(index=portrait_index_name, doc_type=portrait_index_type, body={"ids":social_sensors}, fields=SOCIAL_SENSOR_INFO)['docs']
        for item in search_results:
            temp = []
            if item['found']:
                for iter_item in SOCIAL_SENSOR_INFO:
                    if iter_item == "topic_string":
                        temp.append(item["fields"][iter_item][0].split('&'))
                    elif iter_item == "activeness":
                        temp.append(math.log(item['fields']['activeness'][0]/float(top_activeness)*9+1, 10)*100)
                    elif iter_item == "importance":
                        temp.append(math.log(item['fields']['importance'][0]/float(top_importance)*9+1, 10)*100)
                    elif iter_item == "influence":
                        temp.append(math.log(item['fields']['influence'][0]/float(top_influence)*9+1, 10)*100)
                    else:
                        temp.append(item["fields"][iter_item][0])
                portrait_detail.append(temp)
        # assumes SOCIAL_SENSOR_INFO[5] is the desired ranking column — TODO confirm
        portrait_detail = sorted(portrait_detail, key=lambda x:x[5], reverse=True)
    all_weibo_list = []
    origin_weibo_list = []      # per-point original-weibo counts
    retweeted_weibo_list = []   # per-point retweeted-weibo counts
    important_user_set = set()  # important users accumulated over all points
    out_portrait_users = set()  # users not in the portrait library
    ts = int(ts)
    time_series = history_status
    # get detail task information from es
    if time_series:
        flow_detail = es.mget(index=index_sensing_task, doc_type=_id, body={"ids": time_series})['docs']
    else:
        flow_detail = {}
    if flow_detail:
        for item in flow_detail:
            item = item['_source']
            timestamp = item['timestamp']  # kept for potential burst analysis
            origin_weibo_list.append(item["origin_weibo_number"]) # real
            retweeted_weibo_list.append(item['retweeted_weibo_number']) # real
            all_weibo_list.append(item["origin_weibo_number"]+item['retweeted_weibo_number'])
            temp_important_user_list = json.loads(item['important_users'])
            unfiltered_users = json.loads(item['unfilter_users'])
            temp_out_portrait_users = set(unfiltered_users) - set(temp_important_user_list) # not in portrait
            important_user_set = important_user_set | set(temp_important_user_list)
            out_portrait_users = out_portrait_users | set(temp_out_portrait_users)
            count += 1
    # fetch the important users' personal information
    important_uid_list = list(important_user_set)
    out_portrait_users_list = list(out_portrait_users)
    social_sensor_set = set(social_sensors)
    user_detail_info = []
    out_user_detail_info = []  # FIX: initialization was commented out -> NameError
    if important_uid_list:
        user_results = es.mget(index=portrait_index_name, doc_type=portrait_index_type, body={"ids":important_uid_list}, fields=['uid', 'uname', 'domain', 'topic_string', "photo_url", 'importance', 'influence', 'activeness'])['docs']
        for item in user_results:
            if item['found']:
                temp = []
                temp.append(item['fields']['uid'][0])
                uname = item['fields']['uname'][0]
                # fall back to the uid when the name is empty or the "unknown" placeholder
                if not uname or uname == "未知":
                    uname = item['fields']['uid'][0]
                temp.append(uname)
                temp.append(item['fields']['photo_url'][0])
                temp.append(item['fields']['domain'][0])
                temp.append(item['fields']['topic_string'][0].split('&'))
                temp.append(math.log(item['fields']['importance'][0]/float(top_importance)*9+1, 10)*100)
                temp.append(math.log(item['fields']['influence'][0]/float(top_influence)*9+1, 10)*100)
                temp.append(math.log(item['fields']['activeness'][0]/float(top_activeness)*9+1, 10)*100)
                if item['fields']['uid'][0] in social_sensor_set:
                    temp.append(1)  # flag: this important user is also a sensor
                else:
                    temp.append(0)
                user_detail_info.append(temp)
    # rank important users by normalized influence (column 6)
    if user_detail_info:
        user_detail_info = sorted(user_detail_info, key=lambda x:x[6], reverse=True)
    else:
        user_detail_info = []
    if out_portrait_users_list:
        profile_results = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids":out_portrait_users_list})["docs"]
        bci_index = "bci_" + ts2datetime(ts-DAY).replace('-','')
        influence_results = es.mget(index=bci_index, doc_type="bci", body={"ids":out_portrait_users_list}, fields=["user_index"])['docs']
        bci_results = es_profile.mget(index="bci_history", doc_type="bci", body={"ids":out_portrait_users_list}, fields=['user_fansnum'])['docs']
        top_influence = get_top_all_influence("user_index", ts)
        count = 0
        if profile_results:
            for item in profile_results:
                temp = []
                if item['found']:
                    temp.append(item['_source']['uid'])
                    if item['_source']['nick_name']:
                        temp.append(item['_source']['nick_name'])
                    else:
                        temp.append(item['_source']['uid'])
                    temp.append(item['_source']['user_location'])
                else:
                    # profile missing: fall back to the raw id and a blank location
                    temp.append(item['_id'])
                    temp.append(item['_id'])
                    temp.extend([''])
                try:
                    user_fansnum = bci_results[count]["fields"]["user_fansnum"][0]
                except:
                    user_fansnum = 0
                temp.append(user_fansnum)
                temp_influ = influence_results[count]
                if temp_influ.get('found', 0):
                    user_index = temp_influ['fields']['user_index'][0]
                    temp.append(math.log(user_index/float(top_influence)*9+1, 10)*100)
                else:
                    temp.append(0)
                count += 1
                out_user_detail_info.append(temp)
    # rank out-of-portrait users by normalized influence (column 4)
    if out_user_detail_info:
        out_user_detail_info = sorted(out_user_detail_info, key=lambda x:x[4], reverse=True)
    revise_time_series = []
    for item in time_series:
        revise_time_series.append(ts2date_min(item))
    results['important_user_detail'] = user_detail_info
    results['out_portrait_user_detail'] = out_user_detail_info
    results['time_series'] = revise_time_series
    results['all_weibo_list'] = all_weibo_list
    results['origin_weibo_list'] = origin_weibo_list
    results['retweeted_weibo_list'] = retweeted_weibo_list
    results['social_sensors_detail'] = portrait_detail
    return results
def search_group_sentiment_weibo(task_name, start_ts, sentiment, submit_user):
    """Search one day of weibo with a given sentiment posted by a group's users.

    NOTE(review): this is a duplicate definition — an earlier identical copy
    exists in this file; being later, this one is the binding callers get.

    task_name/submit_user identify the group task ("submit_user-task_name" is
    the es document id). start_ts is the day-start timestamp; the search range
    is [start_ts, start_ts + DAY). sentiment is a string code — '2' expands to
    the SENTIMENT_SECOND code set, any other value is matched exactly.
    Returns 'task name invalid' / 'task uid list null' on lookup failure,
    otherwise a list of weibo dicts sorted by timestamp ascending.

    Fix in this revision: uids missing from user_portrait are now mapped to
    'unknown' (as in the earlier copy); previously a weibo from an unmatched
    uid raised KeyError in the result loop.
    """
    weibo_list = []
    task_id = submit_user + '-' + task_name
    #step1:get task_name uid
    try:
        group_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,\
                id=task_id, _source=False, fields=['uid_list'])
    except:
        group_result = {}
    if group_result == {}:
        return 'task name invalid'
    try:
        uid_list = group_result['fields']['uid_list']
    except:
        uid_list = []
    if uid_list == []:
        return 'task uid list null'
    #step3: get ui2uname (uid -> display name, 'unknown' when not in portrait)
    uid2uname = {}
    try:
        user_portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,\
                body={'ids':uid_list}, _source=False, fields=['uname'])['docs']
    except:
        user_portrait_result = []
    for item in user_portrait_result:
        uid = item['_id']
        if item['found'] == True:
            uname = item['fields']['uname'][0]
            uid2uname[uid] = uname
        else:
            # fix: missing users used to be skipped, causing KeyError below
            uid2uname[uid] = 'unknown'
    #step4:iter date to search weibo
    weibo_list = []
    iter_date = ts2datetime(start_ts)
    flow_text_index_name = flow_text_index_name_pre + str(iter_date)
    #step4: get query_body
    if sentiment != '2':
        query_body = [{'terms': {'uid': uid_list}}, {'term':{'sentiment': sentiment}}, \
                {'range':{'timestamp':{'gte':start_ts, 'lt': start_ts+DAY}}}]
    else:
        # '2' stands for the whole second-level sentiment group
        query_body = [{'terms':{'uid':uid_list}}, {'terms':{'sentiment': SENTIMENT_SECOND}},\
                {'range':{'timestamp':{'gte':start_ts, 'lt':start_ts+DAY}}}]
    try:
        flow_text_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\
                body={'query':{'bool':{'must': query_body}}, 'sort': [{'timestamp':{'order':'asc'}}], 'size': MAX_VALUE})['hits']['hits']
    except:
        flow_text_result = []
    for flow_text_item in flow_text_result:
        source = flow_text_item['_source']
        weibo = {}
        weibo['uid'] = source['uid']
        weibo['uname'] = uid2uname[weibo['uid']]
        weibo['ip'] = source['ip']
        try:
            # geo is stored '&'-separated; render it tab-separated
            weibo['geo'] = '\t'.join(source['geo'].split('&'))
        except:
            weibo['geo'] = ''
        weibo['text'] = source['text']
        weibo['timestamp'] = source['timestamp']
        weibo['sentiment'] = source['sentiment']
        weibo_list.append(weibo)
    return weibo_list
def get_social_inter_content(uid1, uid2, type_mark):
    """Collect the last week of interaction weibo between two users.

    Always includes weibo from uid1 directed at uid2; when type_mark is 'out'
    the reverse direction (uid2 -> uid1) is included as well. Returns a list
    of dicts (timestamp/ip/geo/text/uid/uname/directed_uid/directed_uname),
    walking the 7 daily indexes oldest-first with ascending time inside each.
    """
    collected = []
    # Anchor "today" on the real clock in production, on the test date otherwise.
    current_ts = int(time.time())
    if RUN_TYPE == 1:
        today_ts = datetime2ts(ts2datetime(current_ts))
    else:
        today_ts = datetime2ts(RUN_TEST_TIME)
    # Resolve display names for both endpoints ('unknown' when not in portrait).
    name_of = {}
    try:
        portrait_docs = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                              body={'ids': [uid1, uid2]}, _source=False,
                                              fields=['uid', 'uname'])['docs']
    except:
        portrait_docs = []
    for doc in portrait_docs:
        if doc['found'] == True:
            name_of[doc['_id']] = doc['fields']['uname'][0]
        else:
            name_of[doc['_id']] = 'unknown'
    # One flow_text index per day; walk the previous 7 days, oldest first.
    for day_offset in range(7, 0, -1):
        day_date = ts2datetime(today_ts - day_offset * DAY)
        index_name = flow_text_index_name_pre + str(day_date)
        direction_clauses = [{'bool': {'must': [{'term': {'uid': uid1}},
                                                {'term': {'directed_uid': int(uid2)}}]}}]
        if type_mark == 'out':
            direction_clauses.append({'bool': {'must': [{'term': {'uid': uid2}},
                                                        {'term': {'directed_uid': int(uid1)}}]}})
        try:
            hits = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,
                                       body={'query': {'bool': {'should': direction_clauses}},
                                             'sort': [{'timestamp': {'order': 'asc'}}],
                                             'size': MAX_VALUE})['hits']['hits']
        except:
            hits = []
        for hit in hits:
            src = hit['_source']
            directed = str(src['directed_uid'])
            collected.append({
                'timestamp': src['timestamp'],
                'ip': src['ip'],
                'geo': src['geo'],
                'text': '\t'.join(src['text'].split('&')),
                'uid': src['uid'],
                'uname': name_of[src['uid']],
                'directed_uid': directed,
                'directed_uname': name_of[directed],
            })
    return collected