def retweet_dict2results(uid, item_results):
    results = []
    uid_list = []
    sort_list = []
    # sort by key (uid) descending and keep at most the top 100 other users
    sorted_list = sorted(item_results.iteritems(), key=lambda x: x[0], reverse=True)
    count = 0
    for key, value in sorted_list:
        if key == 'None' or key == uid:
            continue
        count += 1
        uid_list.append(key)
        sort_list.append(value)
        if count == 100:
            break
    # look up profile information
    if uid_list:
        profile_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                              body={"ids": uid_list})["docs"]
        for index, item in enumerate(profile_result):  # enumerate avoids list.index(), which mis-indexes duplicates
            _id = item['_id']
            tmp = []
            if item['found']:
                item = item['_source']
                tmp.append(item['uid'])
                tmp.append(item['nick_name'])
            else:
                tmp.extend([_id, ''])
            tmp.append(sort_list[index])
            results.append(tmp)
    return results
def get_user_profile_weibo(user_list):
    user_info_dict = {}
    try:
        user_profile_dict = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                                 body={'ids': user_list})['docs']
    except:
        user_profile_dict = []
    if user_profile_dict:
        for user_dict in user_profile_dict:
            source_dict = {}  # was referenced without ever being initialized
            if user_dict['found'] == True:
                source = user_dict['_source']
                source_dict['uid'] = source['uid']
                source_dict['uname'] = source['nick_name']
                source_dict['location'] = source['location']
                source_dict['photo_url'] = source['photo_url']
                source_dict['fansnum'] = source['fansnum']
                source_dict['friendsnum'] = source['friendsnum']
                source_dict['statusnum'] = source['statusnum']
                source_dict['description'] = source['description']
            else:
                source_dict['uid'] = user_dict['_id']  # '_source' does not exist when the doc is not found
                source_dict['uname'] = 'unknown'
                source_dict['location'] = 'unknown'
                source_dict['photo_url'] = ''
                source_dict['fansnum'] = 0
                source_dict['friendsnum'] = 0
                source_dict['statusnum'] = 0
                source_dict['description'] = ''
            user_info_dict[user_dict['_id']] = source_dict
    return user_info_dict
def retweet_dict2results(uid, item_results):
    results = []
    uid_list = []
    sort_list = []
    for key in item_results:
        if key == uid:
            continue
        uid_list.append(key)
        sort_list.append(item_results[key])
    # look up profile information
    if uid_list:
        profile_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                              body={"ids": uid_list})["docs"]
        for index, item in enumerate(profile_result):
            _id = item['_id']
            tmp = []
            if item['found']:
                item = item['_source']
                tmp.append(item['uid'])
                tmp.append(item['nick_name'])
            else:
                tmp.extend([_id, ''])
            tmp.append(sort_list[index])
            results.append(tmp)
    return results
def query_vary_top_k(index_name, doctype, top_k, sort_index="vary"):
    query_body = {
        "query": {"match_all": {}},
        "size": top_k,
        "sort": [{sort_index: {"order": "desc"}}]
    }
    result = es.search(index=index_name, doc_type=doctype, body=query_body)['hits']['hits']
    uid_list = []
    for item in result:
        uid_list.append(item['_id'])
    portrait_result = es_portrait.mget(index="user_portrait", doc_type="user",
                                       body={"ids": uid_list}, _source=True)['docs']
    profile_result = es_profile.mget(index="weibo_user", doc_type="user",
                                     body={"ids": uid_list}, _source=True)['docs']
    return_list = []
    rank = 1
    for i in range(len(result)):
        info = ['', '', '', '', '']
        info[0] = rank
        if profile_result[i]['found']:
            info[1] = profile_result[i]['_source'].get('photo_url', '')
            info[3] = profile_result[i]['_source'].get('nick_name', '')
        info[2] = result[i].get('_id', '')
        info[4] = result[i]['_source']['vary']
        if portrait_result[i]['found']:
            info.append('1')
        else:
            info.append('0')
        return_list.append(info)
        rank += 1
    return return_list
def search_portrait_user(es, number, active_index, active_type, portrait_index, portrait_type, field="user_index"):
    return_list = []
    index_exist = es.indices.exists(index=active_index)
    if not index_exist:
        return "no active_index exist"
    count_c = 0
    start = 0
    rank = 1
    while 1:
        search_list = []
        user_list = search_k(es, active_index, active_type, start, field, 100)
        if not user_list:  # no more candidates; avoid looping forever
            return return_list
        start += 100
        for item in user_list:
            if field == "vary":
                uid = item.get('uid', '0')  # obtain uid, notice "uid" or "user"
            else:
                uid = item.get('user', '0')
            search_list.append(uid)  # uid list
        search_result = es_portrait.mget(index=portrait_index, doc_type=portrait_type,
                                         body={"ids": search_list}, _source=True)["docs"]
        profile_result = es_profile.mget(index="weibo_user", doc_type="user",
                                         body={"ids": search_list}, _source=True)["docs"]
        for index, item in enumerate(search_result):
            if item["found"]:
                info = ['', '', '', '', '', '']
                info[0] = rank
                if profile_result[index]['found']:
                    info[1] = profile_result[index]['_source'].get('photo_url', '')
                    info[3] = profile_result[index]['_source'].get('nick_name', '')
                info[2] = search_result[index].get('_id', '')
                info[4] = user_list[index][field]
                info[5] = "1"
                return_list.append(info)
                rank += 1
                count_c += 1
                if count_c >= int(number):
                    return return_list
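# NOTE: search_k is called throughout this module but not defined in it. The
# sketch below is an assumption reconstructed from the call sites (es client,
# index, doc type, pagination offset, sort field, page size) and from the fact
# that callers read plain field dicts off the returned items; the project's
# real helper may differ.
def search_k(es, index_name, doc_type, start, sort_field, size):
    # page through an index, sorted by sort_field descending
    query_body = {
        "query": {"match_all": {}},
        "sort": [{sort_field: {"order": "desc"}}],
        "from": start,
        "size": size
    }
    hits = es.search(index=index_name, doc_type=doc_type, body=query_body)['hits']['hits']
    return [hit['_source'] for hit in hits]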
def get_user_detail(date, input_result, status):
    results = []
    if status == 'show_in':
        uid_list = input_result
    if status == 'show_compute':
        uid_list = input_result.keys()
    if status == 'show_in_history':
        uid_list = input_result.keys()
    if date != 'all':
        index_name = 'bci_' + ''.join(date.split('-'))
    else:
        now_ts = time.time()
        now_date = ts2datetime(now_ts)
        index_name = 'bci_' + ''.join(now_date.split('-'))
    index_type = 'bci'
    user_bci_result = es_cluster.mget(index=index_name, doc_type=index_type,
                                      body={'ids': uid_list}, _source=True)['docs']
    user_profile_result = es_user_profile.mget(index='weibo_user', doc_type='user',
                                               body={'ids': uid_list}, _source=True)['docs']
    max_evaluate_influ = get_evaluate_max(index_name)
    for i in range(0, len(uid_list)):
        uid = uid_list[i]
        bci_dict = user_bci_result[i]
        profile_dict = user_profile_result[i]
        try:
            bci_source = bci_dict['_source']
        except:
            bci_source = None
        if bci_source:
            influence = bci_source['user_index']
            influence = math.log(influence / max_evaluate_influ['user_index'] * 9 + 1, 10)
            influence = influence * 100
        else:
            influence = ''
        try:
            profile_source = profile_dict['_source']
        except:
            profile_source = None
        if profile_source:
            uname = profile_source['nick_name']
            location = profile_source['user_location']
            fansnum = profile_source['fansnum']
            statusnum = profile_source['statusnum']
        else:
            uname = ''
            location = ''
            fansnum = ''
            statusnum = ''
        if status == 'show_in':
            results.append([uid, uname, location, fansnum, statusnum, influence])
        if status == 'show_compute':
            in_date = json.loads(input_result[uid])[0]
            compute_status = json.loads(input_result[uid])[1]
            if compute_status == '1':
                compute_status = '3'
            results.append([uid, uname, location, fansnum, statusnum, influence, in_date, compute_status])
        if status == 'show_in_history':
            in_status = input_result[uid]
            results.append([uid, uname, location, fansnum, statusnum, influence, in_status])
    return results
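# NOTE: get_evaluate_max is not defined in this file. Its call sites expect a
# dict of per-field maxima used as normalization denominators: here it is called
# with a daily bci index and read for 'user_index'; identify_user_portrait below
# calls it with no argument and reads portrait fields instead, so the real
# project likely has more than one variant. A minimal sketch for the bci case:
def get_evaluate_max(index_name):
    max_result = {}
    for field in ['user_index']:
        query_body = {
            "query": {"match_all": {}},
            "size": 1,
            "sort": [{field: {"order": "desc"}}]
        }
        hits = es_cluster.search(index=index_name, doc_type='bci', body=query_body)['hits']['hits']
        max_result[field] = hits[0]['_source'][field] if hits else 1
    return max_result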
def get_recommentation(submit_user):
    if RUN_TYPE:
        now_ts = time.time()
    else:
        now_ts = datetime2ts(RUN_TEST_TIME)
    in_portrait_set = set(r.hkeys("compute"))
    result = []
    for i in range(7):
        iter_ts = now_ts - i * DAY
        iter_date = ts2datetime(iter_ts)
        submit_user_recomment = "recomment_" + submit_user + "_" + str(iter_date)
        bci_date = ts2datetime(iter_ts - DAY)
        submit_user_recomment = r.hkeys(submit_user_recomment)
        bci_index_name = "bci_" + bci_date.replace('-', '')
        exist_bool = es_cluster.indices.exists(index=bci_index_name)
        if not exist_bool:
            continue
        if submit_user_recomment:
            user_bci_result = es_cluster.mget(index=bci_index_name, doc_type="bci",
                                              body={'ids': submit_user_recomment}, _source=True)['docs']
            user_profile_result = es_user_profile.mget(index='weibo_user', doc_type='user',
                                                       body={'ids': submit_user_recomment}, _source=True)['docs']
            max_evaluate_influ = get_evaluate_max(bci_index_name)
            for j in range(len(submit_user_recomment)):  # was `i`, which shadowed the outer loop variable
                uid = submit_user_recomment[j]
                bci_dict = user_bci_result[j]
                profile_dict = user_profile_result[j]
                try:
                    bci_source = bci_dict['_source']
                except:
                    bci_source = None
                if bci_source:
                    influence = bci_source['user_index']
                    influence = math.log(influence / max_evaluate_influ['user_index'] * 9 + 1, 10)
                    influence = influence * 100
                else:
                    influence = ''
                try:
                    profile_source = profile_dict['_source']
                except:
                    profile_source = None
                if profile_source:
                    uname = profile_source['nick_name']
                    location = profile_source['user_location']
                    fansnum = profile_source['fansnum']
                    statusnum = profile_source['statusnum']
                else:
                    uname = ''
                    location = ''
                    fansnum = ''
                    statusnum = ''
                if uid in in_portrait_set:
                    in_portrait = "1"
                else:
                    in_portrait = "0"
                result.append([iter_date, uid, uname, location, fansnum, statusnum, influence, in_portrait])
    return result
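# NOTE: the expression math.log(value / max_value * 9 + 1, 10) * 100 recurs in
# several functions in this module. It maps a raw index in [0, max_value] onto
# [0, 100] on a log10 scale (0 -> 0, max_value -> 100). The helper below is not
# part of the original module; it only names the repeated pattern.
import math

def normalize_index(value, max_value):
    return math.log(float(value) / max_value * 9 + 1, 10) * 100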
def search_portrait_user_in_activity(es, number, active_index, active_type, portrait_index, portrait_type, field="user_index"):
    return_list = []
    index_exist = es.indices.exists(index=active_index)
    if not index_exist:
        return "no active_index exist"
    count_c = 0
    start = 0
    rank = 1
    while 1:
        search_list = []
        user_list = search_k(es, active_index, active_type, start, field, 100)
        if not user_list:
            return return_list
        start += 100
        for item in user_list:
            if field == "vary":
                uid = item.get('uid', '0')  # obtain uid, notice "uid" or "user"
            else:
                uid = item.get('user', '0')
            search_list.append(uid)  # uid list
        search_result = es_portrait.mget(index=portrait_index, doc_type=portrait_type,
                                         body={"ids": search_list}, _source=True)["docs"]
        profile_result = es_profile.mget(index="weibo_user", doc_type="user",
                                         body={"ids": search_list}, _source=True)["docs"]
        # the two *_comment_brust_average entries read *_retweeted_brust_average in
        # the original, almost certainly a copy-paste slip; corrected here
        key_list = ["origin_weibo_retweeted_total_number", "origin_weibo_retweeted_average_number",
                    "origin_weibo_retweeted_top_number", "origin_weibo_retweeted_brust_average",
                    "origin_weibo_comment_total_number", "origin_weibo_comment_average_number",
                    "origin_weibo_comment_top_number", "origin_weibo_comment_brust_average",
                    "retweeted_weibo_retweeted_total_number", "retweeted_weibo_retweeted_average_number",
                    "retweeted_weibo_retweeted_top_number", "retweeted_weibo_retweeted_brust_average",
                    "retweeted_weibo_comment_total_number", "retweeted_weibo_comment_average_number",
                    "retweeted_weibo_comment_top_number", "retweeted_weibo_comment_brust_average"]
        for index, item in enumerate(search_result):
            if item["found"]:
                info = ['', '', '', '', '', '']
                info[0] = rank
                if profile_result[index]['found']:
                    info[1] = profile_result[index]['_source'].get('photo_url', '')
                    info[3] = profile_result[index]['_source'].get('nick_name', '')
                info[2] = search_result[index].get('_id', '')
                info[4] = user_list[index]['user_index']
                info[5] = "1"
                if field == 'origin_weibo_retweeted_brust_average':
                    info.append(user_list[index]['origin_weibo_retweeted_brust_average'])
                    for key in key_list:
                        info.append(user_list[index][key])
                elif field == 'origin_weibo_comment_brust_average':
                    info.append(user_list[index]['origin_weibo_comment_brust_average'])
                    for key in key_list:
                        info.append(user_list[index][key])
                else:
                    pass
                return_list.append(info)
                rank += 1
                count_c += 1
                if count_c >= int(number):
                    return return_list
def search_user_info(es, index_name, doc_type, uid, result_name):
    try:
        retweet_result = es.get(index=index_name, doc_type=doc_type, id=uid)['_source']
    except:
        return None
    if retweet_result:
        retweet_dict = json.loads(retweet_result[result_name])
        sorted_list = sorted(retweet_dict.iteritems(), key=lambda x: x[1], reverse=True)[:20]
        uid_list = [i[0] for i in sorted_list if i[0] != uid]
        try:
            user_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                               body={'ids': uid_list})['docs']
        except:
            user_result = []
        try:
            bci_history_result = es_bci_history.mget(index=bci_history_index_name, doc_type=bci_history_index_type,
                                                     body={'ids': uid_list}, fields=fields)['docs']
        except:
            bci_history_result = []
        iter_count = 0
        out_portrait_list = []
        for out_user_item in user_result:
            uid = out_user_item['_id']
            if out_user_item['found'] == True:
                source = out_user_item['_source']
                uname = source['nick_name']
                photo_url = source['photo_url']
                if uname == '':
                    uname = u'未知'
                friendsnum = source['friendsnum']
            else:
                uname = u'未知'
                friendsnum = ''
                photo_url = 'unknown'
            # add indices from bci_history
            try:
                bci_history_item = bci_history_result[iter_count]
            except:
                bci_history_item = {'found': False}
            if bci_history_item['found'] == True:
                fansnum = bci_history_item['fields'][fields[0]][0]
                user_weibo_count = bci_history_item['fields'][fields[1]][0]
                user_friendsnum = bci_history_item['fields'][fields[2]][0]
                influence = bci_history_item['fields'][fields[3]][0]
            else:
                fansnum = ''
                user_weibo_count = ''
                user_friendsnum = ''
                influence = ''
            count = retweet_dict[uid]
            out_portrait_list.append({'uid': uid, 'photo_url': photo_url, 'count': count, 'uname': uname,
                                      'influence': influence, 'fansnum': fansnum,
                                      'friendsnum': user_friendsnum, 'weibo_count': user_weibo_count})
            iter_count += 1
        return out_portrait_list
    else:
        return None
def show_keywords_rank(task_id, sort_type, count):
    try:
        task_found = es_network_task.get(index=network_keywords_index_name,
                                         doc_type=network_keywords_index_type, id=task_id)['_source']
    except:
        task_found = {}
        return task_found  # bail out early when the task does not exist
    search_results = json.loads(task_found['results'])
    sort_results = search_results[sort_type]
    results = []
    uid_list = []
    sort_list = []
    for source_uid, sort_value in sort_results:
        uid_list.append(source_uid)
        sort_list.append(sort_value)
    # look up profile information
    if uid_list:
        profile_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                              body={"ids": uid_list})["docs"]
        for index, item in enumerate(profile_result):
            _id = item['_id']
            tmp = []
            if item['found']:
                item = item['_source']
                tmp.append(item['uid'])
                tmp.append(item['nick_name'])
                tmp.append(item['user_location'])
            else:
                tmp.extend([_id, '', ''])
            tmp.append(sort_list[index])
            results.append(tmp)
    if uid_list:
        count = 0
        history_result = es_bci_history.mget(index=bci_history_index_name, doc_type=bci_history_index_type,
                                             body={"ids": uid_list})["docs"]
        for item in history_result:
            if item['found']:
                item = item['_source']
                results[count].extend([item['user_fansnum'], item['weibo_month_sum']])
            else:
                results[count].extend(['', ''])
            count += 1
    if uid_list:
        count = 0
        portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                                body={"ids": uid_list})["docs"]
        for item in portrait_result:
            if item['found']:
                results[count].append("1")
            else:
                results[count].append("0")
            count += 1
    return results
def search_top_index(index_name, top_k=1, index_type="bci", top=False, sort_order="user_index"):
    query_body = {
        "query": {"match_all": {}},
        "size": top_k,
        "sort": [{sort_order: {"order": "desc"}}]
    }
    if top:
        result = es.search(index=index_name, doc_type=index_type, body=query_body)['hits']['hits'][0]['_source'][sort_order]
    else:
        search_result = es.search(index=index_name, doc_type=index_type, body=query_body)['hits']['hits']
        uid_list = []
        for item in search_result:
            uid_list.append(item['_id'])
        profile_result = es_profile.mget(index="weibo_user", doc_type="user",
                                         body={"ids": uid_list}, _source=True)['docs']
        portrait_result = es_portrait.mget(index="user_portrait", doc_type="user",
                                           body={"ids": uid_list}, _source=True)['docs']
        result = []
        rank = 1
        for i in range(len(search_result)):
            info = ['', '', '', '']
            info[0] = rank
            if profile_result[i]['found']:
                info[1] = profile_result[i]['_source'].get('photo_url', '')
                info[3] = profile_result[i]['_source'].get('nick_name', '')
            info[2] = search_result[i].get('_id', '')
            if sort_order in ["user_index", "origin_weibo_retweeted_brust_average", "origin_weibo_comment_brust_average"]:
                info.append(search_result[i]['_source'][sort_order])
                if portrait_result[i]['found']:
                    info.append("1")
                else:
                    info.append("0")
            elif sort_order == "origin_weibo_retweeted_top_number":
                info.append(search_result[i]['_source']['origin_weibo_retweeted_top_number'])
                mid = search_result[i]['_source']['origin_weibo_top_retweeted_id']
                info.append(weiboinfo2url(info[2], mid))
                if portrait_result[i]['found']:
                    info.append("1")
                else:
                    info.append("0")
            elif sort_order == "origin_weibo_comment_top_number":
                info.append(search_result[i]['_source']['origin_weibo_comment_top_number'])
                mid = search_result[i]['_source']['origin_weibo_top_comment_id']
                info.append(weiboinfo2url(info[2], mid))
                if portrait_result[i]['found']:
                    info.append("1")
                else:
                    info.append("0")
            rank += 1
            result.append(info)
    return result
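# NOTE: weiboinfo2url is not defined in this file. From its call sites it turns
# a (uid, mid) pair into a weibo.com status URL. Real deployments usually
# base62-encode the mid first; the sketch below assumes the mid is already
# URL-ready, so treat it as an illustration rather than the project's helper.
def weiboinfo2url(uid, mid):
    return 'http://weibo.com/' + str(uid) + '/' + str(mid)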
def show_daily_rank(period, sort_type, count):
    index_name = 'user_portrait_network'
    index_type = 'network'
    if len(sort_type.split('_')) > 1:
        sort = 'rank_' + sort_type + '_' + str(period)  # e.g. rank_pr_0
    else:
        sort = sort_type + '_' + str(period)  # e.g. pr_0
    query_body = {
        'sort': [{sort: {'order': 'desc'}}],
        'size': count
    }
    try:
        search_results = es_network_task.search(index=index_name, doc_type=index_type,
                                                body=query_body)['hits']['hits']
    except:
        search_results = []
    results = []
    uid_list = []
    sort_list = []
    for item in search_results:
        source = item['_source']
        if sort in source:
            uid_list.append(source['uid'])
            sort_list.append(source[sort])
    # look up profile information
    if uid_list:
        profile_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                              body={"ids": uid_list})["docs"]
        for index, item in enumerate(profile_result):
            _id = item['_id']
            tmp = []
            if item['found']:
                item = item['_source']
                tmp.append(item['uid'])
                tmp.append(item['nick_name'])
                tmp.append(item['statusnum'])
                tmp.append(item['user_location'])
                tmp.append(item['fansnum'])
            else:
                tmp.extend([_id, '', '', '', ''])
            tmp.append(sort_list[index])
            results.append(tmp)
    if uid_list:
        count = 0
        portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                                body={"ids": uid_list})["docs"]
        for item in portrait_result:
            if item['found']:
                results[count].append("1")
            else:
                results[count].append("0")
            count += 1
    return results
def search_tag(es, number, active_index, active_type, portrait_index, portrait_type, tag):
    # tag is matched against the portrait 'domain' field, e.g. "art"
    return_list = []
    count_s = 0
    start = 0
    rank = 1
    while 1:
        search_list = []
        user_list = search_k(es, active_index, active_type, start, "user_index", 10000)
        if not user_list:
            return return_list
        start += 10000
        for item in user_list:
            uid = item.get('user', '0')
            search_list.append(uid)  # uid list
        search_result = es_portrait.mget(index=portrait_index, doc_type=portrait_type,
                                         body={"ids": search_list}, _source=True)["docs"]
        profile_result = es_profile.mget(index="weibo_user", doc_type="user",
                                         body={"ids": search_list}, _source=True)["docs"]
        for index, item in enumerate(search_result):
            count_s += 1
            if item['found'] and tag in item['_source']['domain']:
                info = ['', '', '', '', '', '', '']
                info[0] = rank
                if profile_result[index]['found']:
                    info[1] = profile_result[index]['_source'].get('photo_url', '')
                    info[3] = profile_result[index]['_source'].get('nick_name', '')
                info[2] = search_result[index].get('_id', '')
                info[4] = user_list[index]['user_index']
                info[5] = search_result[index]['_source'].get('activeness', '')
                info[6] = search_result[index]['_source'].get('importance', '')
                rank += 1
                return_list.append(info)
                if rank >= int(number) + 1:
                    return return_list
        if count_s > 100000:
            return return_list
def search_max_single_field(field, index_name, doctype, top_k=3):
    # field is one of "origin_weibo_retweeted_top_number", "origin_weibo_comment_top_number"
    return_list = []
    rank = 1
    start = 0
    while 1:
        search_list = []
        user_list = search_k(es, index_name, doctype, start, field, 100)
        if not user_list:  # no more candidates; avoid looping forever
            return return_list
        start += 100
        for item in user_list:
            uid = item.get('user', '0')
            search_list.append(uid)  # uid list
        search_result = es_portrait.mget(index="user_portrait", doc_type="user",
                                         body={"ids": search_list}, _source=True)["docs"]
        profile_result = es_profile.mget(index="weibo_user", doc_type="user",
                                         body={"ids": search_list}, _source=True)["docs"]
        for i in range(len(search_result)):
            if search_result[i]['found']:
                info = ['', '', '', '', '', '', '1']
                info[0] = rank
                info[2] = search_result[i].get('_id', '')
                if profile_result[i]['found']:
                    info[1] = profile_result[i]['_source'].get('photo_url', '')
                    info[3] = profile_result[i]['_source'].get('nick_name', '')
                if 'retweeted' in field:
                    temp_mid = user_list[i]['origin_weibo_top_retweeted_id']
                    info[5] = weiboinfo2url(info[2], temp_mid)
                    info[4] = user_list[i]['origin_weibo_retweeted_top_number']
                else:
                    temp_mid = user_list[i]['origin_weibo_top_comment_id']
                    info[5] = weiboinfo2url(info[2], temp_mid)
                    info[4] = user_list[i]['origin_weibo_comment_top_number']
                rank += 1
                return_list.append(info)
                if rank >= int(top_k) + 1:
                    return return_list
def search_yangshi_attention(uid, top_count):
    now_ts = time.time()
    db_number = get_db_num(now_ts)
    index_name = retweet_index_name_pre + str(db_number)
    try:
        retweet_result = es_retweet.get(index=index_name, doc_type=retweet_index_type, id=uid)['_source']
    except:
        return None
    if retweet_result:
        retweet_dict = json.loads(retweet_result['uid_retweet'])
        sorted_list = sorted(retweet_dict.iteritems(), key=lambda x: x[1], reverse=True)[:20]
        uid_list = [i[0] for i in sorted_list]
        try:
            user_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                               body={'ids': uid_list})['docs']
        except:
            user_result = []
        out_portrait_list = []
        for out_user_item in user_result:
            uid = out_user_item['_id']
            if out_user_item['found'] == True:
                source = out_user_item['_source']
                uname = source['nick_name']
                if uname == '':
                    uname = u'未知'
            else:
                uname = u'未知'
            count = retweet_dict[uid]
            out_portrait_list.append({'uid': uid, 'count': count, 'uname': uname})
        return out_portrait_list
    else:
        return None
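# NOTE: get_db_num is not defined in this file. The retweet/comment relation
# data is sharded into numbered indices (retweet_index_name_pre + str(db_number))
# and the shard apparently rotates with the date. A minimal sketch, assuming a
# fixed rotation period and epoch (both hypothetical):
def get_db_num(timestamp):
    R_BEGIN_TS = 1377964800   # hypothetical epoch of shard 1
    DAYS_PER_SHARD = 7        # hypothetical rotation period
    return ((int(timestamp) - R_BEGIN_TS) / (DAYS_PER_SHARD * 24 * 3600)) % 2 + 1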
def identify_user_out(input_uid_list):
    out_user_list = []
    in_user_list = []
    input_len = len(input_uid_list)
    iter_count = 0
    print 'identify user out'
    # get users who are outside user_portrait
    while iter_count < input_len:
        iter_user_list = input_uid_list[iter_count: iter_count + DETECT_ITER_COUNT]
        try:
            portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                                    body={'ids': iter_user_list}, _source=False)['docs']
        except:
            portrait_result = []
        for item in portrait_result:
            uid = item['_id']
            if item['found'] != True:
                out_user_list.append(uid)
            else:
                in_user_list.append(uid)
        iter_count += DETECT_ITER_COUNT
    print 'get out user portrait information'
    # get profile information for users outside user_portrait
    iter_count = 0
    out_user_count = len(out_user_list)
    out_user_result = []
    while iter_count < out_user_count:
        iter_user_list = out_user_list[iter_count: iter_count + DETECT_ITER_COUNT]
        try:
            profile_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                                  body={'ids': iter_user_list}, _source=True)['docs']
        except:
            profile_result = []
        for item in profile_result:
            uid = item['_id']
            if item['found'] == True:
                source = item['_source']
                uname = source['nick_name']
                fansnum = source['fansnum']
                statusnum = source['statusnum']
                friendsnum = source['friendsnum']
            else:
                uname = u'未知'
                fansnum = u'未知'
                statusnum = u'未知'
                friendsnum = u'未知'
            out_user_result.append([uid, uname, fansnum, statusnum, friendsnum])
        iter_count += DETECT_ITER_COUNT
    # sort by fansnum descending
    sort_out_user_result = sorted(out_user_result, key=lambda x: x[2], reverse=True)
    return in_user_list, sort_out_user_result
def search_user_profile_by_user_ids(users):
    users = list(users)
    user_profile_return = dict()
    try:
        user_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                           body={'ids': users})['docs']
    except:
        user_result = []
    for out_user_item in user_result:
        if out_user_item['found']:
            uid = out_user_item['_id']
            user_profile_return[uid] = out_user_item['_source']
    return user_profile_return
def compare_user_profile(uid_list):
    results = {}
    index_name = 'weibo_user'
    index_type = 'user'
    search_results = es_user_profile.mget(index=index_name, doc_type=index_type,
                                          body={'ids': uid_list})['docs']
    for result in search_results:
        uid = result['_id']
        results[uid] = []
        try:
            item = result['_source']
        except:
            continue  # was `next`, a bare no-op expression, not a loop-control statement
        photo_url = item['photo_url']
        results[uid] = photo_url
    return results
from elasticsearch.exceptions import RequestError  # needed for the timeout handler below

def portrait_user_vary(es, number, active_index, active_type, portrait_index, portrait_type, field="vary"):
    return_list = []
    index_exist = es.indices.exists(index=active_index)
    if not index_exist:
        return "no active_index exist"
    count_c = 0
    start = 0
    rank = 1
    try:
        while 1:
            search_list = []
            user_list = search_k(es, active_index, active_type, start, field, 100)
            if not user_list:
                break
            start += 100
            for item in user_list:
                uid = item.get('uid', '0')  # obtain uid, notice "uid" or "user"
                search_list.append(uid)  # uid list
            search_result = es_portrait.mget(index="user_portrait", doc_type="user",
                                             body={"ids": search_list}, _source=True)["docs"]
            profile_result = es_profile.mget(index="weibo_user", doc_type="user",
                                             body={"ids": search_list}, _source=True)["docs"]
            for index, item in enumerate(search_result):
                count_c += 1
                if item["found"]:
                    info = ['', '', '', '', '', '1']
                    info[0] = rank
                    if profile_result[index]['found']:
                        info[1] = profile_result[index]['_source'].get('photo_url', '')
                        info[3] = profile_result[index]['_source'].get('nick_name', '')
                    info[2] = search_result[index].get('_id', '')
                    info[4] = user_list[index]['vary']
                    return_list.append(info)
                    rank += 1
                    if rank == int(number) + 1:
                        return return_list
            if count_c > 10000:
                break
    except RequestError:
        print "timeout"
    return return_list
def get_user_url(uid_list):
    results = []
    try:
        es_results = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                     body={"ids": uid_list})['docs']
    except:
        es_results = {}
    for item in es_results:
        temp = []
        if item['found']:
            temp.append(item['_source']["photo_url"])
            temp.append(item['_source']['nick_name'])
            temp.append(item['_id'])
        else:
            temp.append("unknown")
            temp.append("unknown")
            temp.append(item['_id'])
        results.append(temp)
    return results
def search_attention(uid):
    stat_results = dict()
    for db_num in R_DICT:
        r = R_DICT[db_num]
        ruid_results = r.hgetall('retweet_' + str(uid))
        if ruid_results:
            for ruid in ruid_results:
                if ruid != uid:
                    try:
                        stat_results[ruid] += int(ruid_results[ruid])  # redis returns strings
                    except KeyError:
                        stat_results[ruid] = int(ruid_results[ruid])
    if not stat_results:  # was `if not results`, which always returned early
        return [None, 0]
    try:
        sort_state_results = sorted(stat_results.items(), key=lambda x: x[1], reverse=True)[:20]
    except:
        return [None, 0]
    print 'sort_state_results:', sort_state_results
    uid_list = [item[0] for item in sort_state_results]
    es_profile_results = es_user_profile.mget(index='weibo_user', doc_type='user',
                                              body={'ids': uid_list})['docs']
    es_portrait_results = es_user_portrait.mget(index='user_portrait', doc_type='user',
                                                body={'ids': uid_list})['docs']
    result_list = []
    for i in range(len(es_profile_results)):
        item = es_profile_results[i]
        uid = item['_id']
        try:
            source = item['_source']
            uname = source['nick_name']
        except:
            uname = u'未知'
        # identify whether uid is in user_portrait
        portrait_item = es_portrait_results[i]
        try:
            source = portrait_item['_source']  # was portrait_item[i], an indexing bug
            in_status = 1
        except:
            in_status = 0
        result_list.append([uid, [uname, stat_results[uid], in_status]])
    return [result_list[:20], len(stat_results)]
def search_tag(es, number, active_index, active_type, portrait_index, portrait_type, field_dict):
    # e.g. field_dict = {"domain": "art"}
    return_list = []
    count_s = 0
    start = 0
    rank = 1
    while 1:
        search_list = []
        user_list = search_k(es, active_index, active_type, start, "user_index", 1000)
        if not user_list:
            return return_list
        start += 1000
        for item in user_list:
            uid = item.get('user', '0')
            search_list.append(uid)  # uid list
        search_result = es_portrait.mget(index=portrait_index, doc_type=portrait_type,
                                         body={"ids": search_list}, _source=True)["docs"]
        profile_result = es_profile.mget(index="weibo_user", doc_type="user",
                                         body={"ids": search_list}, _source=True)["docs"]
        for index, item in enumerate(search_result):
            count_s += 1
            if item['found'] and field_dict.values()[0] in item['_source'][field_dict.keys()[0]]:
                info = ['', '', '', '', '', '', '']
                info[0] = rank
                if profile_result[index]['found']:
                    info[1] = profile_result[index]['_source'].get('photo_url', '')
                    info[3] = profile_result[index]['_source'].get('nick_name', '')
                info[2] = search_result[index].get('_id', '')
                info[4] = user_list[index]['user_index']
                info[5] = search_result[index]['_source'].get('activeness', '')
                info[6] = search_result[index]['_source'].get('importance', '')
                rank += 1
                return_list.append(info)
                if rank >= int(number) + 1:
                    return return_list
        if count_s > 10000:
            return return_list
def compare_user_profile(uid_list):
    results = {}
    search_results = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                          body={'ids': uid_list})['docs']
    for result in search_results:
        uid = result['_id']
        results[uid] = []
        try:
            item = result['_source']
        except:
            item = {}
        try:
            photo_url = item['photo_url']
        except:
            photo_url = 'unknown'
        results[uid] = photo_url
    return results
def search_follower(uid):
    stat_results = dict()
    for db_num in R_DICT:
        r = R_DICT[db_num]
        br_uid_results = r.hgetall('be_retweet_' + str(uid))
        if br_uid_results:
            for br_uid in br_uid_results:
                if br_uid != uid:
                    try:
                        stat_results[br_uid] += int(br_uid_results[br_uid])  # redis returns strings
                    except KeyError:
                        stat_results[br_uid] = int(br_uid_results[br_uid])
    if not stat_results:
        return [None, 0]
    try:
        sort_stat_results = sorted(stat_results.items(), key=lambda x: x[1], reverse=True)[:20]
    except:
        return [None, 0]
    uid_list = [item[0] for item in sort_stat_results]
    es_profile_results = es_user_profile.mget(index='weibo_user', doc_type='user',
                                              body={'ids': uid_list})['docs']
    es_portrait_results = es_user_portrait.mget(index='user_portrait', doc_type='user',
                                                body={'ids': uid_list})['docs']
    result_list = []
    for i in range(len(es_profile_results)):
        item = es_profile_results[i]
        uid = item['_id']
        try:
            source = item['_source']
            uname = source['nick_name']
        except:
            uname = u'未知'
        portrait_item = es_portrait_results[i]
        try:
            source = portrait_item['_source']
            in_status = 1
        except:
            in_status = 0
        result_list.append([uid, [uname, stat_results[uid], in_status]])
    return [result_list[:20], len(stat_results)]
import operator  # needed for operator.itemgetter below

def get_origin_weibo_detail(ts, user, task_name, size, order, message_type=1):
    _id = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    mid_value = json.loads(task_detail['mid_topic_value'])
    duplicate_dict = json.loads(task_detail['duplicate_dict'])
    tmp_duplicate_dict = dict()
    for k, v in duplicate_dict.iteritems():
        try:
            tmp_duplicate_dict[v].append(k)
        except KeyError:
            tmp_duplicate_dict[v] = [k, v]
    if message_type == 1:
        weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    elif message_type == 2:
        weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])
    else:
        weibo_detail = json.loads(task_detail['sensitive_weibo_detail'])
    weibo_detail_list = []
    if weibo_detail:
        for iter_mid, item in weibo_detail.iteritems():
            tmp = []
            tmp.append(iter_mid)
            tmp.append(item[iter_mid])
            tmp.append(item['retweeted'])
            tmp.append(item['comment'])
            weibo_detail_list.append(tmp)
    mid_list = weibo_detail.keys()
    results = []
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {"mid": mid_list}
                }
            }
        },
        "size": 1000,
        "sort": {"timestamp": {"order": "desc"}}
    }
    index_list = []
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - DAY)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        index_list.append(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)
    if exist_es_1:
        index_list.append(index_name_1)
    if index_list and mid_list:
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    else:
        search_results = []
    uid_list = []
    text_dict = dict()      # text information
    portrait_dict = dict()  # profile information
    sort_results = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
            text_dict[item['_id']] = item['_source']  # _id is the mid
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                              body={"ids": uid_list},
                                              fields=['nick_name', 'photo_url'])["docs"]
            for item in portrait_result:
                if item['found']:
                    portrait_dict[item['_id']] = {"nick_name": item["fields"]["nick_name"][0],
                                                  "photo_url": item["fields"]["photo_url"][0]}
                else:
                    portrait_dict[item['_id']] = {"nick_name": item['_id'], "photo_url": ""}
        if order == "total":
            sorted_list = sorted(weibo_detail_list, key=lambda x: x[1], reverse=True)
        elif order == "retweeted":
            sorted_list = sorted(weibo_detail_list, key=lambda x: x[2], reverse=True)
        elif order == "comment":
            sorted_list = sorted(weibo_detail_list, key=lambda x: x[3], reverse=True)
        else:
            sorted_list = weibo_detail_list
        count_n = 0
        mid_index_dict = dict()
        for item in sorted_list:  # size
            mid = item[0]
            iter_text = text_dict.get(mid, {})
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, keywords_string, message_type
            if iter_text:
                uid = iter_text['uid']
                temp.append(uid)
                iter_portrait = portrait_dict.get(uid, {})
                if iter_portrait:
                    temp.append(iter_portrait['nick_name'])
                    temp.append(iter_portrait['photo_url'])
                else:
                    temp.extend([uid, ''])
                temp.append(iter_text["text"])
                temp.append(iter_text["sentiment"])
                temp.append(ts2date(iter_text['timestamp']))
                temp.append(iter_text['geo'])
                if message_type == 1:
                    temp.append(1)
                elif message_type == 2:
                    temp.append(3)
                else:
                    temp.append(iter_text['message_type'])
                temp.append(iter_text['keywords_string'])
                temp.append(item[2])
                temp.append(item[3])
                temp.append(iter_text.get('sensitive', 0))
                temp.append(iter_text['timestamp'])
                temp.append(mid_value[mid])
                temp.append(mid)
                results.append(temp)
            count_n += 1
        # sort by sensitive flag, then topic value, then retweet count
        results = sorted(results, key=operator.itemgetter(-4, -2, -6), reverse=True)
        sort_results = []
        count = 0
        for item in results:
            sort_results.append([item])
            mid_index_dict[item[-1]] = count
            count += 1
        if tmp_duplicate_dict:
            # merge near-duplicate weibos into the earliest-ranked entry
            remove_list = []
            value_list = tmp_duplicate_dict.values()  # [[mid, mid], ...]
            for item in value_list:
                tmp = []
                for mid in item:
                    if mid_index_dict.get(mid, 0):
                        tmp.append(mid_index_dict[mid])
                if len(tmp) > 1:
                    tmp_min = min(tmp)
                else:
                    continue
                tmp.remove(tmp_min)
                for iter_count in tmp:
                    sort_results[tmp_min].extend(sort_results[iter_count])
                    remove_list.append(sort_results[iter_count])
            if remove_list:
                for item in remove_list:
                    sort_results.remove(item)
    return sort_results
def identify_user_portrait(user_set, filter_type):
    user_list = list(user_set)
    # identify which users are already in user_portrait
    iter_count = 0
    all_user_count = len(user_list)
    all_in_portrait_user = dict()
    all_out_portrait_user_list = []
    max_result = get_evaluate_max()
    while iter_count < all_user_count:  # was `<=`, which ran one extra, empty iteration
        iter_user_list = user_list[iter_count: iter_count + SENTIMENT_ITER_USER_COUNT]
        # search users in user_portrait
        try:
            in_portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                                       body={'ids': iter_user_list}, _source=False,
                                                       fields=['uname', 'influence', 'activeness',
                                                               'importance', 'sensitive'])['docs']
        except:
            in_portrait_result = []
        # add all hit users
        for in_portrait_item in in_portrait_result:
            if in_portrait_item['found'] == True:
                uname = in_portrait_item['fields']['uname'][0]
                influence = in_portrait_item['fields']['influence'][0]
                normal_influence = math.log(influence / max_result['influence'] * 9 + 1, 10) * 100
                activeness = in_portrait_item['fields']['activeness'][0]
                normal_activeness = math.log(activeness / max_result['activeness'] * 9 + 1, 10) * 100
                importance = in_portrait_item['fields']['importance'][0]
                normal_importance = math.log(importance / max_result['importance'] * 9 + 1, 10) * 100
                try:
                    sensitive = in_portrait_item['fields']['sensitive'][0]
                    normal_sensitive = math.log(sensitive / max_result['sensitive'] * 9 + 1, 10) * 100
                except:
                    normal_sensitive = 0
                all_in_portrait_user[in_portrait_item['_id']] = [uname, normal_influence, normal_activeness,
                                                                 normal_importance, normal_sensitive]
            else:
                all_out_portrait_user_list.append(int(in_portrait_item['_id']))
        iter_count += SENTIMENT_ITER_USER_COUNT
    if filter_type == 'in':
        return all_in_portrait_user
    # get profile info for users outside user_portrait
    iter_count = 0
    all_out_portrait_user = dict()
    all_out_user_count = len(all_out_portrait_user_list)
    while iter_count < all_out_user_count:
        iter_uid_list = all_out_portrait_user_list[iter_count: iter_count + SENTIMENT_ITER_USER_COUNT]
        try:
            profile_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                                  body={'ids': iter_uid_list}, _source=False,
                                                  fields=['nick_name', 'statusnum', 'friendsnum', 'fansnum'])['docs']
        except:
            profile_result = []
        for profile_item in profile_result:
            if profile_item['found'] == True:
                uname = profile_item['fields']['nick_name'][0]
                statusnum = profile_item['fields']['statusnum'][0]
                friendsnum = profile_item['fields']['friendsnum'][0]
                fansnum = profile_item['fields']['fansnum'][0]
            else:
                uname = profile_item['_id']
                statusnum = 0
                friendsnum = 0
                fansnum = 0
            all_out_portrait_user[str(profile_item['_id'])] = [uname, statusnum, friendsnum, fansnum]
        iter_count += SENTIMENT_ITER_USER_COUNT
    return all_in_portrait_user, all_out_portrait_user
# step1: get the group user list by task_name
group_index_name = 'group_result'
group_index_type = 'group'
try:
    group_task = es.get(index=group_index_name, doc_type=group_index_type, id=task_name)['_source']
except Exception, e:
    raise e
user_list = group_task['uid_list']
# step2: get the group user names
profile_index_name = 'weibo_user'
profile_index_type = 'user'
try:
    user_name_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                            body={'ids': user_list})['docs']
except Exception, e:
    raise e
# step3: get the group users' weibo from the leveldb buckets
file_list = set(os.listdir(DEFAULT_LEVELDBPATH))
count = 0
for user in user_list:
    user_nick_name = user_name_result[count]['_source']['nick_name']
    for i in range(1, 25):
        leveldb_folder = date + str(i)
        if leveldb_folder in file_list:
            leveldb_bucket = dynamic_leveldb(leveldb_folder)
            try:
                user_weibo = leveldb_bucket.Get(str(user))
def get_final_submit_user_info(uid_list):
    final_results = []
    try:
        profile_results = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                               body={'ids': uid_list})['docs']
    except:
        profile_results = []
    try:
        bci_history_results = es_bci_history.mget(index=bci_history_index_name, doc_type=bci_history_index_type,
                                                  body={'ids': uid_list})['docs']
    except:
        bci_history_results = []
    # get bci_history max value
    now_time_ts = time.time()
    search_date_ts = datetime2ts(ts2datetime(now_time_ts - DAY))
    bci_key = 'bci_' + str(search_date_ts)
    query_body = {
        'query': {'match_all': {}},
        'sort': [{bci_key: {'order': 'desc'}}],
        'size': 1
    }
    try:
        bci_max_result = es_bci_history.search(index=bci_history_index_name, doc_type=bci_history_index_type,
                                               body=query_body, _source=False, fields=[bci_key])['hits']['hits']
    except:
        bci_max_result = {}
    if bci_max_result:
        bci_max_value = bci_max_result[0]['fields'][bci_key][0]
    else:
        bci_max_value = MAX_VALUE
    iter_count = 0
    for uid in uid_list:
        try:
            profile_item = profile_results[iter_count]
        except:
            profile_item = {}
        try:
            bci_history_item = bci_history_results[iter_count]
        except:
            bci_history_item = {}
        if profile_item and profile_item['found'] == True:
            uname = profile_item['_source']['nick_name']
            location = profile_item['_source']['user_location']
        else:
            uname = ''
            location = ''
        if bci_history_item and bci_history_item['found'] == True:
            fansnum = bci_history_item['_source']['user_fansnum']
            statusnum = bci_history_item['_source']['weibo_month_sum']
            try:
                bci = bci_history_item['_source'][bci_key]
                normal_bci = math.log(bci / bci_max_value * 9 + 1, 10) * 100
            except:
                normal_bci = ''
        else:
            fansnum = ''
            statusnum = ''
            normal_bci = ''
        final_results.append([uid, uname, location, fansnum, statusnum, normal_bci])
        iter_count += 1
    return final_results
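# NOTE: ts2datetime, datetime2ts and ts2date are imported helpers, not defined
# in this file. From their use ('YYYY-MM-DD' strings joined into index names
# such as bci_20161127), plausible implementations look like the sketch below;
# the project's real versions may differ in timezone handling.
import time

def ts2datetime(ts):
    # unix timestamp -> 'YYYY-MM-DD'
    return time.strftime('%Y-%m-%d', time.localtime(int(ts)))

def ts2date(ts):
    # unix timestamp -> 'YYYY-MM-DD HH:MM:SS'
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(ts)))

def datetime2ts(date):
    # 'YYYY-MM-DD' -> unix timestamp at midnight
    return int(time.mktime(time.strptime(date, '%Y-%m-%d')))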
def get_retweet_weibo_detail(ts, user, task_name, size, text_type, type_value):
    _id = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    origin_weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    retweeted_weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])
    mid_list = []
    mid_list.extend(origin_weibo_detail.keys())
    mid_list.extend(retweeted_weibo_detail.keys())
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"range": {
                                "timestamp": {
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }},
                            {"terms": {"root_mid": mid_list}}
                        ]
                    }
                }
            }
        },
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100
    }
    if text_type == "message_type":
        query_body['query']['filtered']['filter']['bool']['must'].append({"term": {text_type: type_value}})
    if text_type == "sentiment":
        if len(type_value) == 1:
            query_body['query']['filtered']['filter']['bool']['must'].append({"term": {text_type: type_value}})
        else:
            query_body['query']['filtered']['filter']['bool']['must'].append({"terms": {text_type: type_value}})
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)
    # 1. query the weibo text
    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name, doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    elif datetime != datetime_1 and exist_es_1:
        search_results = es_text.search(index=index_name_1, doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    else:
        search_results = []
    # 2. attach user information to each weibo
    results = []
    uid_list = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                              body={"ids": uid_list},
                                              fields=['nick_name', 'photo_url'])["docs"]
            for i in range(len(uid_list)):
                item = search_results[i]['_source']
                temp = []
                # uid, nick_name, photo_url, text, sentiment, timestamp, geo, message_type
                temp.append(item['uid'])
                if portrait_result[i]['found']:
                    temp.append(portrait_result[i]["fields"]["nick_name"][0])
                    temp.append(portrait_result[i]["fields"]["photo_url"][0])
                else:
                    temp.append(item['uid'])
                    temp.append("")
                temp.append(item["text"])
                temp.append(item["sentiment"])
                temp.append(ts2date(item['timestamp']))
                temp.append(item['geo'])
                temp.append(item["message_type"])
                results.append(temp)
    return results
def get_sensitive_text_detail(task_name, ts, user, order):
    _id = user + '-' + task_name
    task_detail = es.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    weibo_detail = json.loads(task_detail['sensitive_weibo_detail'])
    weibo_detail_list = []
    if weibo_detail:
        for iter_mid, item in weibo_detail.iteritems():
            tmp = []
            tmp.append(iter_mid)
            tmp.append(item[iter_mid])
            tmp.append(item['retweeted'])
            tmp.append(item['comment'])
            weibo_detail_list.append(tmp)
    mid_list = weibo_detail.keys()
    results = []
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {"mid": mid_list}
                }
            }
        }
    }
    index_list = []
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - DAY)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        index_list.append(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)
    if exist_es_1:
        index_list.append(index_name_1)
    if index_list and mid_list:
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    else:
        search_results = []
    uid_list = []
    text_dict = dict()      # text information
    portrait_dict = dict()  # profile information
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
            text_dict[item['_id']] = item['_source']  # _id is the mid
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                              body={"ids": uid_list},
                                              fields=['nick_name', 'photo_url'])["docs"]
            for item in portrait_result:
                if item['found']:
                    portrait_dict[item['_id']] = {"nick_name": item["fields"]["nick_name"][0],
                                                  "photo_url": item["fields"]["photo_url"][0]}
                else:
                    portrait_dict[item['_id']] = {"nick_name": item['_id'], "photo_url": ""}
        if order == "total":
            sorted_list = sorted(weibo_detail_list, key=lambda x: x[1], reverse=True)
        elif order == "retweeted":
            sorted_list = sorted(weibo_detail_list, key=lambda x: x[2], reverse=True)
        elif order == "comment":
            sorted_list = sorted(weibo_detail_list, key=lambda x: x[3], reverse=True)
        else:
            sorted_list = weibo_detail_list
        count_n = 0
        for item in sorted_list:
            mid = item[0]
            iter_text = text_dict.get(mid, {})
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, message_type
            if iter_text:
                uid = iter_text['uid']
                temp.append(uid)
                iter_portrait = portrait_dict.get(uid, {})
                if iter_portrait:
                    temp.append(iter_portrait['nick_name'])
                    temp.append(iter_portrait['photo_url'])
                else:
                    temp.extend([uid, ''])
                temp.append(iter_text["text"])
                temp.append(iter_text["sentiment"])
                temp.append(ts2date(iter_text['timestamp']))
                temp.append(iter_text['geo'])
                temp.append(iter_text['message_type'])
                temp.append(item[2])
                temp.append(item[3])
                temp.append(iter_text.get('sensitive', 0))
                count_n += 1
                results.append(temp)
    if results and order == "ts":
        results = sorted(results, key=lambda x: x[5], reverse=True)
    if results and order == "sensitive":
        results = sorted(results, key=lambda x: x[-1], reverse=True)
    return results
def get_user_detail(date, input_result, status, user_type="influence", auth=""):
    results = []
    if status == 'show_in':
        uid_list = input_result
    if status == 'show_compute':
        uid_list = input_result.keys()
    if status == 'show_in_history':
        uid_list = input_result.keys()
    if date != 'all':
        index_name = 'bci_' + ''.join(date.split('-'))
    else:
        now_ts = time.time()
        now_date = ts2datetime(now_ts)
        index_name = 'bci_' + ''.join(now_date.split('-'))
    index_type = 'bci'
    user_bci_result = es_cluster.mget(index=index_name, doc_type=index_type,
                                      body={'ids': uid_list}, _source=True)['docs']
    user_profile_result = es_user_profile.mget(index='weibo_user', doc_type='user',
                                               body={'ids': uid_list}, _source=True)['docs']
    max_evaluate_influ = get_evaluate_max(index_name)
    for i in range(0, len(uid_list)):
        uid = uid_list[i]
        bci_dict = user_bci_result[i]
        profile_dict = user_profile_result[i]
        try:
            bci_source = bci_dict['_source']
        except:
            bci_source = None
        if bci_source:
            influence = bci_source['user_index']
            influence = math.log(influence / max_evaluate_influ['user_index'] * 9 + 1, 10)
            influence = influence * 100
        else:
            influence = ''
        try:
            profile_source = profile_dict['_source']
        except:
            profile_source = None
        if profile_source:
            uname = profile_source['nick_name']
            location = profile_source['user_location']
            fansnum = profile_source['fansnum']
            statusnum = profile_source['statusnum']
        else:
            uname = ''
            location = ''
            fansnum = ''
            statusnum = ''
        if status == 'show_in':
            if user_type == "sensitive":
                tmp_ts = datetime2ts(date) - DAY
                tmp_data = r_cluster.hget("sensitive_" + str(tmp_ts), uid)
                if tmp_data:
                    sensitive_dict = json.loads(tmp_data)
                    sensitive_words = sensitive_dict.keys()
                else:
                    sensitive_words = []  # was misspelled `senstive_words`, a NameError in this branch
                results.append([uid, uname, location, fansnum, statusnum, influence, sensitive_words])
            else:
                results.append([uid, uname, location, fansnum, statusnum, influence])
            if auth:
                hashname_submit = "submit_recomment_" + date
                tmp_data = json.loads(r.hget(hashname_submit, uid))
                recommend_list = (tmp_data['operation']).split('&')
                admin_list = []
                admin_list.append(tmp_data['system'])
                admin_list.append(list(set(recommend_list)))
                admin_list.append(len(recommend_list))
                results[-1].extend(admin_list)
        if status == 'show_compute':
            in_date = json.loads(input_result[uid])[0]
            compute_status = json.loads(input_result[uid])[1]
            if compute_status == '1':
                compute_status = '3'
            results.append([uid, uname, location, fansnum, statusnum, influence, in_date, compute_status])
        if status == 'show_in_history':
            in_status = input_result[uid]
            if user_type == "sensitive":
                tmp_ts = datetime2ts(date) - DAY
                tmp_data = r_cluster.hget("sensitive_" + str(tmp_ts), uid)
                if tmp_data:
                    sensitive_dict = json.loads(tmp_data)
                    sensitive_words = sensitive_dict.keys()
                else:
                    sensitive_words = []
                results.append([uid, uname, location, fansnum, statusnum, influence, in_status, sensitive_words])
            else:
                results.append([uid, uname, location, fansnum, statusnum, influence, in_status])
    return results
def get_task_detail_2(task_name, ts, user):
    results = dict()
    _id = user + "-" + task_name
    task_detail = es.get(index=index_manage_sensing_task, doc_type=task_doc_type, id=_id)["_source"]
    task_name = task_detail['task_name']
    social_sensors = json.loads(task_detail['social_sensors'])
    history_status = json.loads(task_detail['history_status'])
    start_time = task_detail['create_at']
    create_by = task_detail['create_by']
    stop_time = task_detail['stop_time']
    remark = task_detail['remark']
    portrait_detail = []
    count = 0  # counter
    if social_sensors:
        search_results = es.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                 body={"ids": social_sensors}, fields=SOCIAL_SENSOR_INFO)['docs']
        for item in search_results:
            temp = []
            if item['found']:
                for iter_item in SOCIAL_SENSOR_INFO:
                    if iter_item == "topic_string":
                        temp.append(item["fields"][iter_item][0].split('&'))
                    else:
                        temp.append(item["fields"][iter_item][0])
                portrait_detail.append(temp)
        portrait_detail = sorted(portrait_detail, key=lambda x: x[5], reverse=True)
    time_series = []              # timestamps
    positive_sentiment_list = []  # sentiment series
    neutral_sentiment_list = []
    negetive_sentiment_list = []
    all_weibo_list = []
    origin_weibo_list = []        # weibo series
    retweeted_weibo_list = []
    retweeted_weibo_count = []    # number of times others retweeted him
    comment_weibo_count = []
    total_number_count = []
    burst_time_list = []          # burst timestamps
    important_user_set = set()    # important users
    out_portrait_users = set()    # users not yet in the portrait index
    ts = int(ts)
    for item in history_status:
        if int(item[0]) <= ts:
            time_series.append(item[0])  # all timestamps up to now
    # get detailed task information from es
    if time_series:
        flow_detail = es.mget(index=index_sensing_task, doc_type=_id, body={"ids": time_series})['docs']
    else:
        flow_detail = {}
    if flow_detail:
        for item in flow_detail:
            item = item['_source']
            timestamp = item['timestamp']
            sentiment_distribution = json.loads(item["sentiment_distribution"])
            positive_sentiment_list.append(int(sentiment_distribution['1']))
            negetive_sentiment_list.append(int(sentiment_distribution['2']) + int(sentiment_distribution['3'])
                                           + int(sentiment_distribution['4']) + int(sentiment_distribution['5'])
                                           + int(sentiment_distribution['6']))
            neutral_sentiment_list.append(int(sentiment_distribution['0']))
            origin_weibo_list.append(item["origin_weibo_number"])  # real
            retweeted_weibo_list.append(item['retweeted_weibo_number'])  # real
            all_weibo_list.append(item["origin_weibo_number"] + item['retweeted_weibo_number'])
            retweeted_weibo_count.append(item['retweeted_weibo_count'])
            comment_weibo_count.append(item['comment_weibo_count'])
            total_number_count.append(item['weibo_total_number'])
            temp_important_user_list = json.loads(item['important_users'])
            unfiltered_users = json.loads(item['unfilter_users'])
            temp_out_portrait_users = set(unfiltered_users) - set(temp_important_user_list)  # not in portrait
            important_user_set = important_user_set | set(temp_important_user_list)
            out_portrait_users = out_portrait_users | set(temp_out_portrait_users)
            burst_reason = item.get("burst_reason", "")
            if burst_reason:
                burst_time_list.append([timestamp, count, burst_reason])
            count += 1
    # tally burst reasons and draw the corresponding conclusions
    weibo_variation_count = 0
    weibo_variation_time = []
    sentiment_variation_count = 0
    sentiment_variation_time = []
    common_variation_count = 0
    common_variation_time = []
    if burst_time_list:
        for item in burst_time_list:
            tmp_common = 0
            x1 = 0
            x2 = 0
            if signal_count_varition in item[2]:
                weibo_variation_count += 1
                weibo_variation_time.append([ts2date_min(item[0]), total_number_count[item[1]]])
                x1 = total_number_count[item[1]]
                tmp_common += 1
            if signal_sentiment_varition in item[2]:
                tmp_common += 1
                sentiment_variation_count += 1
                x2 = negetive_sentiment_list[item[1]]
                sentiment_variation_time.append([ts2date_min(item[0]), negetive_sentiment_list[item[1]]])
            if tmp_common == 2:
                common_variation_count += 1
                common_variation_time.append([ts2date_min(item[0]), x1, x2])
    warning_conclusion = remark
    variation_distribution = []
    if weibo_variation_count:
        variation_distribution.append(weibo_variation_time)
    else:
        variation_distribution.append([])
    if sentiment_variation_count:
        variation_distribution.append(sentiment_variation_time)
    else:
        variation_distribution.append([])
    if common_variation_count:
        variation_distribution.append(common_variation_time)
    else:
        variation_distribution.append([])
    results['warning_conclusion'] = warning_conclusion
    results['variation_distribution'] = variation_distribution
    # per-user heat: fetch personal information for the important users
    top_influence = get_top_influence("influence")
    top_activeness = get_top_influence("activeness")
    top_importance = get_top_influence("importance")
    important_uid_list = list(important_user_set)
    out_portrait_users_list = list(out_portrait_users)
    user_detail_info = []
    out_user_detail_info = []  # was commented out in the original, causing a NameError below
    if important_uid_list:
        user_results = es.mget(index=portrait_index_name, doc_type=portrait_index_type,
                               body={"ids": important_uid_list},
                               fields=['uid', 'uname', 'domain', 'topic_string', "photo_url",
                                       'importance', 'influence', 'activeness'])['docs']
        for item in user_results:
            if item['found']:
                temp = []
                temp.append(item['fields']['uid'][0])
                temp.append(item['fields']['uname'][0])
                temp.append(item['fields']['photo_url'][0])
                temp.append(item['fields']['domain'][0])
                temp.append(item['fields']['topic_string'][0].split('&'))
                temp.append(math.ceil(item['fields']['importance'][0] / float(top_importance) * 100))
                temp.append(math.ceil(item['fields']['influence'][0] / float(top_influence) * 100))
                temp.append(math.ceil(item['fields']['activeness'][0] / float(top_activeness) * 100))
                user_detail_info.append(temp)
        # sort by influence
        user_detail_info = sorted(user_detail_info, key=lambda x: x[6], reverse=True)
    if out_portrait_users_list:
        profile_results = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                          body={"ids": out_portrait_users_list})["docs"]
        bci_index = "bci_" + ts2datetime(ts - DAY).replace('-', '')
        influence_results = es.mget(index=bci_index, doc_type="bci",
                                    body={"ids": out_portrait_users_list})['docs']
        top_influence = get_top_all_influence("influence", ts)
        count = 0
        if profile_results:
            for item in profile_results:
                temp = []
                if item['found']:
                    temp.append(item['_source']['uid'])
                    if item['_source']['nick_name']:
                        temp.append(item['_source']['nick_name'])
                    else:
                        temp.append(item['_source']['uid'])
                    temp.append(item['_source']['user_location'])
                    temp.append(item['_source']['fansnum'])
                else:
                    temp.append(item['_id'])
                    temp.append(item['_id'])
                    temp.extend(['', ''])
                temp_influ = influence_results[count]
                if temp_influ.get('found', 0):
                    user_index = temp_influ['_source']['user_index']
                    # was item['_source']['user_index'], reading the wrong document
                    temp.append(math.ceil(user_index / float(top_influence) * 100))
                else:
                    temp.append(0)
                count += 1
                out_user_detail_info.append(temp)
    revise_time_series = []
    for item in time_series:
        revise_time_series.append(ts2date_min(item))
    results['important_user_detail'] = user_detail_info
    results['out_portrait_user_detail'] = out_user_detail_info
    results['burst_time'] = burst_time_list  # burst timestamps and their reasons
    results['time_series'] = revise_time_series
    results['positive_sentiment_list'] = positive_sentiment_list
    results['negetive_sentiment_list'] = negetive_sentiment_list
    results['neutral_sentiment_list'] = neutral_sentiment_list
    results['all_weibo_list'] = all_weibo_list
    results['origin_weibo_list'] = origin_weibo_list
    results['retweeted_weibo_list'] = retweeted_weibo_list
    results['comment_weibo_count'] = comment_weibo_count
    results['retweeted_weibo_count'] = retweeted_weibo_count
    results['total_number_list'] = total_number_count
    results['social_sensors_detail'] = portrait_detail
    return results
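# NOTE: get_top_influence / get_top_all_influence are not defined in this file.
# They evidently return the current maximum of a portrait field (influence,
# activeness, importance), used above as the denominator when scaling scores to
# [0, 100]. A minimal sketch under that assumption:
def get_top_influence(field):
    query_body = {
        "query": {"match_all": {}},
        "size": 1,
        "sort": [{field: {"order": "desc"}}]
    }
    hits = es.search(index=portrait_index_name, doc_type=portrait_index_type,
                     body=query_body)['hits']['hits']
    return hits[0]['_source'][field] if hits else 1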
def search_fans(uid, top_count):
    results = {}
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    db_number = get_db_num(now_date_ts)
    be_comment_index_name = be_comment_index_name_pre + str(db_number)
    be_retweet_index_name = be_retweet_index_name_pre + str(db_number)
    result = {}
    be_retweet_inter_dict = {}
    be_comment_inter_dict = {}
    center_uid = uid
    try:
        be_retweet_result = es_retweet.get(index=be_retweet_index_name, doc_type=be_retweet_index_type, id=uid)['_source']
    except:
        be_retweet_result = {}
    if be_retweet_result:
        be_retweet_uid_dict = json.loads(be_retweet_result['uid_be_retweet'])
    else:
        be_retweet_uid_dict = {}
    # print "be_retweet_uid_dict", be_retweet_uid_dict
    try:
        be_comment_result = es_be_comment.get(index=be_comment_index_name, doc_type=be_comment_index_type, id=uid)['_source']
    except:
        be_comment_result = {}
    if be_comment_result:
        be_comment_uid_dict = json.loads(be_comment_result['uid_be_comment'])
    else:
        be_comment_uid_dict = {}
    # print "be_comment_uid_dict", be_comment_uid_dict
    fans_result = union_dict(be_retweet_uid_dict, be_comment_uid_dict)
    fans_user_set = set(fans_result.keys())
    fans_list = list(fans_user_set)
    # print "fans_list", fans_list
    all_fans_dict = {}
    for fans_user in fans_list:
        if fans_user != center_uid:
            all_fans_dict[fans_user] = fans_result[fans_user]
    sort_all_fans_dict = sorted(all_fans_dict.items(), key=lambda x: x[1], reverse=True)
    all_fans_uid_list = []
    all_fans_uid_list_all = [item[0] for item in sort_all_fans_dict]
    print all_fans_uid_list_all
    count = 0
    for i in all_fans_uid_list_all:
        count += 1
        all_fans_uid_list.append(i)
        if count == 1000:
            break
    print all_fans_uid_list
    out_portrait_list = all_fans_uid_list
    #use to get user information from user profile
    out_portrait_result = {}
    try:
        out_user_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={'ids': out_portrait_list})['docs']
    except:
        out_user_result = []
    #add index from bci_history
    try:
        bci_history_result = es_bci_history.mget(index=bci_history_index_name, doc_type=bci_history_index_type, body={'ids': out_portrait_list}, fields=fields)['docs']
    except:
        bci_history_result = []
    iter_count = 0
    out_portrait_list = []
    for out_user_item in out_user_result:
        uid = out_user_item['_id']
        if out_user_item['found'] == True:
            source = out_user_item['_source']
            uname = source['nick_name']
            photo_url = source['photo_url']
            if uname == '':
                uname = u'未知'
            location = source['user_location']
            friendsnum = source['friendsnum']
        else:
            uname = u'未知'
            location = ''
            friendsnum = ''
            photo_url = 'unknown'
        #add index from bci_history
        try:
            bci_history_item = bci_history_result[iter_count]
        except:
            bci_history_item = {'found': False}
        # print bci_history_item
        if bci_history_item['found'] == True:
            fansnum = bci_history_item['fields'][fields[0]][0]
            user_weibo_count = bci_history_item['fields'][fields[1]][0]
            user_friendsnum = bci_history_item['fields'][fields[2]][0]
            influence = bci_history_item['fields'][fields[3]][0]
        else:
            fansnum = ''
            user_weibo_count = ''
            user_friendsnum = ''
        fans_count = int(all_fans_dict[uid])
        out_portrait_list.append({'uid': uid, 'photo_url': photo_url, 'uname': uname, 'count': fans_count, 'fansnum': fansnum, 'friendsnum': user_friendsnum, 'weibo_count': user_weibo_count})
        iter_count += 1
    return out_portrait_list
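
# search_fans and search_bidirect_interaction both lean on union_dict to merge
# two {uid: count} mappings. The helper itself is defined elsewhere in this
# project; a minimal sketch of the assumed semantics (counts for shared keys
# are summed) looks like this:
def union_dict_sketch(*dicts):
    """Merge {key: count} dicts, summing the counts of keys that collide."""
    merged = {}
    for d in dicts:
        for k, v in d.items():
            merged[k] = merged.get(k, 0) + v
    return merged

# e.g. union_dict_sketch({'u1': 2, 'u2': 1}, {'u2': 3}) -> {'u1': 2, 'u2': 4}
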
def search_bidirect_interaction(uid, top_count):
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    db_number = get_db_num(now_date_ts)
    retweet_index_name = retweet_index_name_pre + str(db_number)
    be_retweet_index_name = be_retweet_index_name_pre + str(db_number)
    comment_index_name = comment_index_name_pre + str(db_number)
    be_comment_index_name = be_comment_index_name_pre + str(db_number)
    results = {}
    retweet_inter_dict = {}
    comment_inter_dict = {}
    center_uid = uid
    #bidirect interaction in retweet and be_retweet
    try:
        retweet_result = es_retweet.get(index=retweet_index_name, doc_type=retweet_index_type, id=uid)['_source']
    except:
        retweet_result = {}
    if retweet_result:
        retweet_uid_dict = json.loads(retweet_result['uid_retweet'])
    else:
        retweet_uid_dict = {}
    retweet_uid_list = retweet_uid_dict.keys()
    try:
        be_retweet_result = es_retweet.get(index=be_retweet_index_name, doc_type=be_retweet_index_type, id=uid)['_source']
    except:
        be_retweet_result = {}
    if be_retweet_result:
        be_retweet_uid_dict = json.loads(be_retweet_result['uid_be_retweet'])
    else:
        be_retweet_uid_dict = {}
    #bidirect interaction in comment and be_comment
    try:
        comment_result = es_comment.get(index=comment_index_name, doc_type=comment_index_type, id=uid)['_source']
    except:
        comment_result = {}
    if comment_result:
        comment_uid_dict = json.loads(comment_result['uid_comment'])
    else:
        comment_uid_dict = {}
    comment_uid_list = comment_uid_dict.keys()
    try:
        # fix: the index name was misspelled as "be_coment_index_name", which raised NameError
        be_comment_result = es_comment.get(index=be_comment_index_name, doc_type=be_comment_index_type, id=uid)['_source']
    except:
        be_comment_result = {}
    if be_comment_result:
        be_comment_uid_dict = json.loads(be_comment_result['uid_be_comment'])
    else:
        be_comment_uid_dict = {}
    #get bidirect_interaction dict
    #all_interaction_dict = union_dict(retweet_inter_dict, comment_inter_dict)
    retweet_comment_result = union_dict(retweet_uid_dict, comment_uid_dict)
    be_retweet_comment_result = union_dict(be_retweet_uid_dict, be_comment_uid_dict)
    interaction_user_set = set(retweet_comment_result.keys()) & set(be_retweet_comment_result.keys())
    interaction_user_list = list(interaction_user_set)
    all_interaction_dict = {}
    for interaction_user in interaction_user_list:
        if interaction_user != center_uid:
            all_interaction_dict[interaction_user] = retweet_comment_result[interaction_user] + be_retweet_comment_result[interaction_user]
    sort_all_interaction_dict = sorted(all_interaction_dict.items(), key=lambda x: x[1], reverse=True)
    #get in_portrait_list, in_portrait_results and out_portrait_list
    all_interaction_uid_list = [item[0] for item in sort_all_interaction_dict]
    #print all_interaction_uid_list
    # if RUN_TYPE == 0:
    #     all_interaction_dict = {'2029036025':3,'1282005885':2,'2549228714':2,'1809833450':1}
    #     all_interaction_uid_list = ['2029036025', '1282005885', '2549228714', '1809833450']
    out_portrait_list = all_interaction_uid_list
    #use to get user information from user profile
    out_portrait_result = {}
    try:
        out_user_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={'ids': out_portrait_list})['docs']
    except:
        out_user_result = []
    #add index from bci_history
    try:
        bci_history_result = es_bci_history.mget(index=bci_history_index_name, doc_type=bci_history_index_type, body={'ids': out_portrait_list}, fields=fields)['docs']
    except:
        bci_history_result = []
    iter_count = 0
    out_portrait_list = []
    for out_user_item in out_user_result:
        uid = out_user_item['_id']
        if out_user_item['found'] == True:
            source = out_user_item['_source']
            uname = source['nick_name']
            photo_url = source['photo_url']
            if uname == '':
                uname = u'未知'
            location = source['user_location']
            friendsnum = source['friendsnum']
        else:
            uname = u'未知'
            location = ''
            friendsnum = ''
            photo_url = 'unknown'
        #add index from bci_history
        try:
            bci_history_item = bci_history_result[iter_count]
        except:
            bci_history_item = {'found': False}
        # print bci_history_item
        if bci_history_item['found'] == True:
            fansnum = bci_history_item['fields'][fields[0]][0]
            user_weibo_count = bci_history_item['fields'][fields[1]][0]
            user_friendsnum = bci_history_item['fields'][fields[2]][0]
            influence = bci_history_item['fields'][fields[3]][0]
        else:
            fansnum = ''
            user_weibo_count = ''
            user_friendsnum = ''
        interaction_count = int(all_interaction_dict[uid])
        out_portrait_list.append({'uid': uid, 'photo_url': photo_url, 'uname': uname, 'count': interaction_count, 'fansnum': fansnum, 'friendsnum': user_friendsnum, 'weibo_count': user_weibo_count})
        iter_count += 1
    return out_portrait_list
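
# The core of search_bidirect_interaction: a user counts as a bidirectional
# contact only if they appear both in the outgoing (retweet/comment) maps and
# in the incoming (be_retweet/be_comment) maps; the two counts are then summed.
# A toy illustration of that set intersection on hand-built dicts:
outgoing = {'u1': 2, 'u2': 5}   # uids this user retweeted or commented on
incoming = {'u2': 1, 'u3': 4}   # uids who retweeted or commented on this user
bidirect = {u: outgoing[u] + incoming[u]
            for u in set(outgoing) & set(incoming)}
# bidirect == {'u2': 6}
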
def get_sensitive_weibo_detail(ts, social_sensors, sensitive_words_list, message_type, size=100):
    results = []
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"range": {
                                "timestamp": {
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }},
                            {"term": {"message_type": message_type}},
                            {"terms": {"keywords_string": sensitive_words_list}}
                        ]
                    }
                }
            }
        },
        "size": size,
        "sort": {"timestamp": {"order": "desc"}}
    }
    if social_sensors:
        query_body['query']['filtered']['filter']['bool']['must'].append({"terms": {"uid": social_sensors}})

    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)
    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    elif datetime != datetime_1 and exist_es_1:
        search_results = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []

    uid_list = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids": uid_list}, fields=['nick_name', 'photo_url'])["docs"]
            for i in range(len(uid_list)):
                item = search_results[i]['_source']
                temp = []
                # uid, nick_name, photo_url, text, sentiment, timestamp, geo, common_keywords, message_type
                temp.append(item['uid'])
                if portrait_result[i]['found']:
                    temp.append(portrait_result[i]["fields"]["nick_name"][0])
                    temp.append(portrait_result[i]["fields"]["photo_url"][0])
                else:
                    temp.append("unknown")
                    temp.append("")
                temp.append(item["text"])
                #print item['text']
                temp.append(item["sentiment"])
                temp.append(ts2date(item['timestamp']))
                temp.append(item['geo'])
                keywords_set = set(item['keywords_string'].split('&'))
                common_keywords = set(sensitive_words_list) & keywords_set
                temp.append(list(common_keywords))
                temp.append(item['message_type'])
                results.append(temp)
    return results
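
# Flow-text indices are partitioned per day (flow_text_index_name_pre plus a
# date string), so a query window [ts - time_interval, ts) may touch one or
# two daily indices. A sketch of the selection rule used above, with the
# prefix and the existence check passed in as hypothetical parameters:
def pick_flow_text_index(ts, time_interval, index_prefix, index_exists):
    """Return the daily index to query for the window ending at ts, or None."""
    day_now = ts2datetime(ts)
    day_prev = ts2datetime(ts - time_interval)
    if day_now == day_prev and index_exists(index_prefix + day_now):
        return index_prefix + day_now   # window lies inside a single day
    if day_now != day_prev and index_exists(index_prefix + day_prev):
        return index_prefix + day_prev  # window crosses midnight
    return None                         # no usable index
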
def get_positive_weibo_detail(ts, social_sensors, keywords_list, size, sentiment_type=1):
    former_mid_list = query_mid_list(ts - time_interval, keywords_list, time_segment, social_sensors)  # weibo mid list from the previous time window
    current_mid_list = query_mid_list(ts, keywords_list, time_interval, social_sensors)
    mid_list = []
    mid_list.extend(former_mid_list)
    mid_list.extend(current_mid_list)
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"range": {
                                "timestamp": {
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }}
                        ],
                        "should": [
                            {"terms": {"root_mid": mid_list}},
                            {"terms": {"mid": mid_list}}
                        ]
                    }
                }
            }
        },
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100
    }
    #if social_sensors and int(sentiment_type) == 1:
    #    query_body["query"]["filtered"]["filter"]["bool"]["must"].append({"terms":{"uid": social_sensors}})
    if int(sentiment_type) == 1 or int(sentiment_type) == 0:
        query_body["query"]["filtered"]["filter"]["bool"]["must"].append({"term": {"sentiment": sentiment_type}})
    else:
        # fix: the clause was wrapped in an extra list, producing an invalid bool filter
        query_body["query"]["filtered"]["filter"]["bool"]["must"].append({"terms": {"sentiment": ["2", "3"]}})

    # decide whether ts and ts - time_interval fall on the same day, i.e. which es index to query
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)

    # 1. aggregate the mid list of origin weibo
    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    elif datetime != datetime_1 and exist_es_1:
        search_results = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []

    uid_list = []
    results = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids": uid_list}, fields=['nick_name', 'photo_url'])["docs"]
            for i in range(len(uid_list)):
                item = search_results[i]['_source']
                temp = []
                # uid, nick_name, photo_url, text, sentiment, timestamp, geo, common_keywords, message_type
                temp.append(item['uid'])
                if portrait_result[i]['found']:
                    temp.append(portrait_result[i]["fields"]["nick_name"][0])
                    temp.append(portrait_result[i]["fields"]["photo_url"][0])
                else:
                    temp.append("unknown")
                    temp.append("")
                temp.append(item["text"])
                temp.append(item["sentiment"])
                temp.append(ts2date(item['timestamp']))
                temp.append(item['geo'])
                keywords_set = set(item['keywords_string'].split('&'))
                common_keywords = set(keywords_list) & keywords_set
                temp.append(list(common_keywords))
                temp.append(item['message_type'])
                results.append(temp)
    return results
def get_retweet_weibo_detail(ts, user, task_name, size, text_type, type_value):
    _id = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    origin_weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    retweeted_weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])

    mid_list = []
    mid_list.extend(origin_weibo_detail.keys())
    mid_list.extend(retweeted_weibo_detail.keys())

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"range": {
                                "timestamp": {
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }},
                            {"terms": {"root_mid": mid_list}}
                        ]
                    }
                }
            }
        },
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100
    }

    if text_type == "message_type":
        query_body['query']['filtered']['filter']['bool']['must'].append({"term": {text_type: type_value}})
    if text_type == "sentiment":
        #if isinstance(type_value, str):
        if len(type_value) == 1:
            query_body['query']['filtered']['filter']['bool']['must'].append({"term": {text_type: type_value}})
        else:
            query_body['query']['filtered']['filter']['bool']['must'].append({"terms": {text_type: type_value}})

    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)

    # 1. query the weibo text
    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    elif datetime != datetime_1 and exist_es_1:
        search_results = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []
    #print search_results

    # 2. attach user information to each weibo
    results = []
    uid_list = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids": uid_list}, fields=['nick_name', 'photo_url'])["docs"]
            for i in range(len(uid_list)):
                item = search_results[i]['_source']
                temp = []
                # uid, nick_name, photo_url, text, sentiment, timestamp, geo, message_type
                temp.append(item['uid'])
                if portrait_result[i]['found']:
                    temp.append(portrait_result[i]["fields"]["nick_name"][0])
                    temp.append(portrait_result[i]["fields"]["photo_url"][0])
                else:
                    temp.append(item['uid'])
                    temp.append("")
                temp.append(item["text"])
                #print item['text']
                temp.append(item["sentiment"])
                temp.append(ts2date(item['timestamp']))
                temp.append(item['geo'])
                temp.append(item["message_type"])
                results.append(temp)
    return results
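
# In get_retweet_weibo_detail the sentiment filter switches between a single
# "term" clause and a multi-value "terms" clause, depending on whether
# type_value holds one sentiment code (e.g. "1") or several (e.g. ["2", "3"]).
# The same decision in isolation, as a hypothetical helper:
def sentiment_filter(type_value):
    """Build the ES filter clause for one or many sentiment codes."""
    if len(type_value) == 1:
        return {"term": {"sentiment": type_value}}
    return {"terms": {"sentiment": type_value}}
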
def get_recommentation(submit_user):
    if RUN_TYPE:
        now_ts = time.time()
    else:
        now_ts = datetime2ts(RUN_TEST_TIME)
    in_portrait_set = set(r.hkeys("compute"))
    result = []
    for i in range(7):
        iter_ts = now_ts - i * DAY
        iter_date = ts2datetime(iter_ts)
        submit_user_recomment = "recomment_" + submit_user + "_" + str(iter_date)
        bci_date = ts2datetime(iter_ts - DAY)
        submit_user_recomment = r.hkeys(submit_user_recomment)
        bci_index_name = "bci_" + bci_date.replace('-', '')
        exist_bool = es_cluster.indices.exists(index=bci_index_name)
        if not exist_bool:
            continue
        if submit_user_recomment:
            user_bci_result = es_cluster.mget(index=bci_index_name, doc_type="bci", body={'ids': submit_user_recomment}, _source=True)['docs']
            user_profile_result = es_user_profile.mget(index='weibo_user', doc_type='user', body={'ids': submit_user_recomment}, _source=True)['docs']
            max_evaluate_influ = get_evaluate_max(bci_index_name)
            for j in range(len(submit_user_recomment)):  # renamed from i: the inner loop shadowed the outer day counter
                uid = submit_user_recomment[j]
                bci_dict = user_bci_result[j]
                profile_dict = user_profile_result[j]
                try:
                    bci_source = bci_dict['_source']
                except:
                    bci_source = None
                if bci_source:
                    influence = bci_source['user_index']
                    influence = math.log(influence / max_evaluate_influ['user_index'] * 9 + 1, 10)
                    influence = influence * 100
                else:
                    influence = ''
                try:
                    profile_source = profile_dict['_source']
                except:
                    profile_source = None
                if profile_source:
                    uname = profile_source['nick_name']
                    location = profile_source['user_location']
                    fansnum = profile_source['fansnum']
                    statusnum = profile_source['statusnum']
                else:
                    uname = ''
                    location = ''
                    fansnum = ''
                    statusnum = ''
                if uid in in_portrait_set:
                    in_portrait = "1"
                else:
                    in_portrait = "0"
                recomment_day = iter_date
                result.append([iter_date, uid, uname, location, fansnum, statusnum, influence, in_portrait])
    return result
def get_user_detail(date, input_result, status, user_type="influence", auth=""):
    bci_date = ts2datetime(datetime2ts(date) - DAY)
    results = []
    if status == 'show_in':
        uid_list = input_result
    if status == 'show_compute':
        uid_list = input_result.keys()
    if status == 'show_in_history':
        uid_list = input_result.keys()
    if date != 'all':
        index_name = 'bci_' + ''.join(bci_date.split('-'))
    else:
        now_ts = time.time()
        now_date = ts2datetime(now_ts)
        index_name = 'bci_' + ''.join(now_date.split('-'))
    index_type = 'bci'
    user_bci_result = es_cluster.mget(index=index_name, doc_type=index_type, body={'ids': uid_list}, _source=True)['docs']
    user_profile_result = es_user_profile.mget(index='weibo_user', doc_type='user', body={'ids': uid_list}, _source=True)['docs']
    max_evaluate_influ = get_evaluate_max(index_name)
    for i in range(0, len(uid_list)):
        uid = uid_list[i]
        bci_dict = user_bci_result[i]
        profile_dict = user_profile_result[i]
        try:
            bci_source = bci_dict['_source']
        except:
            bci_source = None
        if bci_source:
            influence = bci_source['user_index']
            influence = math.log(influence / max_evaluate_influ['user_index'] * 9 + 1, 10)
            influence = influence * 100
        else:
            influence = ''
        try:
            profile_source = profile_dict['_source']
        except:
            profile_source = None
        if profile_source:
            uname = profile_source['nick_name']
            location = profile_source['user_location']
            fansnum = profile_source['fansnum']
            statusnum = profile_source['statusnum']
        else:
            uname = ''
            location = ''
            fansnum = ''
            statusnum = ''
        if status == 'show_in':
            if user_type == "sensitive":
                tmp_ts = datetime2ts(date) - DAY
                tmp_data = r_cluster.hget("sensitive_" + str(tmp_ts), uid)
                if tmp_data:
                    sensitive_dict = json.loads(tmp_data)
                    sensitive_words = sensitive_dict.keys()
                else:
                    sensitive_words = []
                results.append([uid, uname, location, fansnum, statusnum, influence, sensitive_words])
            else:
                results.append([uid, uname, location, fansnum, statusnum, influence])
            if auth:
                hashname_submit = "submit_recomment_" + date
                tmp_data = json.loads(r.hget(hashname_submit, uid))
                recommend_list = (tmp_data['operation']).split('&')
                admin_list = []
                admin_list.append(tmp_data['system'])
                admin_list.append(list(set(recommend_list)))
                admin_list.append(len(recommend_list))
                results[-1].extend(admin_list)
        if status == 'show_compute':
            in_date = json.loads(input_result[uid])[0]
            compute_status = json.loads(input_result[uid])[1]
            if compute_status == '1':
                compute_status = '3'
            results.append([uid, uname, location, fansnum, statusnum, influence, in_date, compute_status])
        if status == 'show_in_history':
            in_status = input_result[uid]
            if user_type == "sensitive":
                tmp_ts = datetime2ts(date) - DAY
                tmp_data = r_cluster.hget("sensitive_" + str(tmp_ts), uid)
                if tmp_data:
                    sensitive_dict = json.loads(tmp_data)
                    sensitive_words = sensitive_dict.keys()
                else:
                    sensitive_words = []  # fix: mirror the 'show_in' branch so sensitive_words is always bound
                results.append([uid, uname, location, fansnum, statusnum, influence, in_status, sensitive_words])
            else:
                results.append([uid, uname, location, fansnum, statusnum, influence, in_status])
    return results
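
# Both get_recommentation and get_user_detail map a raw user_index onto a
# 0-100 scale with log10(x / max * 9 + 1) * 100: x == 0 gives 0, x == max
# gives log10(10) * 100 == 100, and intermediate values are compressed
# logarithmically. A standalone sketch; normalize_influence is a hypothetical
# helper, not part of this module:
import math

def normalize_influence(user_index, max_user_index):
    """Log-scale a raw influence value onto [0, 100]."""
    if not max_user_index:
        return 0.0
    return math.log(user_index / float(max_user_index) * 9 + 1, 10) * 100

# normalize_influence(0, 500) -> 0.0; normalize_influence(500, 500) -> 100.0
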
query_sensitive_body = {
    "query": {
        "match_all": {}
    },
    "size": 1,
    "sort": {sensitive_string: {"order": "desc"}}
}
try:
    top_sensitive_result = es_bci_history.search(index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX, body=query_sensitive_body, _source=False, fields=[sensitive_string])['hits']['hits']
    top_sensitive = top_sensitive_result[0]['fields'][sensitive_string][0]
except Exception, reason:
    print Exception, reason
    top_sensitive = 400
index_type = 'bci'
user_bci_result = es_cluster.mget(index=index_name, doc_type=index_type, body={'ids': uid_list}, _source=True)['docs']
user_profile_result = es_user_profile.mget(index='weibo_user', doc_type='user', body={'ids': uid_list}, _source=True)['docs']
bci_history_result = es_bci_history.mget(index=bci_history_index_name, doc_type=bci_history_index_type, body={"ids": uid_list}, fields=['user_fansnum', 'weibo_month_sum'])['docs']
sensitive_history_result = es_bci_history.mget(index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX, body={'ids': uid_list}, fields=[sensitive_string], _source=False)['docs']
max_evaluate_influ = get_evaluate_max(index_name)
for i in range(0, len(uid_list)):
    uid = uid_list[i]
    bci_dict = user_bci_result[i]
    profile_dict = user_profile_result[i]
    bci_history_dict = bci_history_result[i]
    sensitive_history_dict = sensitive_history_result[i]
    #print sensitive_history_dict
    try:
        bci_source = bci_dict['_source']
    except:
        bci_source = None
    if bci_source:
def get_temporal_rank(task_type, sort="retweeted"):
    if int(task_type) == 0:  # cumulative, up to now
        sort_list = r.zrange("influence_%s" % sort, 0, 100, withscores=True, desc=True)
    elif int(task_type) == 1:
        sort_list = r.zrange("influence_%s_1" % sort, 0, 100, withscores=True, desc=True)
    elif int(task_type) == 2:
        sort_list = r.zrange("influence_%s_2" % sort, 0, 100, withscores=True, desc=True)
    elif int(task_type) == 3:
        sort_list = r.zrange("influence_%s_3" % sort, 0, 100, withscores=True, desc=True)
    else:
        sort_list = r.zrange("influence_%s_4" % sort, 0, 100, withscores=True, desc=True)
    uid_list = []
    for item in sort_list:
        uid_list.append(item[0])
    if sort == "retweeted":
        other = "comment"
    else:
        other = "retweeted"
    results = []
    # look up profile background information
    if uid_list:
        profile_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids": uid_list})["docs"]
        for item in profile_result:
            _id = item['_id']
            index = profile_result.index(item)
            tmp = []
            if item['found']:
                item = item['_source']
                tmp.append(item['uid'])
                tmp.append(item['nick_name'])
                tmp.append(item['statusnum'])
                tmp.append(item['user_location'])
                tmp.append(item['fansnum'])
            else:
                tmp.extend([_id, '', '', '', ''])
            count_1 = int(sort_list[index][1])
            if int(task_type) == 0:
                count_2 = int(r.zscore("influence_%s" % other, _id))
            else:
                count_2 = int(r.zscore("influence_%s_%s" % (other, task_type), _id))
            if sort == "retweeted":
                tmp.append(count_1)
                tmp.append(count_2)
            else:
                tmp.append(count_2)
                tmp.append(count_1)
            results.append(tmp)
    if uid_list:
        count = 0
        portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, body={"ids": uid_list})["docs"]
        for item in portrait_result:
            if item['found']:
                results[count].append("1")
            else:
                results[count].append("0")
            count += 1
    return results
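
# get_temporal_rank reads ready-made redis sorted sets named
# influence_retweeted / influence_comment (plus _1.._4 time-sliced variants).
# A hedged sketch of how such a zset could be maintained upstream -- the key
# names match the reader above, but the writer shown here is an assumption,
# not code from this project (redis-py 2.x argument order):
import redis

r_sketch = redis.StrictRedis(host='localhost', port=6379, db=0)

def record_interaction(sort_type, uid, delta=1):
    """Bump uid's score in the cumulative influence_<sort_type> zset."""
    r_sketch.zincrby("influence_%s" % sort_type, uid, delta)

# record_interaction("retweeted", "12345")
# r_sketch.zrange("influence_retweeted", 0, 100, withscores=True, desc=True)
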
def identify_user_portrait(user_set, filter_type):
    in_portrait_result = []
    out_portrait_result = []
    user_list = list(user_set)
    #identify the user_portrait
    iter_count = 0
    all_user_count = len(user_list)
    all_in_portrait_user = dict()
    all_out_portrait_user_list = []
    max_result = get_evaluate_max()
    while iter_count <= all_user_count:
        iter_user_list = user_list[iter_count: iter_count + SENTIMENT_ITER_USER_COUNT]
        #search user in user_portrait
        try:
            in_portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                                       body={'ids': iter_user_list}, _source=False,
                                                       fields=['uname', 'influence', 'activeness', 'importance', 'sensitive'])['docs']
        except:
            in_portrait_result = []
        #add all hit user
        for in_portrait_item in in_portrait_result:
            if in_portrait_item['found'] == True:
                uname = in_portrait_item['fields']['uname'][0]
                if uname == '' or uname == 'unknown':
                    uname = in_portrait_item['_id']
                influence = in_portrait_item['fields']['influence'][0]
                normal_influence = math.log(influence / max_result['influence'] * 9 + 1, 10) * 100
                activeness = in_portrait_item['fields']['activeness'][0]
                normal_activeness = math.log(activeness / max_result['activeness'] * 9 + 1, 10) * 100
                importance = in_portrait_item['fields']['importance'][0]
                normal_importance = math.log(importance / max_result['importance'] * 9 + 1, 10) * 100
                try:
                    sensitive = in_portrait_item['fields']['sensitive'][0]
                    normal_sensitive = math.log(sensitive / max_result['sensitive'] * 9 + 1, 10) * 100
                except:
                    normal_sensitive = 0
                all_in_portrait_user[in_portrait_item['_id']] = [uname, normal_influence, normal_activeness,
                                                                 normal_importance, normal_sensitive]
            else:
                all_out_portrait_user_list.append(int(in_portrait_item['_id']))
        iter_count += SENTIMENT_ITER_USER_COUNT
    if filter_type == 'in':
        return all_in_portrait_user
    #get out portrait user info
    iter_count = 0
    all_out_portrait_user = dict()
    all_out_user_count = len(all_out_portrait_user_list)
    while iter_count <= all_out_user_count:
        iter_uid_list = all_out_portrait_user_list[iter_count: iter_count + SENTIMENT_ITER_USER_COUNT]
        bci_iter_uid_list = [str(item) for item in iter_uid_list]
        try:
            profile_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                                  body={'ids': iter_uid_list}, _source=False, fields=['nick_name'])['docs']
        except:
            profile_result = []
        #bci_history
        try:
            bci_history_result = es_bci_history.mget(index=bci_history_index_name, doc_type=bci_history_index_type, body={'ids': bci_iter_uid_list}, _source=False, fields=['user_fansnum', 'weibo_month_sum', 'user_friendsnum'])['docs']
        except:
            bci_history_result = []
        bci_iter_count = 0
        for uid in iter_uid_list:
            try:
                profile_item = profile_result[bci_iter_count]
            except:
                profile_item = {'found': False}
            if profile_item['found'] == True:
                uname = profile_item['fields']['nick_name'][0]
            else:
                uname = str(uid)  # fix: the fallback dict has no '_id' key, so fall back to the uid itself
            try:
                bci_history_item = bci_history_result[bci_iter_count]
            except:
                bci_history_item = {'found': False}
            if bci_history_item['found'] == True:
                statusnum = bci_history_item['fields']['weibo_month_sum'][0]
                fansnum = bci_history_item['fields']['user_fansnum'][0]
                friendsnum = bci_history_item['fields']['user_friendsnum'][0]
            else:
                statusnum = 0
                fansnum = 0
                friendsnum = 0
            all_out_portrait_user[str(uid)] = [uname, statusnum, friendsnum, fansnum]
            bci_iter_count += 1
        iter_count += SENTIMENT_ITER_USER_COUNT
    return all_in_portrait_user, all_out_portrait_user
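
# identify_user_portrait pages through the uid list SENTIMENT_ITER_USER_COUNT
# ids at a time so that no single mget request grows too large. The looping
# pattern, extracted into a hedged standalone helper (chunked is hypothetical):
def chunked(seq, size):
    """Yield consecutive slices of seq, each at most size items long."""
    for start in range(0, len(seq), size):
        yield seq[start:start + size]

# for batch in chunked(user_list, 1000):
#     docs = es_user_portrait.mget(..., body={'ids': batch})['docs']
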
# get group weibo by date
def get_group_weibo(task_name, date):
    group_weibo = []
    #step1: get group user list by task_name
    group_index_name = 'group_result'
    group_index_type = 'group'
    try:
        group_task = es.get(index=group_index_name, doc_type=group_index_type, id=task_name)['_source']
    except Exception, e:
        raise e
    user_list = group_task['uid_list']
    #step2: get group user name
    profile_index_name = 'weibo_user'
    profile_index_type = 'user'
    try:
        user_name_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={'ids': user_list})['docs']
    except Exception, e:
        raise e
    #print 'user_name_result:', user_name_result
    #step3: get group user weibo
    file_list = set(os.listdir(DEFAULT_LEVELDBPATH))
    count = 0
    for user in user_list:
        user_nick_name = user_name_result[count]['_source']['nick_name']
        for i in range(1, 25):
            leveldb_folder = date + str(i)
            if leveldb_folder in file_list:
                leveldb_bucket = dynamic_leveldb(leveldb_folder)
                try:
                    user_weibo = leveldb_bucket.Get(str(user))
                    weibo_list = json.loads(user_weibo)
def get_origin_weibo_detail(ts, user, task_name, size, order, message_type=1):
    _id = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']

    mid_value = json.loads(task_detail['mid_topic_value'])
    duplicate_dict = json.loads(task_detail['duplicate_dict'])
    tmp_duplicate_dict = dict()
    for k, v in duplicate_dict.iteritems():
        try:
            tmp_duplicate_dict[v].append(k)
        except:
            tmp_duplicate_dict[v] = [k, v]

    if message_type == 1:
        weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    elif message_type == 2:
        weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])
    else:
        weibo_detail = json.loads(task_detail['sensitive_weibo_detail'])
    weibo_detail_list = []
    if weibo_detail:
        for iter_mid, item in weibo_detail.iteritems():
            tmp = []
            tmp.append(iter_mid)
            tmp.append(item[iter_mid])
            tmp.append(item['retweeted'])
            tmp.append(item['comment'])
            weibo_detail_list.append(tmp)
    mid_list = weibo_detail.keys()

    results = []
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {"mid": mid_list}
                }
            }
        },
        "size": 1000,
        "sort": {"timestamp": {"order": "desc"}}
    }

    index_list = []
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - DAY)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        index_list.append(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)
    if exist_es_1:
        index_list.append(index_name_1)

    if index_list and mid_list:
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []

    uid_list = []
    text_dict = dict()      # text information
    portrait_dict = dict()  # profile information
    sort_results = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
            text_dict[item['_id']] = item['_source']  # _id is the mid
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids": uid_list}, fields=['nick_name', 'photo_url'])["docs"]
            for item in portrait_result:
                if item['found']:
                    portrait_dict[item['_id']] = {"nick_name": item["fields"]["nick_name"][0], "photo_url": item["fields"]["photo_url"][0]}
                else:
                    portrait_dict[item['_id']] = {"nick_name": item['_id'], "photo_url": ""}

    if order == "total":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[1], reverse=True)
    elif order == "retweeted":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[2], reverse=True)
    elif order == "comment":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[3], reverse=True)
    else:
        sorted_list = weibo_detail_list

    count_n = 0
    results_dict = dict()
    mid_index_dict = dict()
    for item in sorted_list:  # size
        mid = item[0]
        iter_text = text_dict.get(mid, {})
        temp = []
        # uid, nick_name, photo_url, text, sentiment, timestamp, geo, common_keywords, message_type
        if iter_text:
            uid = iter_text['uid']
            temp.append(uid)
            iter_portrait = portrait_dict.get(uid, {})
            if iter_portrait:
                temp.append(iter_portrait['nick_name'])
                temp.append(iter_portrait['photo_url'])
            else:
                temp.extend([uid, ''])
            temp.append(iter_text["text"])
            temp.append(iter_text["sentiment"])
            temp.append(ts2date(iter_text['timestamp']))
            temp.append(iter_text['geo'])
            if message_type == 1:
                temp.append(1)
            elif message_type == 2:
                temp.append(3)
            else:
                temp.append(iter_text['message_type'])
            temp.append(item[2])
            temp.append(item[3])
            temp.append(iter_text.get('sensitive', 0))
            temp.append(iter_text['timestamp'])
            temp.append(mid_value[mid])
            temp.append(mid)
            results.append(temp)
        count_n += 1

    results = sorted(results, key=operator.itemgetter(-4, -2, -6), reverse=True)  # -4 -2 -3
    sort_results = []
    count = 0
    for item in results:
        sort_results.append([item])
        mid_index_dict[item[-1]] = count
        count += 1

    if tmp_duplicate_dict:
        remove_list = []
        value_list = tmp_duplicate_dict.values()  # [[mid, mid], ]
        for item in value_list:
            tmp = []
            for mid in item:
                if mid in mid_index_dict:  # fix: .get(mid, 0) treated index 0 (the top-ranked weibo) as missing
                    tmp.append(mid_index_dict[mid])
            if len(tmp) > 1:
                tmp_min = min(tmp)
            else:
                continue
            tmp.remove(tmp_min)
            for iter_count in tmp:
                sort_results[tmp_min].extend(sort_results[iter_count])
                remove_list.append(sort_results[iter_count])
        if remove_list:
            for item in remove_list:
                sort_results.remove(item)

    return sort_results
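
# duplicate_dict maps a weibo mid to the mid of the earliest duplicate text it
# matches; get_origin_weibo_detail inverts that into tmp_duplicate_dict
# ({representative_mid: [member mids]}) and then folds every member row into
# the best-ranked representative row. A toy run of the inversion (member order
# may vary with dict iteration order):
duplicate_dict = {'mid_b': 'mid_a', 'mid_c': 'mid_a'}
tmp_duplicate_dict = {}
for k, v in duplicate_dict.items():
    try:
        tmp_duplicate_dict[v].append(k)
    except KeyError:
        tmp_duplicate_dict[v] = [k, v]  # seed the group with both mids
# e.g. tmp_duplicate_dict == {'mid_a': ['mid_b', 'mid_a', 'mid_c']}
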
def get_task_detail_2(task_name, ts, user):
    results = dict()
    index_name = task_name
    _id = user + "-" + task_name
    task_detail = es.get(index=index_manage_sensing_task, doc_type=task_doc_type, id=_id)["_source"]
    task_name = task_detail['task_name']
    social_sensors = json.loads(task_detail['social_sensors'])
    history_status = json.loads(task_detail['history_status'])
    start_time = task_detail['create_at']
    create_by = task_detail['create_by']
    stop_time = task_detail['stop_time']
    remark = task_detail.get('remark', '')
    portrait_detail = []
    count = 0  # counter
    top_influence = get_top_influence("influence")
    top_activeness = get_top_influence("activeness")
    top_importance = get_top_influence("importance")
    if social_sensors:
        search_results = es.mget(index=portrait_index_name, doc_type=portrait_index_type, body={"ids": social_sensors}, fields=SOCIAL_SENSOR_INFO)['docs']
        for item in search_results:
            temp = []
            if item['found']:
                for iter_item in SOCIAL_SENSOR_INFO:
                    if iter_item == "topic_string":
                        temp.append(item["fields"][iter_item][0].split('&'))
                    elif iter_item == "activeness":
                        temp.append(math.log(item['fields']['activeness'][0] / float(top_activeness) * 9 + 1, 10) * 100)
                    elif iter_item == "importance":
                        temp.append(math.log(item['fields']['importance'][0] / float(top_importance) * 9 + 1, 10) * 100)
                    elif iter_item == "influence":
                        temp.append(math.log(item['fields']['influence'][0] / float(top_influence) * 9 + 1, 10) * 100)
                    else:
                        temp.append(item["fields"][iter_item][0])
                portrait_detail.append(temp)
    portrait_detail = sorted(portrait_detail, key=lambda x: x[5], reverse=True)

    time_series = []  # timestamps
    #positive_sentiment_list = []  # sentiment lists
    #neutral_sentiment_list = []
    #negetive_sentiment_list = []
    all_weibo_list = []
    origin_weibo_list = []  # weibo lists
    retweeted_weibo_list = []
    #retweeted_weibo_count = []  # number of times others retweeted this user
    #comment_weibo_count = []
    #total_number_count = []
    #burst_time_list = []  # burst-time list
    important_user_set = set()  # important users
    out_portrait_users = set()  # users not yet in the portrait library

    ts = int(ts)
    time_series = history_status
    #for item in history_status:
    #    if int(item[0]) <= ts:
    #        time_series.append(item[0])  # all timestamps up to now

    # get detail task information from es
    if time_series:
        flow_detail = es.mget(index=index_sensing_task, doc_type=_id, body={"ids": time_series})['docs']
    else:
        flow_detail = {}
    if flow_detail:
        for item in flow_detail:
            item = item['_source']
            timestamp = item['timestamp']
            #sentiment_distribution = json.loads(item["sentiment_distribution"])
            #positive_sentiment_list.append(int(sentiment_distribution['1']))
            #negetive_sentiment_list.append(int(sentiment_distribution['2'])+int(sentiment_distribution['3']) \
            #    +int(sentiment_distribution['4'])+int(sentiment_distribution['5'])+int(sentiment_distribution['6']))
            #neutral_sentiment_list.append(int(sentiment_distribution['0']))
            origin_weibo_list.append(item["origin_weibo_number"])  # real
            retweeted_weibo_list.append(item['retweeted_weibo_number'])  # real
            all_weibo_list.append(item["origin_weibo_number"] + item['retweeted_weibo_number'])
            #retweeted_weibo_count.append(item['retweeted_weibo_count'])
            #comment_weibo_count.append(item['comment_weibo_count'])
            #total_number_count.append(item['weibo_total_number'])
            temp_important_user_list = json.loads(item['important_users'])
            unfiltered_users = json.loads(item['unfilter_users'])
            temp_out_portrait_users = set(unfiltered_users) - set(temp_important_user_list)  # not yet in the portrait library
            important_user_set = important_user_set | set(temp_important_user_list)
            out_portrait_users = out_portrait_users | set(temp_out_portrait_users)
            #burst_reason = item.get("burst_reason", "")
            #if burst_reason:
            #    burst_time_list.append([timestamp, count, burst_reason])
            count += 1

    ####################################################################################
    # tally the burst reasons and draw the corresponding conclusions (currently disabled)
    """
    weibo_variation_count = 0
    weibo_variation_time = []
    sentiment_variation_count = 0
    sentiment_variation_time = []
    sensitive_variation_count = 0  # sensitive
    sensitive_variation_time = []  # sensitive
    common_variation_count = 0
    common_variation_time = []
    if burst_time_list:
        for item in burst_time_list:
            tmp_common = 0
            x1 = 0
            x2 = 0
            x3 = 0
            if signal_count_varition in item[2]:
                weibo_variation_count += 1
                weibo_variation_time.append([ts2date_min(item[0]), total_number_count[item[1]]])
                x1 = total_number_count[item[1]]
                tmp_common += 1
            if signal_sentiment_varition in item[2]:
                tmp_common += 1
                sentiment_variation_count += 1
                x2 = negetive_sentiment_list[item[1]]
                sentiment_variation_time.append([ts2date_min(item[0]), negetive_sentiment_list[item[1]]])
            if signal_sensitive_variation in item[2]:
                tmp_common += 1
                sensitive_variation_count += 1
                x3 = sensitive_total_number_list[item[1]]
                sensitive_variation_time.append([ts2date_min(item[0]), all_weibo_list[item[1]]])
            if tmp_common >= 2:
                common_variation_count += 1
                common_variation_time.append([ts2date_min(item[0]), x1, x2, x3])
    warning_conclusion = remark
    variation_distribution = []
    if weibo_variation_count:
        variation_distribution.append(weibo_variation_time)
    else:
        variation_distribution.append([])
    if sentiment_variation_count:
        variation_distribution.append(sentiment_variation_time)
    else:
        variation_distribution.append([])
    if sensitive_variation_count:
        variation_distribution.append(sensitive_variation_time)
    else:
        variation_distribution.append([])
    if common_variation_count:
        variation_distribution.append(common_variation_time)
    else:
        variation_distribution.append([])
    results['warning_conclusion'] = warning_conclusion
    results['variation_distribution'] = variation_distribution
    # per-user heat
    """

    # fetch profile information for the important users
    important_uid_list = list(important_user_set)
    out_portrait_users_list = list(out_portrait_users)
    social_sensor_set = set(social_sensors)
    user_detail_info = []
    out_user_detail_info = []  # fix: this list is appended to below, but its initialization was commented out
    if important_uid_list:
        user_results = es.mget(index=portrait_index_name, doc_type=portrait_index_type, body={"ids": important_uid_list}, fields=['uid', 'uname', 'domain', 'topic_string', "photo_url", 'importance', 'influence', 'activeness'])['docs']
        for item in user_results:
            if item['found']:
                temp = []
                #if int(item['fields']['importance'][0]) < IMPORTANT_USER_THRESHOULD:
                #    continue
                temp.append(item['fields']['uid'][0])
                uname = item['fields']['uname'][0]
                if not uname or uname == "未知":
                    uname = item['fields']['uid'][0]
                temp.append(uname)
                temp.append(item['fields']['photo_url'][0])
                temp.append(item['fields']['domain'][0])
                temp.append(item['fields']['topic_string'][0].split('&'))
                #hot_count = count_hot_uid(item['fields']['uid'][0], start_time, stop_time)
                #temp.append(hot_count)
                temp.append(math.log(item['fields']['importance'][0] / float(top_importance) * 9 + 1, 10) * 100)
                temp.append(math.log(item['fields']['influence'][0] / float(top_influence) * 9 + 1, 10) * 100)
                temp.append(math.log(item['fields']['activeness'][0] / float(top_activeness) * 9 + 1, 10) * 100)
                if item['fields']['uid'][0] in social_sensor_set:
                    temp.append(1)
                else:
                    temp.append(0)
                user_detail_info.append(temp)
    # sort
    if user_detail_info:
        user_detail_info = sorted(user_detail_info, key=lambda x: x[6], reverse=True)
    else:
        user_detail_info = []

    if out_portrait_users_list:
        profile_results = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids": out_portrait_users_list})["docs"]
        bci_index = "bci_" + ts2datetime(ts - DAY).replace('-', '')
        influence_results = es.mget(index=bci_index, doc_type="bci", body={"ids": out_portrait_users_list}, fields=["user_index"])['docs']
        bci_results = es_profile.mget(index="bci_history", doc_type="bci", body={"ids": out_portrait_users_list}, fields=['user_fansnum'])['docs']
        top_influence = get_top_all_influence("user_index", ts)
        count = 0
        if profile_results:
            for item in profile_results:
                temp = []
                if item['found']:
                    temp.append(item['_source']['uid'])
                    if item['_source']['nick_name']:
                        temp.append(item['_source']['nick_name'])
                    else:
                        temp.append(item['_source']['uid'])
                    temp.append(item['_source']['user_location'])
                    #temp.append(item['_source']['fansnum'])
                else:
                    temp.append(item['_id'])
                    temp.append(item['_id'])
                    temp.extend([''])
                try:
                    user_fansnum = bci_results[count]["fields"]["user_fansnum"][0]
                except:
                    user_fansnum = 0
                temp.append(user_fansnum)
                temp_influ = influence_results[count]
                if temp_influ.get('found', 0):
                    user_index = temp_influ['fields']['user_index'][0]
                    temp.append(math.log(user_index / float(top_influence) * 9 + 1, 10) * 100)
                else:
                    temp.append(0)
                count += 1
                out_user_detail_info.append(temp)

    print len(out_user_detail_info)
    if len(out_user_detail_info):
        print "sort"
        out_user_detail_info = sorted(out_user_detail_info, key=lambda x: x[4], reverse=True)

    revise_time_series = []
    for item in time_series:
        revise_time_series.append(ts2date_min(item))

    results['important_user_detail'] = user_detail_info
    results['out_portrait_user_detail'] = out_user_detail_info
    #results['burst_time'] = burst_time_list  # burst time points and the reason for each burst
    results['time_series'] = revise_time_series
    #results['positive_sentiment_list'] = positive_sentiment_list
    #results['negetive_sentiment_list'] = negetive_sentiment_list
    #results['neutral_sentiment_list'] = neutral_sentiment_list
    results['all_weibo_list'] = all_weibo_list
    results['origin_weibo_list'] = origin_weibo_list
    results['retweeted_weibo_list'] = retweeted_weibo_list
    #results['comment_weibo_count'] = comment_weibo_count
    #results['retweeted_weibo_count'] = retweeted_weibo_count
    #results['total_number_list'] = total_number_count
    results['social_sensors_detail'] = portrait_detail
    return results
def get_more_out(task_name, ts, user):
    results = dict()
    index_name = task_name
    _id = user + "-" + task_name
    task_detail = es.get(index=index_manage_sensing_task, doc_type=task_doc_type, id=_id)["_source"]
    task_name = task_detail['task_name']
    social_sensors = json.loads(task_detail['social_sensors'])
    history_status = json.loads(task_detail['history_status'])
    important_user_set = set()  # important users
    out_portrait_users = set()  # users not yet in the portrait library
    top_importance = get_top_influence("importance")
    top_influence = get_top_influence("influence")
    top_activeness = get_top_influence("activeness")
    ts = int(ts)
    time_series = history_status
    if time_series:
        flow_detail = es.mget(index=index_sensing_task, doc_type=_id, body={"ids": time_series})['docs']
    else:
        flow_detail = {}
    if flow_detail:
        for item in flow_detail:
            item = item['_source']
            timestamp = item['timestamp']
            temp_important_user_list = json.loads(item['important_users'])
            unfiltered_users = json.loads(item['unfilter_users'])
            temp_out_portrait_users = set(unfiltered_users) - set(temp_important_user_list)
            important_user_set = important_user_set | set(temp_important_user_list)
            out_portrait_users = out_portrait_users | set(temp_out_portrait_users)

    out_portrait_users_list = list(out_portrait_users)
    social_sensor_set = set(social_sensors)
    out_user_detail_info = []
    if out_portrait_users_list:
        out_portrait_users_list = out_portrait_users_list[:1000]
        profile_results = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids": out_portrait_users_list})["docs"]
        bci_index = "bci_" + ts2datetime(ts - DAY).replace('-', '')
        influence_results = es.mget(index=bci_index, doc_type="bci", body={"ids": out_portrait_users_list}, fields=["user_index"])['docs']
        bci_results = es_profile.mget(index="bci_history", doc_type="bci", body={"ids": out_portrait_users_list}, fields=['user_fansnum'])['docs']
        top_influence = get_top_all_influence("user_index", ts)
        count = 0
        if profile_results:
            for item in profile_results:
                temp = []
                if item['found']:
                    temp.append(item['_source']['uid'])
                    if item['_source']['nick_name']:
                        temp.append(item['_source']['nick_name'])
                    else:
                        temp.append(item['_source']['uid'])
                    temp.append(item['_source']['user_location'])
                else:
                    temp.append(item['_id'])
                    temp.append(item['_id'])
                    temp.extend([''])
                try:
                    user_fansnum = bci_results[count]["fields"]["user_fansnum"][0]
                except:
                    user_fansnum = 0  # fix: this read "user_fansnum - 0", a typo that raised NameError
                temp.append(user_fansnum)
                temp_influ = influence_results[count]
                if temp_influ.get('found', 0):
                    user_index = temp_influ['fields']['user_index'][0]
                    temp.append(math.log(user_index / float(top_influence) * 9 + 1, 10) * 100)
                else:
                    temp.append(0)
                count += 1
                out_user_detail_info.append(temp)
    if len(out_user_detail_info):
        out_user_detail_info = sorted(out_user_detail_info, key=lambda x: x[4], reverse=True)
    return out_user_detail_info