def get_community_userinfo(uid_list, core_uidlist, outer_uidlist):
    """Fetch profile info for community members and for users outside it.

    Returns a 3-tuple of JSON strings:
      (all community users, core community users, outside users).
    Each user dict carries uid, photo_url, nick_name, sex, friendsnum,
    fansnum, user_location ('' for every field when the profile is missing);
    community users additionally carry core_user (1/0).
    """
    core_uid_set = set(core_uidlist)  # O(1) membership test, built once

    user_list = []
    core_user = []
    user_result = es_user_profile.mget(index=profile_index_name,
                                       doc_type=profile_index_type,
                                       body={'ids': uid_list})['docs']
    for item in user_result:
        user_dict = _pack_community_profile(item)
        # Core-member flag. The original computed
        # len(set(uid.split()) & set(core_uidlist)) > 0, which for
        # whitespace-free ids is simply membership.
        if item['_id'] in core_uid_set:
            user_dict['core_user'] = 1
            core_user.append(user_dict)
        else:
            user_dict['core_user'] = 0
        user_list.append(user_dict)

    # Profiles of users outside the community.
    outeruser_result = es_user_profile.mget(index=profile_index_name,
                                            doc_type=profile_index_type,
                                            body={'ids': outer_uidlist})['docs']
    outer_userlist = [_pack_community_profile(item) for item in outeruser_result]

    return json.dumps(user_list), json.dumps(core_user), json.dumps(outer_userlist)


def _pack_community_profile(item):
    # Turn one ES mget doc into the flat user dict used by the community views.
    user_dict = {'uid': item['_id']}
    source = item['_source'] if item['found'] else {}
    for field in ('photo_url', 'nick_name', 'sex', 'friendsnum',
                  'fansnum', 'user_location'):
        user_dict[field] = source.get(field, '')
    return user_dict
def group_evaluate(xnr_user_no, nodes, all_influence, all_sensitive, G=None):
    """Evaluate a group of users: structural graph metrics on the induced
    subgraph plus influence/sensitivity statistics normalised (as a
    percentage) against the supplied global maxima.
    """
    sub_g = G.subgraph(nodes) if G else get_users(xnr_user_no, nodes)

    result = {
        'xnr_user_no': xnr_user_no,
        'nodes': nodes,
        'num': len(nodes),
        'density': round(nx.density(sub_g), 4),
        'cluster': round(nx.average_clustering(sub_g), 4),
        'transitivity': round(nx.transitivity(sub_g), 4),
    }

    def _weekly_values(index, doc_type, field):
        # One numeric weekly-average value per node; 0 when the doc is absent.
        docs = es_user_profile.mget(index=index, doc_type=doc_type,
                                    body={'ids': nodes}, fields=[field],
                                    _source=False)['docs']
        return [float(d['fields'][field][0]) if d['found'] else 0 for d in docs]

    influence_result = _weekly_values(influence_index, influence_type,
                                      'bci_week_ave')
    sensitive_result = _weekly_values(sensitive_index, sensitive_type,
                                      'sensitive_week_ave')

    result['max_influence'] = round(
        max(influence_result) / float(all_influence) * 100, 4)
    result['mean_influence'] = round(
        sum(influence_result) / len(influence_result) / float(all_influence) * 100, 4)
    result['mean_sensitive'] = round(
        sum(sensitive_result) / len(sensitive_result) / float(all_sensitive) * 100, 4)
    # Max sensitivity is capped at 100.
    max_sensitive = round(max(sensitive_result) / float(all_sensitive) * 100, 4)
    result['max_sensitive'] = min(max_sensitive, 100.0)
    return result
def get_user_profile(uid_list, specify_field=[]):
    """Batch-fetch user profiles as [uid, field1, field2, ...] rows.

    specify_field narrows the returned fields; when a profile is missing
    every field is '' except row[1], which falls back to the uid.
    """
    if not uid_list:
        return []
    default_fields = ["nick_name", "fansnum", "friendsnum", "photo_url",
                      'description', "statusnum", "sp_type", "user_location",
                      "create_at", "sex", "verified_type", "isreal",
                      "user_email"]
    fields = specify_field if specify_field else default_fields
    docs = es_user_profile.mget(index=profile_index_name,
                                doc_type=profile_index_type,
                                body={"ids": uid_list})['docs']
    results = []
    for doc in docs:
        row = [doc['_id']]
        if doc['found']:
            source = doc['_source']
            for field in fields:
                row.append(source[field])
        else:
            row.extend([''] * len(fields))
            row[1] = doc['_id']  # first column falls back to the uid
        results.append(row)
    return results
def es_mget_source(ids):
    """Return the _source dict of every found document among *ids*.

    Missing ids are silently skipped. Errors from the ES client propagate
    to the caller (the original `except Exception as e: raise e` wrapper
    only destroyed the traceback in Python 2, so it was removed).
    """
    result = es.mget(index=INDEX_NAME, doc_type=DOC_TYPE, body={'ids': ids})
    # truthiness instead of `is True` on a JSON-decoded boolean
    return [item['_source'] for item in result['docs'] if item['found']]
def get_profile_information(uid_list):
    """Map uid -> profile dict for every doc in the profile index.

    A missing field — or a whole missing document, which has no '_source'
    key — gets a default: "unknown" for uname, 0 for everything else
    (the original if/elif ladder assigned 0 in every non-uname branch).
    """
    result_dict = dict()
    search_result = es.mget(index=index_name, doc_type=index_type,
                            body={'ids': uid_list}, _source=True)['docs']
    for item in search_result:
        user_dict = {}
        for field in fields_dict:
            try:
                user_dict[field] = item['_source'][fields_dict[field]]
            except KeyError:  # narrowed from a bare except that hid real bugs
                user_dict[field] = "unknown" if field == 'uname' else 0
        result_dict[item['_id']] = user_dict
    return result_dict
def get_user_info(uid_list):
    """Fetch basic profile info for each uid; returns a JSON list of dicts.

    A missing profile yields '' for every field except uid.
    (The unused `core_user` accumulator from the original was removed.)
    """
    fields = ('photo_url', 'nick_name', 'sex', 'friendsnum', 'fansnum',
              'user_location')
    user_result = es_user_profile.mget(index=profile_index_name,
                                       doc_type=profile_index_type,
                                       body={'ids': uid_list})['docs']
    user_list = []
    for item in user_result:
        user_dict = {'uid': item['_id']}
        source = item['_source'] if item['found'] else {}
        for field in fields:
            user_dict[field] = source.get(field, '')
        user_list.append(user_dict)
    return json.dumps(user_list)
def get_profile_information(uid_list):
    """Map uid -> profile dict; every missing field falls back to 0.

    NOTE(review): the fallback for 'uname' is also 0 here, unlike the
    sibling variant that uses "unknown" — kept to preserve behavior.
    """
    result_dict = dict()
    search_result = es.mget(index=index_name, doc_type=index_type,
                            body={'ids': uid_list}, _source=True)['docs']
    for item in search_result:
        user_dict = {}
        for field in fields_dict:
            try:
                user_dict[field] = item['_source'][fields_dict[field]]
            except KeyError:  # narrowed from a bare except
                # The original if/elif ladder assigned 0 in every branch,
                # so it collapses to a single default.
                user_dict[field] = 0
        result_dict[item['_id']] = user_dict
    return result_dict
def get_profile_information(uid_list):
    """Map uid -> profile dict.

    Fallbacks when a field (or the whole document) is missing:
    counters and gender -> 0, uname -> the uid itself, anything else -> ''.
    """
    zero_fields = ('statusnum', 'fansnum', 'friendsnum', 'gender')
    docs = es.mget(index=index_name, doc_type=index_type,
                   body={'ids': uid_list}, _source=True)['docs']
    result_dict = {}
    for doc in docs:
        profile = {}
        for field in fields_dict:
            try:
                profile[field] = doc['_source'][fields_dict[field]]
            except:
                if field in zero_fields:
                    profile[field] = 0
                elif field == 'uname':
                    profile[field] = doc['_id']
                else:
                    profile[field] = ''
        result_dict[doc['_id']] = profile
    return result_dict
def main():
    """Dump bci and sensitive history records for every uid listed in
    uid_list_0520.txt, one JSON object per output line.
    """
    with open("uid_list_0520.txt", 'rb') as f:
        uid_list = [line.strip() for line in f]
    bci_results = es_user_profile.mget(index="bci_history", doc_type="bci",
                                       body={"ids": uid_list})["docs"]
    sen_results = es_user_profile.mget(index="sensitive_history",
                                       doc_type="sensitive",
                                       body={"ids": uid_list})["docs"]
    # The original duplicated this dump loop for both result sets.
    _dump_found_sources(bci_results, "bci_history.txt")
    _dump_found_sources(sen_results, "sen_history.txt")


def _dump_found_sources(docs, path):
    # Write each found document's _source as one JSON line.
    with open(path, 'wb') as out:
        for item in docs:
            if item['found']:
                out.write(json.dumps(item['_source']) + "\n")
def main():
    """Export bci and sensitive history for the uids in uid_list_0520.txt,
    writing one JSON object per line to the matching output file.
    """
    uid_list = []
    with open("uid_list_0520.txt", 'rb') as f:
        for line in f:
            uid_list.append(line.strip())

    # Fetch both result sets first, then write, matching the original order
    # of side effects (mget, mget, write bci, write sen).
    bci_docs = es_user_profile.mget(index="bci_history", doc_type="bci",
                                    body={"ids": uid_list})["docs"]
    sen_docs = es_user_profile.mget(index="sensitive_history",
                                    doc_type="sensitive",
                                    body={"ids": uid_list})["docs"]
    for docs, out_path in ((bci_docs, "bci_history.txt"),
                           (sen_docs, "sen_history.txt")):
        with open(out_path, 'wb') as out:
            for doc in docs:
                if doc['found']:
                    out.write(json.dumps(doc['_source']) + "\n")
def search_hot_mid(task_name, ts):
    """Find "hot" root weibos posted before *ts* (aggregation buckets with
    at least 500 documents) and return their details, each decorated with
    retweet/comment counts and the author's profile fields.
    """
    query_body = {
        "query": {
            "range": {
                "timestamp": {
                    "lt": ts
                }
            }
        },
        "aggs": {
            "hot_mid": {
                # top-100 most frequent root_mid values
                "terms": {"field": "root_mid", "size": 100}
            }
        }
    }
    mid_list = []
    return_list = []  # return hot mid
    uid_list = []
    es_results = es.search(index=task_name, doc_type="text",
                           body=query_body)["aggregations"]["hot_mid"]["buckets"]
    for item in es_results:
        # keep only buckets with >= 500 matching weibos
        if item["doc_count"] >= 500:
            mid_list.append(item["key"])
    if mid_list:
        weibo_results = es.mget(index=task_name, doc_type="text",
                                body={"ids": mid_list})["docs"]
        for item in weibo_results:
            if item["found"]:
                mid = item["_id"]
                retweet, comment = search_retweet_comment(task_name, mid)
                detail = item["_source"]
                detail["retweet"] = retweet
                detail["comment"] = comment
                # uid_list and return_list are appended in lockstep, so the
                # profile lookup below can address them by the same index
                uid_list.append(detail["uid"])
                return_list.append(detail)
    if uid_list:
        profile_results = es_user_profile.mget(index=profile_index_name,
                                               doc_type=profile_index_type,
                                               body={"ids": uid_list})["docs"]
        # mget preserves input order, so profile_results is parallel to
        # uid_list and hence to return_list
        for i in range(len(uid_list)):
            detail = profile_results[i]
            if detail["found"]:
                return_list[i]["uname"] = detail["_source"]["nick_name"]
                return_list[i]["photo_url"] = detail["_source"]["photo_url"]
                return_list[i]["fansnum"] = detail["_source"]["fansnum"]
                return_list[i]["statusnum"] = detail["_source"]["statusnum"]
            else:
                # no profile: fall back to the uid as display name
                return_list[i]["uname"] = detail["_id"]
                return_list[i]["photo_url"] = ""
                return_list[i]["fansnum"] = ""
                return_list[i]["statusnum"] = ""
    return return_list
def get_user_name(uid_list):
    """Map each uid to its nick_name from the profile index.

    A uid with no profile document maps to itself; if the ES lookup fails
    entirely, an empty mapping is returned (best-effort behavior kept from
    the original).
    """
    try:
        portrait_exist_result = es_user_profile.mget(
            index=profile_index_name, doc_type=profile_index_type,
            body={'ids': uid_list})['docs']
    except Exception:
        portrait_exist_result = []
    # (removed the debug print that dumped the entire raw ES response)
    uid2uname_dict = {}
    for portrait_item in portrait_exist_result:
        uid = portrait_item['_id']
        if portrait_item['found']:
            uid2uname_dict[uid] = portrait_item['_source']['nick_name']
        else:
            uid2uname_dict[uid] = uid
    return uid2uname_dict
def get_profile_information(uid_list):
    """Map uid -> profile dict; any missing field falls back to ''.

    (The unused `iter_count` counter from the original was removed.)
    """
    result_dict = dict()
    search_result = es.mget(index=index_name, doc_type=index_type,
                            body={'ids': uid_list}, _source=True)['docs']
    for item in search_result:
        user_dict = {}
        for field in fields_dict:
            try:
                user_dict[field] = item['_source'][fields_dict[field]]
            except KeyError:  # narrowed from a bare except
                user_dict[field] = ''
        result_dict[item['_id']] = user_dict
    return result_dict
def get_future_user(uid_list):
    """Return [uid, nick_name, photo_url, fansnum, statusnum] rows for the
    given uids; a missing profile yields [uid, uid, '', '', ''].
    """
    if not uid_list:
        return []
    docs = es_user_profile.mget(index=profile_index_name,
                                doc_type=profile_index_type,
                                body={"ids": uid_list})["docs"]
    rows = []
    for doc in docs:
        uid = doc["_id"]
        if doc["found"]:
            src = doc["_source"]
            rows.append([uid, src["nick_name"], src["photo_url"],
                         src["fansnum"], src["statusnum"]])
        else:
            rows.append([uid, uid, "", "", ""])
    return rows
def search_attention(uid): stat_results = dict() results = dict() for db_num in R_DICT: r = R_DICT[db_num] ruid_results = r.hgetall('retweet_'+str(uid)) if ruid_results: for ruid in ruid_results: if ruid != uid: try: stat_results[ruid] += ruid_results[ruid] except: stat_results[ruid] = ruid_results[ruid] # print 'results:', stat_results if not stat_results: return [None, 0] try: sort_state_results = sorted(stat_results.items(), key=lambda x:x[1], reverse=True)[:20] except: return [None, 0] print 'sort_state_results:', sort_state_results uid_list = [item[0] for item in sort_state_results] es_profile_results = es_user_profile.mget(index='weibo_user', doc_type='user', body={'ids':uid_list})['docs'] es_portrait_results = es_user_portrait.mget(index='user_portrait', doc_type='user', body={'ids':uid_list})['docs'] result_list = dict() for i in range(len(es_profile_results)): item = es_profile_results[i] uid = item['_id'] try: source = item['_source'] uname = source['nick_name'] except: uname = u'未知' # identify uid is in the user_portrait portrait_item = es_portrait_results[i] try: source = portrait_item[i] in_status = 1 except: in_status = 0 result_list[uid] = [uid,[uname, stat_results[uid], in_status]] return [result_list, len(stat_results)]
def search_attention(uid): stat_results = dict() results = dict() for db_num in R_DICT: r = R_DICT[db_num] ruid_results = r.hgetall("retweet_" + str(uid)) if ruid_results: for ruid in ruid_results: if ruid != uid: try: stat_results[ruid] += ruid_results[ruid] except: stat_results[ruid] = ruid_results[ruid] # print 'results:', stat_results if not stat_results: return [None, 0] try: sort_state_results = sorted(stat_results.items(), key=lambda x: x[1], reverse=True)[:20] except: return [None, 0] print "sort_state_results:", sort_state_results uid_list = [item[0] for item in sort_state_results] es_profile_results = es_user_profile.mget(index="weibo_user", doc_type="user", body={"ids": uid_list})["docs"] es_portrait_results = es_user_portrait.mget(index="user_portrait", doc_type="user", body={"ids": uid_list})["docs"] result_list = dict() for i in range(len(es_profile_results)): item = es_profile_results[i] uid = item["_id"] try: source = item["_source"] uname = source["nick_name"] except: uname = u"未知" # identify uid is in the user_portrait portrait_item = es_portrait_results[i] try: source = portrait_item[i] in_status = 1 except: in_status = 0 result_list[uid] = [uid, [uname, stat_results[uid], in_status]] return [result_list, len(stat_results)]
def search_profile(uid_list):
    """Look up each uid's topic_string in user_portrait_1222.

    Input: a list of uids. Returns {utf-8 uid: topic_string or ''}.
    """
    docs = es_user_profile.mget(index='user_portrait_1222', doc_type='user',
                                body={'ids': uid_list})['docs']
    result_list = dict()
    for doc in docs:
        key = doc['_id'].encode('utf-8')
        result_list[key] = doc['_source']['topic_string'] if doc['found'] else ''
    return result_list
def search_user_type(uid_list, org_type_list=()):
    """Split uids into personal accounts and organisation accounts by
    their verified_type field.

    org_type_list: verified_type values that count as organisations.
    NOTE(review): the original compared verified_type against the (always
    empty) list of org ids being built, so every uid was classified as
    personal; the new parameter defaults to empty, preserving that
    behavior until callers supply the real organisation type codes.
    """
    type_list = es_user_profile.mget(index=profile_index_name,
                                     doc_type=profile_index_type,
                                     body={'ids': uid_list}, _source=False,
                                     fields=['id', 'verified_type'])['docs']
    user_list = []
    org_list = []
    for doc in type_list:
        if not doc['found']:
            user_list.append(doc['_id'])
            continue
        fields = doc.get('fields', {})
        # BUG FIX: verified_type lives under doc['fields']; the original
        # has_key check looked on the doc itself and therefore always failed.
        if 'verified_type' not in fields:
            user_list.append(doc['_id'])
            continue
        verified_type = fields['verified_type'][0]
        if verified_type in org_type_list:
            org_list.append(doc['_id'])
        else:
            user_list.append(doc['_id'])
    return user_list, org_list
def get_user_profile(uid_list, specify_field=[]):
    """Batch-fetch user profiles as [uid, field1, field2, ...] rows.

    specify_field narrows the returned fields. Missing profiles yield ''
    for every requested field.
    """
    if not uid_list:
        return []
    default_fields = ["nick_name", "fansnum", "friendsnum", "photo_url",
                      'description', "statusnum", "sp_type", "user_location",
                      "create_at", "sex", "verified_type", "isreal",
                      "user_email"]
    fields = specify_field if specify_field else default_fields
    search_results = es_user_profile.mget(index=profile_index_name,
                                          doc_type=profile_index_type,
                                          body={"ids": uid_list})['docs']
    results = []
    for item in search_results:
        row = [item['_id']]
        if item['found']:
            for field in fields:
                row.append(item['_source'][field])
        else:
            # BUG FIX: the original always padded with len(default field
            # list) blanks, producing over-long rows when specify_field
            # was given.
            row.extend([''] * len(fields))
        results.append(row)
    return results
def search_profile(uid_list):
    """Fetch topic_string for each uid from user_portrait_1222.

    Returns a dict keyed by the utf-8 encoded uid; the value is '' when no
    portrait document exists for that uid.
    """
    result_list = dict()
    for item in es_user_profile.mget(index='user_portrait_1222',
                                     doc_type='user',
                                     body={'ids': uid_list})['docs']:
        key = item['_id'].encode('utf-8')
        if item['found']:
            result_list[key] = item['_source']['topic_string']
        else:
            result_list[key] = ''
    return result_list
def reducer():
    """Drain the 'update_bci_list' redis queue, updating bci history stats
    for each popped batch of users, until the queue is empty.
    """
    count = 0
    ts = time.time()
    while 1:
        user_set = r_flow.rpop('update_bci_list')
        # (removed `bulk_action = []`: it was rebuilt every pass, never used)
        if user_set:
            items = json.loads(user_set)
            uid_list = [item['id'] for item in items]
            if uid_list:
                search_results = es.mget(index=BCIHIS_INDEX_NAME,
                                         doc_type=BCIHIS_INDEX_TYPE,
                                         body={"ids": uid_list})["docs"]
                cal_num_for_bci_history(uid_list, items, search_results)
                count += len(uid_list)
                if count % 10000 == 0:
                    te = time.time()
                    #print "count: %s, cost time: %s" %(count, te-ts)
                    ts = te
        else:
            print(count)
            break
def get_profile_information(uid_list):
    """Map uid -> profile dict.

    Fallbacks for missing fields: 0 for the numeric counters and gender,
    u'unknown' for uname, 'unknown' for everything else.
    """
    zero_default = ('statusnum', 'fansnum', 'friendsnum', 'gender')
    result_dict = dict()
    search_result = es.mget(index=index_name, doc_type=index_type,
                            body={'ids': uid_list}, _source=True)['docs']
    for item in search_result:
        user_dict = {}
        for field in fields_dict:
            try:
                user_dict[field] = item['_source'][fields_dict[field]]
            except KeyError:  # narrowed from a bare except
                if field in zero_default:
                    user_dict[field] = 0
                elif field == 'uname':
                    user_dict[field] = u'unknown'
                else:
                    user_dict[field] = 'unknown'
        result_dict[item['_id']] = user_dict
    return result_dict
def get_topic_top_user(topic_top):
    """For each topic in topic_top, return [uid, uname, photo_url] triples
    for its top users.

    NOTE(review): the per-topic user list is currently a hard-coded test
    fixture indexed by the topic's position — presumably temporary
    scaffolding to be replaced with real data.
    """
    index_name = 'weibo_user'
    index_type = 'user'
    #test user list
    test_user_list = [['1499104401', '1265965213', '3270699555', '2073915493', '1686474312'],
                      ['2803301701', '2105426467', '1665372775', '3716504593', '2892376557'],
                      ['1457530250', '1698513182', '2793591492', '2218894100', '1737961042'],
                      ['1656818110', '1660127070', '1890124610', '1182391230', '1243861100'],
                      ['1680430844', '2998045524', '2202896360', '1639498782', '3494698730'],
                      ['2587093162', '1677675054', '1871767009', '1193111400', '1672418622'],
                      ['1730726640', '1752502540', '1868725480', '1262486750', '1235733080'],
                      ['1250041100', '2275231150', '1268642530', '1658606270', '1857599860'],
                      ['1929496477', '2167425990', '1164667670', '2417139911', '1708853044'],
                      ['1993292930', '1645823930', '1890926610', '1641561810', '2023833990'],
                      ['2005471590', '1233628160', '2074684140', '1396715380', '1236762250'],
                      ['1423592890', '2612799560', '1926127090', '2684951180', '1760607220']]
    result = {}
    topic_user = {}
    for count, item in enumerate(topic_top):
        topic = item[0]
        #test
        user_list = test_user_list[count]
        topic_users = []
        profile_result = es_user_profile.mget(index=index_name,
                                              doc_type=index_type,
                                              body={'ids': user_list})['docs']
        for profile in profile_result:
            try:
                uname = profile['_source']['nick_name']
                photo_url = profile['_source']['photo_url']
            except:
                uname = 'unknown'
                photo_url = 'unknown'
            topic_users.append([profile['_id'], uname, photo_url])
        result[topic] = topic_users
    return result
def reducer():
    """Pop batches off the 'update_bci_list' redis queue and feed each one
    to cal_num_for_bci_history until the queue runs dry.
    """
    count = 0
    ts = time.time()
    while 1:
        user_set = r_flow.rpop('update_bci_list')
        # dropped the unused `bulk_action = []` created on every iteration
        if user_set:
            items = json.loads(user_set)
            uid_list = [item['id'] for item in items]
            if uid_list:
                search_results = es.mget(index=BCIHIS_INDEX_NAME,
                                         doc_type=BCIHIS_INDEX_TYPE,
                                         body={"ids": uid_list})["docs"]
                cal_num_for_bci_history(uid_list, items, search_results)
                count += len(uid_list)
                if count % 10000 == 0:
                    te = time.time()
                    #print "count: %s, cost time: %s" %(count, te-ts)
                    ts = te
        else:
            print(count)
            break
def get_domain_top_user(domain_top):
    """For each domain in domain_top, return [uid, uname, photo_url] triples
    for its top users.

    NOTE(review): like its topic twin, this uses a hard-coded test fixture
    (indexed by the domain's position) instead of real per-domain data.
    """
    index_name = 'weibo_user'
    index_type = 'user'
    #test user list
    test_user_list = [['2803301701', '1639498782', '2656274875', '1402977920', '3114175427'],
                      ['3575186384', '1316683401', '1894603174', '1641542052', '1068248497'],
                      ['1729736051', '1396715380', '2377610962', '1828183230', '2718018210'],
                      ['1250748474', '3270699555', '1417037145', '1193111400', '1403915120'],
                      ['1671342103', '1255849511', '1647497355', '1989660417', '1189729754'],
                      ['1182391231', '1670071920', '1689618340', '1494850741', '1708942053'],
                      ['3400918220', '2685504141', '2056115850', '1768001547', '3317008062'],
                      ['2001627641', '1773489534', '2458194884', '1822155333', '1799201635'],
                      ['1709157165', '2074370833', '2167425990', '3204839810', '3690518992'],
                      ['1664065962', '3299094722', '1942531237', '2799434700', '1784404677'],
                      ['1218353337', '1761179351', '3482911112', '1220291284', '2504433601'],
                      ['3682473195', '1627673351', '1779065471', '3316144700', '1896701827']]
    result = {}
    domain_user = {}
    for count, item in enumerate(domain_top):
        domain = item[0]
        #test
        user_list = test_user_list[count]
        domain_users = []
        profile_result = es_user_profile.mget(index=index_name,
                                              doc_type=index_type,
                                              body={'ids': user_list})['docs']
        for profile in profile_result:
            try:
                uname = profile['_source']['nick_name']
                photo_url = profile['_source']['photo_url']
            except:
                uname = 'unknown'
                photo_url = 'unknown'
            domain_users.append([profile['_id'], uname, photo_url])
        result[domain] = domain_users
    return result
# NOTE(review): this is a fragment of a larger routine — it references
# uid_list / index_name / sensitive_string defined elsewhere, and it is cut
# off after `if bci_source:` below. Kept byte-identical; only comments added.

# Query for the single highest sensitivity score of the day (used as the
# normalisation ceiling; falls back to 400 on any ES error).
query_sensitive_body = {
    "query": {
        "match_all": {}
    },
    "size": 1,
    "sort": {sensitive_string: {"order": "desc"}}
}
try:
    top_sensitive_result = es_bci_history.search(index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX, body=query_sensitive_body, _source=False, fields=[sensitive_string])['hits']['hits']
    top_sensitive = top_sensitive_result[0]['fields'][sensitive_string][0]
except Exception, reason:
    print Exception, reason
    top_sensitive = 400  # fallback ceiling when the lookup fails
index_type = 'bci'
# Batch-fetch everything needed per uid; all four result lists are parallel
# to uid_list because mget preserves input order.
user_bci_result = es_cluster.mget(index=index_name, doc_type=index_type, body={'ids':uid_list}, _source=True)['docs']
user_profile_result = es_user_profile.mget(index='weibo_user', doc_type='user', body={'ids':uid_list}, _source=True)['docs']
bci_history_result = es_bci_history.mget(index=bci_history_index_name, doc_type=bci_history_index_type, body={"ids":uid_list}, fields=['user_fansnum', 'weibo_month_sum'])['docs']
sensitive_history_result = es_bci_history.mget(index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX, body={'ids':uid_list}, fields=[sensitive_string], _source=False)['docs']
max_evaluate_influ = get_evaluate_max(index_name)
for i in range(0, len(uid_list)):
    uid = uid_list[i]
    bci_dict = user_bci_result[i]
    profile_dict = user_profile_result[i]
    bci_history_dict = bci_history_result[i]
    sensitive_history_dict = sensitive_history_result[i]
    #print sensitive_history_dict
    try:
        bci_source = bci_dict['_source']
    except:
        bci_source = None
    if bci_source:
def main():
    """Recompute per-user sensitivity scores for one day and bulk-write the
    updated records into the sensitive-history ES index.

    Reads the day's per-uid sensitive-word hashes from redis (hscan), scores
    each word via the sensitive_words stage table, merges with any existing
    history document, and flushes to ES in batches of 1000.
    """
    if RUN_TYPE:
        now_ts = time.time() - DAY  # the previous day
        ts = str(datetime2ts(ts2datetime(now_ts)))
    else:
        ts = str(datetime2ts('2016-03-16'))
        now_ts = int(ts)
    print now_ts
    sensitive_string = "sensitive_" + ts
    date_string = ts
    update_sensitive_key = "sensitive_score_" + ts  # key being updated
    sensitive_dict_key = "sensitive_dict_" + ts
    sensitive_string_key = "sensitive_string_" + ts
    sensitive_day_change_key = "sensitive_" + ts + "_day_change"
    del_month = datetime2ts(ts2datetime(now_ts - MONTH))
    del_sensitive_key = "sensitive_score_" + str(del_month)  # key to delete (older than a month)
    former_ts = int(ts) - DAY
    former_date = str(datetime2ts(ts2datetime(former_ts)))
    former_sensitive_key = "sensitive_score_" + former_date
    iter_count = 0
    bulk_action = []
    mappings(ES_SENSITIVE_INDEX)
    total_number = r.hlen(sensitive_string)
    scan_cursor = 0
    print total_number
    while 1:
        re_scan = r.hscan(sensitive_string, scan_cursor, count=1000)
        scan_cursor = re_scan[0]
        if len(re_scan[1]) != 0:
            sensitive_info = re_scan[1]  # dict form: uid -> sensitive_words_dict (JSON string)
            uid_list = sensitive_info.keys()
            sensitive_results = es.mget(index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX, body={"ids": uid_list})['docs']
            if sensitive_results:
                for item in sensitive_results:
                    uid = item['_id']
                    sensitive_words_dict = json.loads(sensitive_info[uid])  # json.loads
                    # Score = sum over words of (count * stage weight).
                    current_sensitive_score = 0
                    for k, v in sensitive_words_dict.iteritems():
                        tmp_stage = r_sensitive.hget("sensitive_words", k)
                        if tmp_stage:
                            current_sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
                    if item['found']:  # prior history exists for this uid
                        revise_item = item["_source"]
                        if del_sensitive_key in revise_item:
                            # NOTE(review): `item` is a dict — dicts have no
                            # .remove(); this raises AttributeError whenever
                            # the month-old key is present. Probably meant
                            # revise_item.pop(del_sensitive_key, None).
                            item.remove(del_sensitive_key)
                        revise_item['uid'] = uid
                        # newly updated sensitivity score
                        revise_item[update_sensitive_key] = current_sensitive_score
                        # newly updated sensitive-word dict
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        # newly updated word string
                        revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys())
                        # deltas vs the previous day and the week/month means
                        revise_item['sensitive_day_change'] = current_sensitive_score - revise_item.get(former_sensitive_key, 0)
                        revise_item['sensitive_week_change'] = current_sensitive_score - revise_item.get('sensitive_week_ave', 0)
                        revise_item['sensitive_month_change'] = current_sensitive_score - revise_item.get('sensitive_month_ave', 0)
                        # recomputed week/month mean, variance and sum
                        revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
                        # NOTE(review): 'senstiive_month_ave' is a typo for
                        # 'sensitive_month_ave' — left as-is because other
                        # code may read the misspelled key from ES.
                        revise_item['senstiive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)
                    else:
                        # first record for this uid: deltas equal the score itself
                        revise_item = dict()
                        revise_item['uid'] = uid
                        revise_item[update_sensitive_key] = current_sensitive_score
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys())
                        revise_item['sensitive_day_change'] = current_sensitive_score
                        revise_item['sensitive_week_change'] = current_sensitive_score
                        revise_item['sensitive_month_change'] = current_sensitive_score
                        revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
                        revise_item['senstiive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)
                    action = {'index': {'_id': uid}}
                    bulk_action.extend([action, revise_item])
                    iter_count += 1
                    if iter_count % 1000 == 0:
                        # flush a batch of 1000 updates
                        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
                        bulk_action = []
                        print iter_count
        if int(scan_cursor) == 0:
            # hscan cursor of 0 means the scan is complete
            break
    if bulk_action:
        # flush the final partial batch
        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
    print iter_count
def key_words_search(task_id, search_type, pre, during, start_time, keyword_list, search_key='', sort_norm='', sort_scope='', time=1, isall=False, number=100):
    """Run a keyword (or hashtag) search over *during* days of flow-text
    indices, split the matched authors into in-portrait / out-of-portrait
    sets, rank them, and store the result back on the task document.

    Returns "1" on completion. NOTE(review): the parameter `time` shadows
    any time module import; assumed intentional here — verify at call sites.
    """
    number = int(number)
    # one should-clause per keyword: hashtag prefix or free-text wildcard
    should = []
    for key in keyword_list:
        if search_type == "hashtag":
            should.append({"prefix": {"text": "#" + key + "#"}})
        else:
            should.append({"wildcard": {"text": "*" + key + "*"}})
    # collect the daily indices that actually exist in the window
    index_list = []
    date = ts2datetime(start_time)
    index_name = pre + date
    while during:
        if es_flow_text.indices.exists(index=index_name):
            index_list.append(index_name)
        start_time = start_time + DAY
        date = ts2datetime(start_time)
        index_name = pre + date
        during -= 1
    print index_list
    uid_set = set()
    text_results = []
    query_body = {
        "query": {
            "bool": {
                "must": should
            }
        },
        "sort": {
            "user_fansnum": {
                "order": "desc"
            }
        },
        "size": 5000
    }
    results = es_flow_text.search(index=index_list, doc_type='text', body=query_body, _source=False, fields=["uid", "user_fansnum", "text", "message_type", "sentiment", "timestamp", "geo", "retweeted", "comment"])["hits"]["hits"]
    # deduplicate authors, remembering the position of each author's first
    # (highest-fansnum) weibo in `results` (index_list is reused for that)
    id_index = 0
    index_list = []
    un_uid_list = []
    for item in results:
        if item['fields']['uid'][0] not in uid_set:
            uid_set.add(item['fields']['uid'][0])
            un_uid_list.append(item['fields']['uid'][0])
            index_list.append(id_index)
        id_index += 1
    uid_list = []
    print "un_uid_list: ", len(un_uid_list)
    portrait_list = []
    count = 0
    in_index = 0
    if not isall and un_uid_list:  # restrict to users inside the portrait library
        portrait_results = es_user_portrait.mget(index=USER_INDEX_NAME, doc_type=USER_INDEX_TYPE, body={"ids": un_uid_list}, _source=False, fields=['uname'])["docs"]
        for item in portrait_results:
            if item["found"]:
                portrait_list.append(item['_id'])
                nick_name = item['fields']['uname'][0]
                if nick_name == 'unknown':
                    nick_name = item['_id']
                index = index_list[in_index]
                weibo_url = weiboinfo2url(results[index]['fields']['uid'][0], results[index]['_id'])
                # NOTE(review): extend() flattens this row into text_results,
                # unlike the append() used in the branch below — verify which
                # shape downstream consumers expect.
                text_results.extend([results[index]['fields']['uid'][0], results[index]['fields']['user_fansnum'][0], results[index]['fields']['text'][0], results[index]['fields']['message_type'][0], results[index]['fields']['sentiment'][0], ts2date(results[index]['fields']['timestamp'][0]), results[index]['fields']['geo'][0], results[index]['fields']['retweeted'][0], results[index]['fields']['comment'][0], nick_name, weibo_url])
                count += 1
                if count == number:
                    break
                print "portrait_len, ", len(portrait_list)
            in_index += 1
        if portrait_list:
            uid_list = in_sort_filter(time, sort_norm, sort_scope, None, portrait_list, True, number)  # sort
    elif un_uid_list:
        # whole-dataset path: fall back to the raw profile index for names
        profile_result = es_user_profile.mget(index="weibo_user", doc_type="user", body={"ids": un_uid_list}, fields=['nick_name'])["docs"]
        for i in range(len(profile_result)):
            index = index_list[i]
            try:
                nick_name = profile_result[i]['fields']['nick_name'][0]
            except:
                nick_name = un_uid_list[i]
            item = results[index]
            weibo_url = weiboinfo2url(item['fields']['uid'][0], results[index]['_id'])
            text_results.append([item['fields']['uid'][0], item['fields']['user_fansnum'][0], item['fields']['text'][0], item['fields']['message_type'][0], item['fields']['sentiment'][0], ts2date(item['fields']['timestamp'][0]), results[index]['fields']['geo'][0], results[index]['fields']['retweeted'][0], results[index]['fields']['comment'][0], nick_name, weibo_url])
            if i == number:
                break
        uid_list = all_sort_filter(un_uid_list[:number], sort_norm, time, True, number)
    print "filter_uid_list: ", len(uid_list)
    if uid_list:
        results = make_up_user_info(uid_list, isall, time, sort_norm)
    else:
        results = []
    print "results: ", len(results)
    # update the task's status/result document
    task_detail = es_user_portrait.get(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE, id=task_id)
    item = task_detail['_source']
    item['status'] = 1
    item['result'] = json.dumps(results)
    item['text_results'] = json.dumps(text_results)
    item['number'] = len(results)
    es_user_portrait.index(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE, id=task_id, body=item)
    return "1"
def get_retweet_weibo_detail(ts, size, text_type, type_value):
    """List recent weibos (within time_interval before *ts*) that repost any
    of the task's tracked root weibos, optionally filtered by message_type
    or sentiment, each decorated with the author's nick_name/photo_url.

    Returns rows of [uid, nick_name, photo_url, text, sentiment, date, geo,
    message_type]. NOTE(review): *size* is unused — the query size is a
    hard-coded 100.
    """
    task_detail = es_prediction.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    origin_weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    retweeted_weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])
    # track reposts of both origin and retweeted root weibos
    mid_list = []
    mid_list.extend(origin_weibo_detail.keys())
    mid_list.extend(retweeted_weibo_detail.keys())
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"range": {
                                "timestamp": {
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }},
                            {"terms": {"root_mid": mid_list}}
                        ]
                    }
                }
            }
        },
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100
    }
    if text_type == "message_type":
        query_body['query']['filtered']['filter']['bool']['must'].append({"term": {text_type: type_value}})
    if text_type == "sentiment":
        #if isinstance(type_value, str):
        # single value -> term filter, multiple values -> terms filter
        if len(type_value) == 1:
            query_body['query']['filtered']['filter']['bool']['must'].append({"term": {text_type: type_value}})
        else:
            query_body['query']['filtered']['filter']['bool']['must'].append({"terms": {text_type: type_value}})
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)
    # 1. query the weibos: the window either lies in one daily index or
    # starts in the previous day's index
    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    elif datetime != datetime_1 and exist_es_1:
        search_results = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []
    #print search_results
    # 2. fetch user info for each weibo's author
    results = []
    uid_list = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids": uid_list}, fields=['nick_name', 'photo_url'])["docs"]
            # portrait_result is parallel to uid_list / search_results
            for i in range(len(uid_list)):
                item = search_results[i]['_source']
                temp = []
                # uid, nick_name, photo_url, text, sentiment, timestamp, geo, common_keywords, message_type
                temp.append(item['uid'])
                if portrait_result[i]['found']:
                    temp.append(portrait_result[i]["fields"]["nick_name"][0])
                    temp.append(portrait_result[i]["fields"]["photo_url"][0])
                else:
                    # no portrait: uid stands in for the name, photo empty
                    temp.append(item['uid'])
                    temp.append("")
                temp.append(item["text"])
                #print item['text']
                temp.append(item["sentiment"])
                temp.append(ts2date(item['timestamp']))
                temp.append(item['geo'])
                temp.append(item["message_type"])
                results.append(temp)
    return results
def get_origin_weibo_detail(ts, size, order, message_type=1):
    """Return grouped, sorted detail rows for a sensing task's weibos.

    ``message_type`` selects which stored detail dict to use (1 origin,
    2 retweeted, otherwise sensitive).  Rows are sorted by ``order``
    ("total"/"retweeted"/"comment"), enriched with text and user-profile
    data, then near-duplicate weibos (per the task's ``duplicate_dict``)
    are merged into the group of the highest-ranked representative.

    Returns a list of groups; each group is a list of rows
    [uid, nick_name, photo_url, text, sentiment, date, geo, message_type,
     retweeted, comment, sensitive, timestamp, topic_value, mid].

    Fix: the duplicate-merge lookup used ``mid_index_dict.get(mid, 0)``,
    which treats a legitimate index of 0 (the top-ranked weibo) as
    "missing"; replaced with a membership test.
    """
    task_detail = es_prediction.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']

    mid_value = json.loads(task_detail['mid_topic_value'])
    duplicate_dict = json.loads(task_detail['duplicate_dict'])
    # invert duplicate_dict: representative mid -> [duplicate mids..., itself]
    tmp_duplicate_dict = dict()
    for k, v in duplicate_dict.iteritems():
        try:
            tmp_duplicate_dict[v].append(k)
        except:
            tmp_duplicate_dict[v] = [k, v]

    if message_type == 1:
        weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    elif message_type == 2:
        weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])
    else:
        weibo_detail = json.loads(task_detail['sensitive_weibo_detail'])
    weibo_detail_list = []
    if weibo_detail:
        for iter_mid, item in weibo_detail.iteritems():
            tmp = []
            tmp.append(iter_mid)
            tmp.append(item[iter_mid])     # total count stored under the mid key
            tmp.append(item['retweeted'])
            tmp.append(item['comment'])
            weibo_detail_list.append(tmp)
    mid_list = weibo_detail.keys()

    results = []
    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "terms":{"mid": mid_list}
                }
            }
        },
        "size": 1000,
        "sort": {"timestamp": {"order": "desc"}}
    }

    # query today's and yesterday's flow-text indices, if they exist
    index_list = []
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts-DAY)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        index_list.append(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)
    if exist_es_1:
        index_list.append(index_name_1)
    if index_list and mid_list:
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []

    uid_list = []
    text_dict = dict()      # mid -> weibo source (text info)
    portrait_dict = dict()  # uid -> {nick_name, photo_url} (background info)
    sort_results = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
            text_dict[item['_id']] = item['_source']  # _id is the mid
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids":uid_list}, fields=['nick_name', 'photo_url'])["docs"]
            for item in portrait_result:
                if item['found']:
                    portrait_dict[item['_id']] = {"nick_name": item["fields"]["nick_name"][0], "photo_url": item["fields"]["photo_url"][0]}
                else:
                    portrait_dict[item['_id']] = {"nick_name": item['_id'], "photo_url": ""}

        if order == "total":
            sorted_list = sorted(weibo_detail_list, key=lambda x: x[1], reverse=True)
        elif order == "retweeted":
            sorted_list = sorted(weibo_detail_list, key=lambda x: x[2], reverse=True)
        elif order == "comment":
            sorted_list = sorted(weibo_detail_list, key=lambda x: x[3], reverse=True)
        else:
            sorted_list = weibo_detail_list

        count_n = 0
        results_dict = dict()
        mid_index_dict = dict()
        for item in sorted_list:  # size
            mid = item[0]
            iter_text = text_dict.get(mid, {})
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, message_type
            if iter_text:
                uid = iter_text['uid']
                temp.append(uid)
                iter_portrait = portrait_dict.get(uid, {})
                if iter_portrait:
                    temp.append(iter_portrait['nick_name'])
                    temp.append(iter_portrait['photo_url'])
                else:
                    temp.extend([uid, ''])
                temp.append(iter_text["text"])
                temp.append(iter_text["sentiment"])
                temp.append(ts2date(iter_text['timestamp']))
                temp.append(iter_text['geo'])
                if message_type == 1:
                    temp.append(1)
                elif message_type == 2:
                    temp.append(3)
                else:
                    temp.append(iter_text['message_type'])
                temp.append(item[2])
                temp.append(item[3])
                temp.append(iter_text.get('sensitive', 0))
                temp.append(iter_text['timestamp'])
                temp.append(mid_value[mid])
                temp.append(mid)
                results.append(temp)
            count_n += 1

        # sort by sensitive flag (-4), topic value (-2), retweet count (-6)
        results = sorted(results, key=operator.itemgetter(-4, -2, -6), reverse=True)
        sort_results = []
        count = 0
        for item in results:
            sort_results.append([item])
            mid_index_dict[item[-1]] = count  # mid -> position in sort_results
            count += 1
        if tmp_duplicate_dict:
            remove_list = []
            value_list = tmp_duplicate_dict.values()  # [[mid, mid], ...]
            for item in value_list:
                tmp = []
                for mid in item:
                    # FIX: `mid_index_dict.get(mid, 0)` skipped index 0;
                    # a membership test keeps the top-ranked weibo mergeable
                    if mid in mid_index_dict:
                        tmp.append(mid_index_dict[mid])
                if len(tmp) > 1:
                    tmp_min = min(tmp)
                else:
                    continue
                tmp.remove(tmp_min)
                # fold duplicates into the best-ranked group, then drop them
                for iter_count in tmp:
                    sort_results[tmp_min].extend(sort_results[iter_count])
                    remove_list.append(sort_results[iter_count])
            if remove_list:
                for item in remove_list:
                    sort_results.remove(item)

    return sort_results
def get_positive_weibo_detail(ts, social_sensors, keywords_list, size, sentiment_type=1):
    """Query weibos of a given sentiment around timestamp ``ts``.

    Collects candidate mids from the previous and current time segments,
    then searches the flow-text index for weibos in [ts - time_interval, ts)
    matching those mids or the keyword list, filtered by sentiment.
    Each returned row is [uid, nick_name, photo_url, text, sentiment,
    date, geo, common_keywords, message_type].
    """
    former_mid_list = query_mid_list(ts-time_interval, keywords_list, time_segment, social_sensors)  # weibo mids from the previous period
    current_mid_list = query_mid_list(ts, keywords_list, time_interval, social_sensors)
    mid_list = []
    mid_list.extend(former_mid_list)
    mid_list.extend(current_mid_list)

    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must":[
                            {"range":{
                                "timestamp":{
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }},
                        ],
                        "should":[
                            {"terms": {"root_mid": mid_list}},
                            {"terms": {"mid": mid_list}},
                            {"terms":{"keywords_string": keywords_list}}
                        ]
                    }
                }
            }
        },
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100
    }

    #if social_sensors and int(sentiment_type) == 1:
    #    query_body["query"]["filtered"]["filter"]["bool"]["must"].append({"terms":{"uid": social_sensors}})

    if int(sentiment_type) == 1 or int(sentiment_type) == 0:
        query_body["query"]["filtered"]["filter"]["bool"]["must"].append({"term":{"sentiment":sentiment_type}})
    else:
        # NOTE(review): this branch REPLACES the whole must list, dropping the
        # timestamp range filter above — confirm this is intentional
        query_body["query"]["filtered"]["filter"]["bool"]["must"] = [{"terms":{"sentiment": ["2", "3"]}}]

    # decide whether ts and ts-time_interval fall on the same day,
    # i.e. which daily ES index to query
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts-time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)

    # 1. aggregate the matching weibos
    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    elif datetime != datetime_1 and exist_es_1:
        search_results = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []

    # 2. enrich each hit with user-profile data and shared keywords
    uid_list = []
    results = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids":uid_list}, fields=['nick_name', 'photo_url'])["docs"]

        for i in range(len(uid_list)):
            item = search_results[i]['_source']
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, common_keywords, message_type
            temp.append(item['uid'])
            if portrait_result[i]['found']:
                temp.append(portrait_result[i]["fields"]["nick_name"][0])
                temp.append(portrait_result[i]["fields"]["photo_url"][0])
            else:
                temp.append("unknown")
                temp.append("")
            temp.append(item["text"])
            temp.append(item["sentiment"])
            temp.append(ts2date(item['timestamp']))
            temp.append(item['geo'])
            # keywords shared between this weibo and the query keyword list
            keywords_set = set(item['keywords_string'].split('&'))
            common_keywords = set(keywords_list) & keywords_set
            temp.append(list(common_keywords))
            temp.append(item['message_type'])

            results.append(temp)

    return results
def key_words_search(task_id, search_type , pre , during , start_time , keyword_list , search_key = '' , sort_norm = '', sort_scope = '' ,time = 7 , isall = False, number = 100): number = int(number) should = [] for key in keyword_list: if search_type == "hashtag": should.append({"prefix":{"text": "#" + key + "#"}}) else: should.append({"wildcard":{"text": "*" +key + "*"}}) index_list = [] date = ts2datetime(start_time) index_name = pre + date while during: if es_flow_text.indices.exists(index=index_name): index_list.append(index_name) start_time = start_time + DAY date = ts2datetime(start_time) index_name = pre + date during -= 1 print index_list uid_set = set() text_results = [] sorted_text_results = [] query_body = { "query":{ "bool":{ "must":should } }, "sort":{"user_fansnum":{"order":"desc"}}, "size":5000 } results = es_flow_text.search(index = index_list , doc_type = 'text' , body = query_body, _source=False, fields=["uid", "user_fansnum","text", "message_type", "sentiment","timestamp", "geo", "retweeted", "comment"])["hits"]["hits"] id_index = 0 index_list = [] un_uid_list = [] for item in results : if item['fields']['uid'][0] not in uid_set: uid_set.add(item['fields']['uid'][0]) un_uid_list.append(item['fields']['uid'][0]) index_list.append(id_index) id_index += 1 #get_all_filed(sort_norm , time) uid_list = [] print "un_uid_list: ", len(un_uid_list) portrait_list = [] count = 0 in_index = 0 if not isall and un_uid_list : # 库内 portrait_results = es_user_portrait.mget(index=USER_INDEX_NAME, doc_type=USER_INDEX_TYPE, body={"ids":un_uid_list}, _source=False, fields=['uname'])["docs"] for item in portrait_results: if item["found"]: portrait_list.append(item['_id']) nick_name = item['fields']['uname'][0] if nick_name == 'unknown': nick_name = item['_id'] index = index_list[in_index] weibo_url = weiboinfo2url(results[index]['fields']['uid'][0], results[index]['_id']) text_results.extend([results[index]['fields']['uid'][0], 
results[index]['fields']['user_fansnum'][0], results[index]['fields']['text'][0], results[index]['fields']['message_type'][0], results[index]['fields']['sentiment'][0], ts2date(results[index]['fields']['timestamp'][0]), results[index]['fields']['geo'][0], results[index]['fields']['retweeted'][0], results[index]['fields']['comment'][0], nick_name, weibo_url]) count += 1 if count == number: break print "portrait_len, ", len(portrait_list) in_index += 1 if portrait_list: uid_list = in_sort_filter(time,sort_norm ,sort_scope ,None , portrait_list , True, number) # sort for iter_uid in uid_list: iter_index = portrait_list.index(iter_uid) sorted_text_results.append(text_results[i]) elif un_uid_list: profile_result = es_user_profile.mget(index="weibo_user", doc_type="user", body={"ids":un_uid_list}, fields=['nick_name'])["docs"] for i in range(len(profile_result)): index = index_list[i] try: nick_name = profile_result[i]['fields']['nick_name'][0] except: nick_name = un_uid_list[i] item = results[index] weibo_url = weiboinfo2url(item['fields']['uid'][0], results[index]['_id']) text_results.append([item['fields']['uid'][0], item['fields']['user_fansnum'][0], item['fields']['text'][0], item['fields']['message_type'][0], item['fields']['sentiment'][0], ts2date(item['fields']['timestamp'][0]), results[index]['fields']['geo'][0], results[index]['fields']['retweeted'][0], results[index]['fields']['comment'][0], nick_name, weibo_url]) if i == number: break uid_list = all_sort_filter(un_uid_list[:number] , sort_norm , time ,True, number) sorted_text_results = [] f = open("small.txt", "wb") for iter_uid in uid_list: iter_index = un_uid_list.index(iter_uid) f.write(str(iter_uid)+"\n") sorted_text_results.append(text_results[iter_index]) f.close() print "filter_uid_list: ", len(uid_list) if uid_list: results = make_up_user_info(uid_list,isall,time,sort_norm) else: results = [] print "results: ", len(results) # 修改状态 task_detail = es_user_portrait.get(index=USER_RANK_KEYWORD_TASK_INDEX 
, doc_type=USER_RANK_KEYWORD_TASK_TYPE, id=task_id) item = task_detail['_source'] item['status'] = 1 item['result'] = json.dumps(results) item['text_results'] = json.dumps(sorted_text_results) item['number'] = len(results) es_user_portrait.index(index = USER_RANK_KEYWORD_TASK_INDEX , doc_type=USER_RANK_KEYWORD_TASK_TYPE , id=task_id, body=item) return "1"
def potential_user(task_name, ts):
    """Predict which high-fan users' origin weibos will grow in spread.

    Queries origin weibos (message_type 1, user_fansnum >= 10000) posted
    before ``ts``, builds a feature vector per weibo, and runs two pickled
    models over them.  Users whose weibos are predicted to reach >= 500
    spread but currently have <= 500 are collected with per-weibo detail
    and a profile summary.

    Returns (results_dict, future_total, current_total) where
    results_dict maps uid -> {mid -> detail, "user_profile": [...]}.
    """
    index_name = "stimulation_"+task_name
    index_type = "stimulation_results"
    # query current root_mid candidates
    query_body = {
        "query": {
            "bool":{
                "must":[
                    {"range":{
                        "timestamp":{
                            "lt": ts
                        }
                    }},
                    {"term":{"message_type":1}},
                    {"range":{
                        "user_fansnum":{
                            "gte": 10000
                        }
                    }}
                ]
            }
        },
        "size": 10000
    }
    es_results = es.search(index=task_name, doc_type="text", body=query_body)["hits"]["hits"]
    mid_list = []
    uid_list = []
    feature_list = []
    prediction_uid = []
    prediction_weibo = []
    # NOTE(review): pickle.load on local model files from the working
    # directory — presumably produced by an offline training job; verify
    # the files exist alongside the process
    with open("prediction_uid.pkl", "r") as f:
        uid_model = pickle.load(f)
    with open("prediction_weibo.pkl", "r") as f:
        weibo_model = pickle.load(f)
    for item in es_results:
        mid_list.append(item["_id"])
        uid_list.append(item["_source"]["uid"])
        tmp_feature_list = organize_feature(task_name,item["_id"], ts)
        feature_list.append(tmp_feature_list)
    weibo_prediction_result = weibo_model.predict(feature_list)
    uid_prediction_result = uid_model.predict(feature_list)

    future_total = 0   # summed |predicted - current| over all weibos
    current_total = 0  # same sum restricted to the "potential" weibos
    results_dict = dict()
    in_potential_list = []
    for i in range(len(mid_list)):
        mid = mid_list[i]
        uid = uid_list[i]
        # current spread: number of weibos whose root_mid is this mid
        iter_count = es.count(index=task_name, doc_type="text", body={"query":{"term":{"root_mid":mid}}})["count"]
        pre_count = weibo_prediction_result[i]
        future_total += abs(pre_count-iter_count)
        # "potential": predicted to reach >= 500 but currently <= 500
        if pre_count >= 500 and iter_count <= 500:
            current_total += abs(pre_count-iter_count)
            if not results_dict.has_key(uid):
                results_dict[uid] = dict()
            tmp = dict()
            tmp["mid"] = mid
            tmp["current_count"] = iter_count
            tmp["prediction_count"] = int(pre_count)
            weibo_detail = es.get(index=task_name, doc_type="text", id=mid)["_source"]
            tmp.update(weibo_detail)
            retweet, comment = search_retweet_comment(task_name, mid)
            tmp["retweeted"] = retweet
            tmp["comment"] = comment
            results_dict[uid][mid] = tmp

    # user profile: [nick_name, photo_url, fansnum, statusnum]
    tmp_in_list = results_dict.keys()
    if tmp_in_list:
        profile_results = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids":tmp_in_list})["docs"]
        for i in range(len(tmp_in_list)):
            detail = profile_results[i]
            tmp = []
            uid = tmp_in_list[i]
            if detail["found"]:
                tmp.append(detail["_source"]["nick_name"])
                tmp.append(detail["_source"]["photo_url"])
                tmp.append(detail["_source"]["fansnum"])
                tmp.append(detail["_source"]["statusnum"])
            else:
                tmp.append(detail["_id"])
                tmp.extend(["","",""])
            results_dict[uid]["user_profile"] = tmp

    return results_dict, future_total, current_total
def create_event_warning(xnr_user_no,start_time,end_time):
    """Build event-warning records for sensitive hashtag events.

    For each hashtag active on ``start_time``'s day, searches sensitive
    weibos (sensitive >= 1) in [start_time, end_time], scores each weibo's
    influence (weighted by whether the author is followed by the virtual
    user ``xnr_user_no``), and assembles per-event warning dicts with the
    typical weibos, main participating users, average influence and time.

    Returns a list of event-warning dicts.
    """
    # fetch the day's event names (hashtags)
    today_datetime = start_time
    hashtag_list = get_hashtag(today_datetime)
    #print 'hashtag_list::',hashtag_list

    flow_text_index_name = get_day_flow_text_index_list(today_datetime)

    # the virtual user's fans and followers lists (empty on lookup failure)
    try:
        es_xnr_result=es_xnr.get(index=weibo_xnr_fans_followers_index_name,doc_type=weibo_xnr_fans_followers_index_type,id=xnr_user_no)['_source']
        followers_list=es_xnr_result['followers_list']
        fans_list=es_xnr_result['fans_list']
    except:
        followers_list=[]
        fans_list=[]

    event_warming_list=[]
    event_num=0
    for event_item in hashtag_list:
        event_sensitive_count=0
        # per-event record: event name, main users, typical weibos,
        # event influence, average event time
        event_warming_content=dict()
        event_warming_content['event_name']=event_item['event_name']
        print 'event_name:',event_item
        event_num=event_num+1
        print 'event_num:::',event_num
        print 'first_time:::',int(time.time())
        event_influence_sum=0
        event_time_sum=0
        query_body={
            'query':{
                # 'bool':{
                #     'must':[{'wildcard':{'text':'*'+event_item[0]+'*'}},
                #             {'range':{'sensitive':{'gte':1}}}]
                # }
                'filtered':{
                    'filter':{
                        'bool':{
                            'must':[
                                {'term':{'hashtag':event_item['event_name']}},
                                {'range':{'sensitive':{'gte':1}}},
                                {'range':{'timestamp':{'gte':start_time,'lte':end_time}}}
                            ]
                        }
                    }
                }
            },
            'size':MAX_WARMING_SIZE,
            'sort':{'sensitive':{'order':'desc'}}
        }
        #try:
        event_results=es_flow_text.search(index=flow_text_index_name,doc_type=flow_text_index_type,body=query_body)['hits']['hits']
        print 'event:::',len(event_results),start_time,end_time
        if event_results:
            weibo_result=[]
            fans_num_dict=dict()
            followers_num_dict=dict()
            alluser_num_dict=dict()
            print 'sencond_time:::',int(time.time())
            for item in event_results:
                #print 'event_content:',item['_source']['text']
                # per-user weibo count; authors followed by the xnr count double
                if alluser_num_dict.has_key(str(item['_source']['uid'])):
                    followers_mark=set_intersection(item['_source']['uid'],followers_list)
                    if followers_mark > 0:
                        alluser_num_dict[str(item['_source']['uid'])]=alluser_num_dict[str(item['_source']['uid'])]+1*2
                    else:
                        alluser_num_dict[str(item['_source']['uid'])]=alluser_num_dict[str(item['_source']['uid'])]+1
                else:
                    alluser_num_dict[str(item['_source']['uid'])]=1

                # influence score: spread (comments + retweets) scaled by
                # sensitivity, then by the author's follower relation
                origin_influence_value=(1+item['_source']['comment']+item['_source']['retweeted'])*(1+item['_source']['sensitive'])
                # fans_value=judge_user_type(item['_source']['uid'],fans_list)
                followers_value=judge_user_type(item['_source']['uid'],followers_list)
                item['_source']['weibo_influence_value']=origin_influence_value*(followers_value)
                item['_source']['nick_name']=get_user_nickname(item['_source']['uid'])
                weibo_result.append(item['_source'])

                # accumulate totals for the event averages below
                event_influence_sum=event_influence_sum+item['_source']['weibo_influence_value']
                event_time_sum=event_time_sum+item['_source']['timestamp']

            print 'third_time:::',int(time.time())
            # typical weibos: deduplicated, sorted by influence
            the_weibo_result=remove_repeat_v2(weibo_result)
            the_weibo_result.sort(key=lambda k:(k.get('weibo_influence_value',0)),reverse=True)
            event_warming_content['main_weibo_info']=json.dumps(the_weibo_result)

            # event influence and average event timestamp
            number=len(event_results)
            event_warming_content['event_influence']=event_influence_sum/number
            event_warming_content['event_time']=event_time_sum/number
            # except:
            #     event_warming_content['main_weibo_info']=[]
            #     event_warming_content['event_influence']=0
            #     event_warming_content['event_time']=0
            # try:

            # rank users by their (weighted) weibo counts
            alluser_num_dict=sorted(alluser_num_dict.items(),key=lambda d:d[1],reverse=True)
            main_userid_list=[]
            for i in xrange(0,len(alluser_num_dict)):
                main_userid_list.append(alluser_num_dict[i][0])

            # profile info for the main participating users
            main_user_info=[]
            user_es_result=es_user_profile.mget(index=profile_index_name,doc_type=profile_index_type,body={'ids':main_userid_list})['docs']
            for item in user_es_result:
                user_dict=dict()
                if item['found']:
                    user_dict['photo_url']=item['_source']['photo_url']
                    user_dict['uid']=item['_id']
                    user_dict['nick_name']=item['_source']['nick_name']
                    user_dict['favoritesnum']=item['_source']['favoritesnum']
                    user_dict['fansnum']=item['_source']['fansnum']
                else:
                    user_dict['photo_url']=''
                    user_dict['uid']=item['_id']
                    user_dict['nick_name']=''
                    user_dict['favoritesnum']=0
                    user_dict['fansnum']=0
                main_user_info.append(user_dict)
            event_warming_content['main_user_info']=json.dumps(main_user_info)
            # except:
            #     event_warming_content['main_user_info']=[]

            print 'fourth_time:::',int(time.time())
            event_warming_content['xnr_user_no']=xnr_user_no
            event_warming_content['validity']=0
            event_warming_content['timestamp']=today_datetime
            event_warming_list.append(event_warming_content)
        else:
            pass
        print 'fifth_time:::',int(time.time())
    return event_warming_list
def get_profile_information(uid_list): result_dict = dict() search_result = es.mget(index=index_name, doc_type=index_type, body={'ids': uid_list}, _source=True)['docs'] try: bci_history_result = es_bci_history.mget( index=bci_history_index_name, doc_type=bci_history_index_type, body={'ids': uid_list}, fields=['user_fansnum', 'weibo_month_sum', 'user_friendsnum'])['docs'] except: bci_history_result = [] iter_count = 0 for item in search_result: user_dict = {} try: bci_history_item = bci_history_result[iter_count] except: bci_history_item = {} for field in fields_dict: try: if field == 'statusnum': if bci_history_item and bci_history_item['found'] == True: if isinstance( bci_history_item['fields']['weibo_month_num'] [0], int): user_dict[field] = bci_history_item['fields'][ 'weibo_month_sum'][0] else: user_dict[field] = 0 else: user_dict[field] = 0 elif field == 'fansnum': if bci_history_item and bci_history_item['found'] == True: if isinstance( bci_history_item['fields']['user_fansnum'][0], int): user_dict[field] = bci_history_item['fields'][ 'user_fansnum'][0] else: user_dict[field] = 0 else: user_dict[field] = 0 elif field == 'friendsnum': if bci_history_item and bci_history_item['found'] == True: if isinstance( bci_history_item['fields']['user_friendsnum'] [0], int): user_dict[field] = bci_history_item['fields'][ 'user_friendsnum'][0] else: user_dict[field] = 0 else: user_dict[field] = 0 else: try: user_dict[field] = item['_source'][field] except: user_dict[field] = '' except: if field == 'statusnum': user_dict[field] = 0 elif field == 'fansnum': user_dict[field] = 0 elif field == 'friendsnum': user_dict[field] = 0 elif field == 'gender': user_dict[field] = 0 elif field == 'uname': user_dict[field] = u'unknown' else: user_dict[field] = 'unknown' result_dict[item['_id']] = user_dict iter_count += 1 return result_dict
def get_profile_information(uid_list): result_dict = dict() search_result = es.mget(index=index_name, doc_type=index_type, body={'ids':uid_list}, _source=True)['docs'] try: bci_history_result = es_bci_history.mget(index=bci_history_index_name, doc_type=bci_history_index_type, body={'ids': uid_list}, fields=['user_fansnum', 'weibo_month_sum', 'user_friendsnum'])['docs'] except: bci_history_result = [] iter_count = 0 for item in search_result: user_dict = {} try: bci_history_item = bci_history_result[iter_count] except: bci_history_item = {} for field in fields_dict: try: if field == 'statusnum': if bci_history_item and bci_history_item['found']==True: if isinstance(bci_history_item['fields']['weibo_month_num'][0], int): user_dict[field] = bci_history_item['fields']['weibo_month_sum'][0] else: user_dict[field] = 0 else: user_dict[field] = 0 elif field == 'fansnum': if bci_history_item and bci_history_item['found']==True: if isinstance(bci_history_item['fields']['user_fansnum'][0], int): user_dict[field] = bci_history_item['fields']['user_fansnum'][0] else: user_dict[field] = 0 else: user_dict[field] = 0 elif field == 'friendsnum': if bci_history_item and bci_history_item['found']==True: if isinstance(bci_history_item['fields']['user_friendsnum'][0], int): user_dict[field] = bci_history_item['fields']['user_friendsnum'][0] else: user_dict[field] = 0 else: user_dict[field] = 0 else: try: user_dict[field] = item['_source'][field] except: user_dict[field] = '' except: if field=='statusnum': user_dict[field] = 0 elif field=='fansnum': user_dict[field] =0 elif field=='friendsnum': user_dict[field] = 0 elif field=='gender': user_dict[field] = 0 elif field=='uname': user_dict[field] = u'unknown' else: user_dict[field] = 'unknown' result_dict[item['_id']] = user_dict iter_count += 1 return result_dict
def main(): if RUN_TYPE: now_ts = time.time()-DAY # 前一天 now_ts = datetime2ts('2016-03-24') ts = str(datetime2ts(ts2datetime(now_ts))) else: ts = str(datetime2ts('2016-03-16')) now_ts = int(ts) print now_ts sensitive_string = "sensitive_" + ts date_string = ts update_sensitive_key = "sensitive_score_" + ts # 更新的键 sensitive_dict_key = "sensitive_dict_" + ts sensitive_string_key = "sensitive_string_" + ts sensitive_day_change_key = "sensitive_" + ts +"_day_change" del_month = datetime2ts(ts2datetime(now_ts - MONTH)) del_sensitive_key = "sensitive_score_"+str(del_month) # 要删除的键 former_ts = int(ts) - DAY former_date = str(datetime2ts(ts2datetime(former_ts))) former_sensitive_key = "sensitive_score_" + former_date iter_count = 0 bulk_action = [] mappings(ES_SENSITIVE_INDEX) total_number = r.hlen(sensitive_string) scan_cursor = 0 print total_number while 1: re_scan = r.hscan(sensitive_string, scan_cursor, count=1000) scan_cursor = re_scan[0] if len(re_scan[1]) != 0: sensitive_info = re_scan[1] # 字典形式,uid:sensitive_words_dict uid_list = sensitive_info.keys() sensitive_results = es.mget(index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX, body={"ids":uid_list})['docs'] if sensitive_results: for item in sensitive_results: uid = item['_id'] sensitive_words_dict = json.loads(sensitive_info[uid]) # json.loads current_sensitive_score = 0 for k,v in sensitive_words_dict.iteritems(): tmp_stage = r_sensitive.hget("sensitive_words", k) if tmp_stage: current_sensitive_score += v*sensitive_score_dict[str(tmp_stage)] if item['found']: # 之前存在相关信息 revise_item = item["_source"] if del_sensitive_key in revise_item: item.remove(del_sensitive_key) revise_item['uid'] = uid # 新更新的敏感度 revise_item[update_sensitive_key] = current_sensitive_score # 新更新的敏感词 revise_item[sensitive_dict_key] = sensitive_info[uid] # 新更新的string revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys()) # 当天和之前一天、一周和一月均值的差异 revise_item['sensitive_day_change'] = current_sensitive_score - 
revise_item.get(former_sensitive_key, 0) revise_item['sensitive_week_change'] = current_sensitive_score - revise_item.get('sensitive_week_ave', 0) revise_item['sensitive_month_change'] = current_sensitive_score - revise_item.get('sensitive_month_ave', 0) # 更新后week、month的均值和方差 revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts) revise_item['senstiive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts) else: revise_item = dict() revise_item['uid'] = uid revise_item[update_sensitive_key] = current_sensitive_score revise_item[sensitive_dict_key] = sensitive_info[uid] revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys()) revise_item['sensitive_day_change'] = current_sensitive_score revise_item['sensitive_week_change'] = current_sensitive_score revise_item['sensitive_month_change'] = current_sensitive_score revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts) revise_item['senstiive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts) action = {'index':{'_id': uid}} bulk_action.extend([action, revise_item]) iter_count += 1 if iter_count % 1000 == 0: es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX) bulk_action = [] print iter_count if int(scan_cursor) == 0: break if bulk_action: es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX) print iter_count