def get_generate_example_model(domain_name, role_name):
    """Build an example-model JSON file for a (domain, role) pair and index it.

    Reads the role document from the weibo role index, normalizes its fields
    (political side, psychology features, monitor keywords, a representative
    member profile, active time, daily post count), dumps the enriched document
    to EXAMPLE_MODEL_PATH/<task_id>.json and registers a small stub document in
    the example-model index under the same id.

    Returns True on success, False when writing the file or indexing failed.
    """
    domain_pinyin = pinyin.get(domain_name, format='strip', delimiter='_')
    role_en = domain_ch2en_dict[role_name]
    task_id = domain_pinyin + '_' + role_en
    item = es.get(index=weibo_role_index_name, doc_type=weibo_role_index_type,
                  id=task_id)['_source']

    # 政治倾向 -> human-readable label
    political_side = json.loads(item['political_side'])[0][0]
    if political_side == 'mid':
        item['political_side'] = u'中立'
    elif political_side == 'left':
        item['political_side'] = u'左倾'
    else:
        item['political_side'] = u'右倾'

    # 心理特征: keep at most TOP_PSY_FEATURE entries. Slicing guards against
    # short lists (indexing range(TOP_PSY_FEATURE) used to raise IndexError).
    psy_feature = json.loads(item['psy_feature'])
    item['psy_feature'] = '&'.join(pair[0] for pair in psy_feature[:TOP_PSY_FEATURE])

    role_group_uids = json.loads(item['member_uids'])

    # 监测词: extract keywords from the group's recent flow-text posts
    if S_TYPE == 'test':
        current_time = datetime2ts(S_DATE)
    else:
        current_time = int(time.time())
    index_name_list = get_flow_text_index_list(current_time)
    query_body_search = {
        'query': {
            'filtered': {
                'filter': {
                    'terms': {'uid': role_group_uids}
                }
            }
        },
        'size': MAX_VALUE,
        '_source': ['keywords_string']
    }
    es_keyword_results = es_flow_text.search(index=index_name_list,
                                             doc_type=flow_text_index_type,
                                             body=query_body_search)['hits']['hits']
    keywords_string = '&'.join(hit['_source']['keywords_string']
                               for hit in es_keyword_results)
    k_dict = extract_keywords(keywords_string)
    item['monitor_keywords'] = ','.join(kw.word.encode('utf-8') for kw in k_dict)

    # 角色实例: take nick name / location / gender from found members; stop at
    # the first member that also has a non-empty description in the portrait index.
    mget_results_user = es_user_profile.mget(index=profile_index_name,
                                             doc_type=profile_index_type,
                                             body={'ids': role_group_uids})['docs']
    item['nick_name'] = []
    if mget_results_user:
        for mget_item in mget_results_user:
            if not mget_item['found']:
                continue
            item['nick_name'] = mget_item['_source']['nick_name']
            item['location'] = mget_item['_source']['user_location']
            item['gender'] = mget_item['_source']['sex']
            uid = mget_item['_source']['uid']
            try:
                profile_results = es_user_portrait.get(index=profile_index_name,
                                                       doc_type=profile_index_type,
                                                       id=uid)['_source']
                if profile_results['description']:
                    item['description'] = profile_results['description']
                    break
            except Exception:
                # best-effort: members without a portrait doc are skipped
                pass

    # Fixed defaults (no per-role data source for these yet)
    item['business_goal'] = u'渗透'
    item['daily_interests'] = u'旅游'
    item['age'] = 30
    item['career'] = u'自由职业'

    # 活跃时间: indexes of the TOP_ACTIVE_TIME most active slots
    active_time_list_np = np.array(json.loads(item['active_time']))
    item['active_time'] = np.argsort(-active_time_list_np)[:TOP_ACTIVE_TIME].tolist()

    # 日发帖量: average over the observed days
    day_post_num_list = np.array(json.loads(item['day_post_num']))
    item['day_post_num'] = np.mean(day_post_num_list).tolist()

    item['role_name'] = role_name

    example_model_file_name = EXAMPLE_MODEL_PATH + task_id + '.json'
    try:
        with open(example_model_file_name, 'w') as dump_f:
            json.dump(item, dump_f)
        item_dict = {'domain_name': domain_name, 'role_name': role_name}
        es.index(index=weibo_example_model_index_name,
                 doc_type=weibo_example_model_index_type,
                 body=item_dict, id=task_id)
        mark = True
    except Exception:
        mark = False
    return mark
def create_event_warning(xnr_user_no, today_datetime, write_mark):
    """Detect sensitive hashtag events for one virtual user (XNR) on a day.

    For every hashtag of the day, collects weibos with sensitive >= 1, scores
    each weibo's influence ((1+comment+retweeted)*(1+sensitive) scaled by the
    poster's relation to the XNR's followers), and aggregates event influence,
    average time and the main participating users.

    When write_mark is truthy each event is persisted through
    write_envent_warming() and the per-event write result is collected;
    otherwise the event dicts themselves are returned.
    """
    # 获取事件名称
    hashtag_list = get_hashtag(today_datetime)
    flow_text_index_name = get_day_flow_text_index_list(today_datetime)

    # 虚拟人的粉丝列表和关注列表 (missing XNR doc -> empty lists, best-effort)
    try:
        es_xnr_result = es_xnr.get(index=weibo_xnr_fans_followers_index_name,
                                   doc_type=weibo_xnr_fans_followers_index_type,
                                   id=xnr_user_no)['_source']
        followers_list = es_xnr_result['followers_list']
        fans_list = es_xnr_result['fans_list']
    except Exception:
        followers_list = []
        fans_list = []

    event_warming_list = []
    for event_item in hashtag_list:
        # 事件名称、主要参与用户、典型微博、事件影响力、事件平均时间
        event_warming_content = {'event_name': event_item['event_name']}
        event_influence_sum = 0
        event_time_sum = 0
        query_body = {
            'query': {
                'filtered': {
                    'filter': {
                        'bool': {
                            'must': [
                                {'term': {'hashtag': event_item['event_name']}},
                                {'range': {'sensitive': {'gte': 1}}}
                            ]
                        }
                    }
                }
            },
            'size': MAX_WARMING_SIZE,
            'sort': {'sensitive': {'order': 'desc'}}
        }
        event_results = es_flow_text.search(index=flow_text_index_name,
                                            doc_type=flow_text_index_type,
                                            body=query_body)['hits']['hits']
        if not event_results:
            continue

        weibo_result = []
        alluser_num_dict = {}
        for hit in event_results:
            source = hit['_source']
            uid_key = str(source['uid'])
            # 统计用户信息: repeat posters that the XNR follows get double weight
            # (NOTE: as in the original, the first hit of any uid counts 1)
            if uid_key in alluser_num_dict:
                if set_intersection(source['uid'], followers_list) > 0:
                    alluser_num_dict[uid_key] += 2
                else:
                    alluser_num_dict[uid_key] += 1
            else:
                alluser_num_dict[uid_key] = 1
            # 计算影响力
            origin_influence_value = (1 + source['comment'] + source['retweeted']) \
                * (1 + source['sensitive'])
            followers_value = judge_user_type(source['uid'], followers_list)
            source['weibo_influence_value'] = origin_influence_value * followers_value
            source['nick_name'] = get_user_nickname(source['uid'])
            weibo_result.append(source)
            # 统计影响力、时间
            event_influence_sum += source['weibo_influence_value']
            event_time_sum += source['timestamp']

        # 典型微博信息 (most influential first)
        weibo_result.sort(key=lambda k: k.get('weibo_influence_value', 0),
                          reverse=True)
        event_warming_content['main_weibo_info'] = json.dumps(weibo_result)

        # 事件影响力和事件平均时间
        number = len(event_results)
        event_warming_content['event_influence'] = event_influence_sum / number
        event_warming_content['event_time'] = event_time_sum / number

        # 对用户进行排序 (weighted post count, descending)
        ranked_users = sorted(alluser_num_dict.items(),
                              key=lambda d: d[1], reverse=True)
        main_userid_list = [pair[0] for pair in ranked_users]

        # 主要参与用户信息
        main_user_info = []
        user_es_result = es_user_profile.mget(index=profile_index_name,
                                              doc_type=profile_index_type,
                                              body={'ids': main_userid_list})['docs']
        for doc in user_es_result:
            if doc['found']:
                user_dict = {
                    'photo_url': doc['_source']['photo_url'],
                    'uid': doc['_id'],
                    'nick_name': doc['_source']['nick_name'],
                    'favoritesnum': doc['_source']['favoritesnum'],
                    'fansnum': doc['_source']['fansnum'],
                }
            else:
                user_dict = {
                    'photo_url': '',
                    'uid': doc['_id'],
                    'nick_name': '',
                    'favoritesnum': 0,
                    'fansnum': 0,
                }
            main_user_info.append(user_dict)
        event_warming_content['main_user_info'] = json.dumps(main_user_info)

        event_warming_content['xnr_user_no'] = xnr_user_no
        event_warming_content['validity'] = 0
        event_warming_content['timestamp'] = today_datetime
        event_warming_content['_id'] = xnr_user_no + '_' + event_warming_content['event_name']
        task_id = event_warming_content['_id']
        if write_mark:
            mark = write_envent_warming(today_datetime, event_warming_content, task_id)
            event_warming_list.append(mark)
        else:
            event_warming_list.append(event_warming_content)
    return event_warming_list
def get_recommend_step_two(task_detail):
    """Recommend role attributes (step two) from an existing domain-role doc.

    Looks up the role document for task_detail['domain_name']/['role_name'],
    samples member profiles for example users / nick names / gender /
    descriptions, and derives the top active-time slots plus the average daily
    post count.

    Returns a dict with keys: role_example, active_time, day_post_num_average,
    nick_name, sex, user_location, description, age, career.
    """
    domain_name = task_detail['domain_name']
    role_name = task_detail['role_name']
    domain_pinyin = pinyin.get(domain_name, format='strip', delimiter='_')
    role_name_en = domain_ch2en_dict[role_name]
    _id = domain_pinyin + '_' + role_name_en

    recommend_results = dict()

    ## 根据角色信息
    es_result = es.get(index=weibo_role_index_name, doc_type=weibo_role_index_type,
                       id=_id)['_source']

    #### 角色实例: sample the first NICK_NAME_TOP+1 found member profiles
    nick_name_list = []
    user_location_top_list = []  # location recommendation currently disabled
    description_list = []
    sex_list = []
    role_example_dict = {}
    member_uids = json.loads(es_result['member_uids'])
    member_uids_results = es_user_profile.mget(index=profile_index_name,
                                               doc_type=profile_index_type,
                                               body={'ids': member_uids})['docs']
    count = 0
    for doc in member_uids_results:
        if doc['found'] != True:
            continue
        profile = doc['_source']
        person_url = 'http://weibo.com/u/' + str(profile['uid']) + '/home'
        nick_name = profile['nick_name']
        nick_name_list.append(nick_name)
        sex_list.append(profile['sex'])
        description_list.append(profile['description'])
        role_example_dict[profile['uid']] = [nick_name, person_url]
        count += 1
        if count > NICK_NAME_TOP:
            break
    recommend_results['role_example'] = role_example_dict

    # 活跃时间: indexes of the ACTIVE_TIME_TOP most active slots
    active_time_list_np = np.array(json.loads(es_result['active_time']))
    recommend_results['active_time'] = list(np.argsort(-active_time_list_np)[:ACTIVE_TIME_TOP])

    # 平均日发帖量 (NaN -> 0; empty list -> 0 instead of ZeroDivisionError)
    day_post_num_new = list(pd.Series(json.loads(es_result['day_post_num'])).fillna(0))
    if day_post_num_new:
        recommend_results['day_post_num_average'] = \
            sum(day_post_num_new) / float(len(day_post_num_new))
    else:
        recommend_results['day_post_num_average'] = 0

    # 性别: most common value among sampled members; '' when nothing was
    # sampled (the original indexing raised IndexError in that case)
    most_common_sex = Counter(sex_list).most_common(1)
    recommend_results['sex'] = most_common_sex[0][0] if most_common_sex else ''

    recommend_results['nick_name'] = '&'.join(nick_name_list)
    recommend_results['user_location'] = '&'.join(user_location_top_list)
    recommend_results['description'] = '&'.join(description_list[:DESCRIPTION_TOP])

    ## 年龄、职业: no data source yet
    recommend_results['age'] = ''
    recommend_results['career'] = ''
    return recommend_results
def get_recommend_follows(task_detail): recommend_results = dict() daily_interests_list = task_detail['daily_interests'].encode('utf-8').split(',') monitor_keywords_list = task_detail['monitor_keywords'].encode('utf-8').split(',') #print 'daily_interests_list::',daily_interests_list create_time = time.time() if S_TYPE == 'test': create_time = datetime2ts(S_DATE) index_name_list = get_flow_text_index_list(create_time) ## 日常兴趣关注 try: query_body = { 'query':{ 'filtered':{ 'filter':{ 'terms':{'daily_interests':daily_interests_list} } } }, 'sort':{'user_fansnum':{'order':'desc'}}, 'size':DAILY_INTEREST_TOP_USER, '_source':['uid'] } es_results = es_flow_text.search(index=index_name_list,doc_type='text',body=query_body)['hits']['hits'] daily_interest_uid_set = set() for result in es_results: daily_interest_uid_set.add(result['_source']['uid']) daily_interest_uid_list = list(daily_interest_uid_set) es_daily_interests_results = es_user_profile.mget(index=profile_index_name,doc_type=profile_index_type,\ body={'ids':daily_interest_uid_list})['docs'] nick_name_dict = {} es_daily_interests_results = es_daily_interests_results[:max(NICK_NAME_TOP,len(es_daily_interests_results))] for result in es_daily_interests_results: if result['found'] == True: result = result['_source'] nick_name_dict[result['uid']] = result['nick_name'] else: continue recommend_results['daily_interests'] = nick_name_dict except: print '没有找到日常兴趣相符的用户' recommend_results['daily_interests'] = {} ## 监测词关注 nest_query_list = [] #print 'monitor_keywords_list:::',monitor_keywords_list for monitor_keyword in monitor_keywords_list: nest_query_list.append({'wildcard':{'keywords_string':'*'+monitor_keyword+'*'}}) #print 'nest_query_list::',nest_query_list try: query_body_monitor = { 'query':{ 'bool':{ 'must':nest_query_list } }, 'sort':{'user_fansnum':{'order':'desc'}}, 'size':MONITOR_TOP_USER, '_source':['uid'] } #print '123' es_results = 
es_flow_text.search(index=index_name_list,doc_type='text',body=query_body_monitor)['hits']['hits'] #print 'es_results::',es_results monitor_keywords_uid_set = set() for result in es_results: monitor_keywords_uid_set.add(result['_source']['uid']) monitor_keywords_uid_list = list(monitor_keywords_uid_set) es_monitor_keywords_results = es_user_profile.mget(index=profile_index_name,doc_type=profile_index_type,\ body={'ids':monitor_keywords_uid_list})['docs'] nick_name_dict = {} es_monitor_keywords_results = es_monitor_keywords_results[:max(NICK_NAME_TOP,len(es_monitor_keywords_results))] for result in es_monitor_keywords_results: if result['found'] == True: result = result['_source'] nick_name_dict[result['uid']] = result['nick_name'] else: continue recommend_results['monitor_keywords'] = nick_name_dict except: print '没有找到监测词相符的用户' recommend_results['monitor_keywords'] = {} # print 'recommend_results::',recommend_results return recommend_results
def show_event_warming(xnr_user_no):
    """Return event-warning entries (events with sensitive weibos) for one XNR.

    For each current hashtag, searches the recent flow-text indexes for
    matching weibos; for events containing at least one sensitive weibo it
    returns event name, typical weibos sorted by influence, averaged event
    influence/time, and the main participating users (fans/followers of the
    XNR weighted via union_dict of the three counters).
    """
    now_time = int(time.time())
    hashtag_list = get_hashtag()
    if S_TYPE == 'test':
        test_day_time = datetime2ts(S_DATE_EVENT_WARMING)
        flow_text_index_list = get_flow_text_index_list(test_day_time)
        # fixed sample hashtags for the test environment
        hashtag_list = [('网络义勇军发布', 13), ('美国', 7), ('德国', 5), ('中国', 4),
                        ('清真食品', 3), ('反邪动态', 2), ('台海观察', 2),
                        ('雷哥微评', 2), ('中国军队', 1)]
    else:
        flow_text_index_list = get_flow_text_index_list(now_time)

    # 虚拟人的粉丝列表和关注列表 (missing XNR doc -> empty lists, best-effort)
    try:
        es_xnr_result = es_xnr.get(
            index=weibo_xnr_fans_followers_index_name,
            doc_type=weibo_xnr_fans_followers_index_type,
            id=xnr_user_no)['_source']
        followers_list = es_xnr_result['followers_list']
        fans_list = es_xnr_result['fans_list']
    except Exception:
        followers_list = []
        fans_list = []

    event_warming_list = []
    for event_item in hashtag_list:
        # 事件名称、主要参与用户、典型微博、事件影响力、事件平均时间
        event_sensitive_count = 0
        event_warming_content = {'event_name': event_item[0]}
        event_influence_sum = 0
        event_time_sum = 0
        query_body = {
            'query': {
                'bool': {
                    'should': {
                        'wildcard': {
                            'text': '*' + event_item[0] + '*'
                        }
                    }
                }
            }
        }
        fans_num_dict = dict()
        followers_num_dict = dict()
        alluser_num_dict = dict()
        try:
            event_results = es_flow_text.search(
                index=flow_text_index_list,
                doc_type=flow_text_index_type,
                body=query_body)['hits']['hits']
            weibo_result = []
            for hit in event_results:
                source = hit['_source']
                # only sensitive weibos contribute to the warning
                if source['sensitive'] <= 0:
                    continue
                event_sensitive_count += 1
                uid = source['uid']
                uid_key = str(uid)
                # 统计用户信息
                alluser_num_dict[uid_key] = alluser_num_dict.get(uid_key, 0) + 1
                if uid in fans_list:
                    fans_num_dict[uid_key] = fans_num_dict.get(uid_key, 0) + 1
                # BUGFIX: follower hits used to be counted into fans_num_dict
                if uid in followers_list:
                    followers_num_dict[uid_key] = followers_num_dict.get(uid_key, 0) + 1
                # 计算影响力
                origin_influence_value = (source['comment'] + source['retweeted']) \
                    * (1 + source['sensitive'])
                fans_value = judge_user_type(uid, fans_list)
                followers_value = judge_user_type(uid, followers_list)
                source['weibo_influence_value'] = origin_influence_value \
                    * (fans_value + followers_value)
                weibo_result.append(source)
                # 统计影响力、时间
                event_influence_sum += source['weibo_influence_value']
                # BUGFIX: the timestamp used to be assigned instead of
                # accumulated, so event_time was last_timestamp/number;
                # now averaged like in create_event_warning
                event_time_sum += source['timestamp']
            if event_sensitive_count > 0:
                # 典型微博信息 (most influential first)
                weibo_result.sort(key=lambda k: k.get('weibo_influence_value', 0),
                                  reverse=True)
                event_warming_content['main_weibo_info'] = weibo_result
                # 事件影响力和事件平均时间 (averaged over all fetched hits,
                # as in the original)
                number = len(event_results)
                event_warming_content['event_influence'] = event_influence_sum / number
                event_warming_content['event_time'] = event_time_sum / number
        except Exception:
            event_warming_content['main_weibo_info'] = []
            event_warming_content['event_influence'] = []
            event_warming_content['event_time'] = []

        try:
            if event_sensitive_count > 0:
                # 对用户进行排序: merge fan/follower/all counters, sort by weight
                temp_userid_dict = union_dict(fans_num_dict, followers_num_dict)
                main_userid_dict = union_dict(temp_userid_dict, alluser_num_dict)
                ranked = sorted(main_userid_dict.items(), key=lambda d: d[1],
                                reverse=True)
                main_userid_list = [pair[0] for pair in ranked]
                # 主要参与用户信息
                main_user_info = []
                user_es_result = es_user_profile.mget(
                    index=profile_index_name,
                    doc_type=profile_index_type,
                    body={'ids': main_userid_list})['docs']
                for doc in user_es_result:
                    if doc['found']:
                        user_dict = {
                            'photo_url': doc['_source']['photo_url'],
                            'uid': doc['_id'],
                            'nick_name': doc['_source']['nick_name'],
                            'favoritesnum': doc['_source']['favoritesnum'],
                            'fansnum': doc['_source']['fansnum'],
                        }
                    else:
                        user_dict = {
                            'photo_url': '',
                            'uid': doc['_id'],
                            'nick_name': '',
                            'favoritesnum': 0,
                            'fansnum': 0,
                        }
                    main_user_info.append(user_dict)
                event_warming_content['main_user_info'] = main_user_info
            else:
                event_warming_content['main_user_info'] = []
        except Exception:
            event_warming_content['main_user_info'] = []

        # only events with sensitive content are reported
        if event_sensitive_count > 0:
            event_warming_list.append(event_warming_content)
    return event_warming_list