def get_show_trace_followers(xnr_user_no):
    # Look up the virtual user's tracked-follow list, then batch-fetch each
    # followed user's profile from the twitter user index.
    es_get_result = es.get(index=tw_xnr_fans_followers_index_name, doc_type=tw_xnr_fans_followers_index_type,
                           id=xnr_user_no)['_source']
    trace_follow_list = es_get_result['trace_follow_list']
    weibo_user_info = []
    if trace_follow_list:
        mget_results = es.mget(index=twitter_user_index_name, doc_type=twitter_user_index_type,
                               body={'ids': trace_follow_list})['docs']
        for result in mget_results:
            if result['found']:
                weibo_user_info.append(result['_source'])
            else:
                # Profile not crawled yet: fall back to a stub record keyed by uid.
                uid = result['_id']
                weibo_user_info.append({
                    'uid': uid,
                    'statusnum': 0,
                    'fansnum': 0,
                    'friendsnum': 0,
                    'photo_url': '',
                    'sex': '',
                    'nick_name': uid,
                    'user_location': ''
                })
    return weibo_user_info

def my_topic_classfiy(uid_list, datetime_list):
    topic_dict_results = {}
    topic_string_results = {}
    # Results are cached in ES; before computing, check whether a previously
    # stored result already exists for each uid, to avoid repeated work.
    uids = uid_list
    unresolved_uids = []
    res = es.mget(index=fb_portrait_index_name, doc_type=fb_portrait_index_type, body={'ids': uids})['docs']
    for r in res:
        uid = r['_id']
        if r.has_key('found'):
            found = r['found']
            if found and r['_source'].has_key('topic'):
                topic = r['_source']['topic']
                topic_string = r['_source']['topic_string']
                topic_dict_results[uid] = json.loads(topic)
                topic_string_results[uid] = [topic_ch2en_dict[ch_topic] for ch_topic in topic_string.split('&')]
            else:
                unresolved_uids.append(uid)
        else:
            # No record in the ES index yet.
            unresolved_uids.append(uid)
    # Compute and store topics for uids not found in the cache.
    user_topic_dict = {}
    user_topic_list = {}
    if unresolved_uids:
        fb_flow_text_index_list = []
        for datetime in datetime_list:
            fb_flow_text_index_list.append(flow_text_index_name_pre + datetime)
        user_topic_data = get_filter_keywords(fb_flow_text_index_list, unresolved_uids)
        user_topic_dict, user_topic_list = topic_classfiy(unresolved_uids, user_topic_data)
        user_topic_string = {}
        for uid, topic_list in user_topic_list.items():
            li = []
            for t in topic_list:
                li.append(zh_data[name_list.index(t)].decode('utf8'))
            user_topic_string[uid] = '&'.join(li)
        user_topic = {}
        for uid in unresolved_uids:
            if uid in user_topic_dict:
                user_topic[uid] = {
                    'filter_keywords': json.dumps(user_topic_data[uid]),
                    'topic': json.dumps(user_topic_dict[uid]),
                    'topic_string': user_topic_string[uid]
                }
            else:
                user_topic[uid] = {
                    'filter_keywords': json.dumps({}),
                    'topic': json.dumps({}),
                    'topic_string': ''
                }
        save_data2es(user_topic)
    # Merge cached and freshly computed results.
    user_topic_dict.update(topic_dict_results)
    user_topic_list.update(topic_string_results)
    return user_topic_dict, user_topic_list

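# Usage sketch for my_topic_classfiy (a hedged example, not from the original
# source): the uids and yyyy-mm-dd index suffixes below are assumed values.
# The function returns per-uid topic weight dicts plus English topic-label lists.
def demo_topic_classfiy():
    uid_list = ['100001', '100002']               # hypothetical Facebook uids
    datetime_list = ['2016-11-16', '2016-11-17']  # assumed flow-text index dates
    topic_dict, topic_list = my_topic_classfiy(uid_list, datetime_list)
    for uid in uid_list:
        print uid, topic_list.get(uid, [])
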
def get_recommend_at_user(xnr_user_no):
    es_result = es.get(index=tw_xnr_index_name, doc_type=tw_xnr_index_type, id=xnr_user_no)['_source']
    uid_nick_name_dict = dict()
    if es_result:
        uid = es_result['uid']
        daily_interests = es_result['daily_interests']
        if S_TYPE == 'test':
            now_ts = datetime2ts(S_DATE_TW)
        else:
            now_ts = int(time.time())
        datetime = ts2datetime(now_ts - 24 * 3600)
        index_name = twitter_flow_text_index_name_pre + datetime
        daily_interests_list = daily_interests.split('&')  # not used in the current query
        es_results_daily = es.search(index=index_name, doc_type=twitter_flow_text_index_type,
                                     body={'query': {'match_all': {}}, 'size': 200,
                                           'sort': {'timestamp': {'order': 'desc'}}})['hits']['hits']
        uid_list = []
        if es_results_daily:
            for result in es_results_daily:
                result = result['_source']
                uid_list.append(result['uid'])
            # Resolve nick_name for each uid from the twitter user index:
            # the uid never changes, while nick_name may change over time.
            es_results_user = es.mget(index=twitter_user_index_name, doc_type=twitter_user_index_type,
                                      body={'ids': uid_list})['docs']
            i = 0
            for result in es_results_user:
                if result['found'] == True:
                    result = result['_source']
                    uid = result['uid']
                    nick_name = result['name']
                    if nick_name:
                        i += 1
                        uid_nick_name_dict[uid] = nick_name
                if i >= DAILY_AT_RECOMMEND_USER_TOP:
                    break
    return uid_nick_name_dict

def filter_mid(mid_list):
    # mget accepts only a bounded number of ids per request, so query in
    # chunks of 1000 and collect the ids that are not yet stored.
    llen = len(mid_list)
    l_1000 = llen / 1000
    result = []
    for i in range(l_1000 + 1):
        tmp = mid_list[i * 1000:(i + 1) * 1000]
        if tmp:
            es_results = es_xnr.mget(index="social_sensing_text", doc_type="text",
                                     body={"ids": tmp}, _source=False)["docs"]
            for item in es_results:
                if not item["found"]:
                    result.append(item["_id"])
    return result

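# Hedged usage sketch for filter_mid (the mids below are made up): given a
# candidate list of message ids, it returns only the ids absent from the
# social_sensing_text index, i.e. the ones still worth fetching.
def demo_filter_mid():
    candidate_mids = ['mid_%d' % i for i in range(2500)]  # hypothetical ids
    new_mids = filter_mid(candidate_mids)
    print 'not yet stored:', len(new_mids)
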
def get_forward_numerical_info(task_name, ts):
    # Build the forward_n previous time-window timestamps and derive a
    # mean/std baseline of total and negative counts from those windows.
    results = []
    ts_series = []
    for i in range(1, forward_n + 1):
        ts_series.append(ts - i * time_interval)
    # Check whether the per-task detail type exists; create it if not.
    doctype = task_name
    index_exist = es_xnr.indices.exists_type(index_sensing_task, doctype)
    if not index_exist:
        print "creating new task detail index"
        mappings_sensing_task(doctype)
    if ts_series:
        search_results = es_xnr.mget(index=index_sensing_task, doc_type=doctype,
                                     body={"ids": ts_series})['docs']
        found_count = 0
        average_total = []
        average_negative = []
        for item in search_results:
            if item['found']:
                temp = item['_source']
                sentiment_dict = json.loads(temp['sentiment_distribution'])
                average_total.append(int(temp['weibo_total_number']))
                # Sentiment labels 2-6 are counted as negative.
                average_negative.append(
                    int(sentiment_dict["2"]) + int(sentiment_dict['3']) +
                    int(sentiment_dict['4']) + int(sentiment_dict['5']) +
                    int(sentiment_dict['6']))
                found_count += 1
        if found_count > initial_count:
            number_mean = np.mean(average_total)
            number_std = np.std(average_total)
            sentiment_mean = np.mean(average_negative)
            # The original computed this with np.mean; np.std matches the
            # variable's intent and the number_std counterpart above.
            sentiment_std = np.std(average_negative)
            results = [1, number_mean, number_std, sentiment_mean, sentiment_std]
        else:
            results = [0]
    return results

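# Hedged sketch (not in the original source) of how the returned baseline can
# flag an anomalous window: when the leading element is 1, the remaining values
# are (number_mean, number_std, sentiment_mean, sentiment_std), so a current
# window can be scored against them with a simple z-score threshold.
def is_window_anomalous(baseline, current_total, current_negative, z=2.0):
    if not baseline or baseline[0] != 1:
        return False  # not enough history to form a baseline
    _, number_mean, number_std, sentiment_mean, sentiment_std = baseline
    total_hit = number_std > 0 and (current_total - number_mean) / number_std > z
    senti_hit = sentiment_std > 0 and (current_negative - sentiment_mean) / sentiment_std > z
    return total_hit or senti_hit
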
def create_event_warning(xnr_user_no, today_datetime, write_mark):
    # Fetch candidate event names (hashtags) for the day.
    hashtag_list = get_hashtag(today_datetime)
    print 'hashtag_list:', hashtag_list
    twitter_flow_text_index_name = get_timets_set_indexset_list(twitter_flow_text_index_name_pre,
                                                                today_datetime, today_datetime)
    # Look up the virtual user's followers list.
    lookup_type = 'followers_list'
    followers_list = lookup_xnr_fans_followers(xnr_user_no, lookup_type)
    event_warming_list = []
    for event_item in hashtag_list:
        # Per event: name, main participating users, representative tweets,
        # event influence, and average event time.
        event_warming_content = dict()
        event_warming_content['event_name'] = event_item['event_name']
        event_influence_sum = 0
        event_time_sum = 0
        query_body = {
            'query': {
                'filtered': {
                    'filter': {
                        'bool': {
                            'must': [
                                {'term': {'hashtag': event_item['event_name']}},
                                {'range': {'sensitive': {'gte': 1}}}
                            ]
                        }
                    }
                }
            },
            'size': MAX_WARMING_SIZE,
            'sort': {'sensitive': {'order': 'desc'}}
        }
        event_results = es_xnr.search(index=twitter_flow_text_index_name,
                                      doc_type=twitter_flow_text_index_type,
                                      body=query_body)['hits']['hits']
        if event_results:
            twitter_result = []
            alluser_num_dict = dict()
            for item in event_results:
                # Backfill the three engagement counters for each tweet.
                tid_result = lookup_tid_attend_index(item['_source']['tid'], today_datetime)
                if tid_result:
                    item['_source']['comment'] = tid_result['comment']
                    item['_source']['share'] = tid_result['share']
                    item['_source']['favorite'] = tid_result['favorite']
                else:
                    item['_source']['comment'] = 0
                    item['_source']['share'] = 0
                    item['_source']['favorite'] = 0
                # Per-user activity count; tweets from followed users weigh double.
                if alluser_num_dict.has_key(str(item['_source']['uid'])):
                    followers_mark = set_intersection(item['_source']['uid'], followers_list)
                    if followers_mark > 0:
                        alluser_num_dict[str(item['_source']['uid'])] += 1 * 2
                    else:
                        alluser_num_dict[str(item['_source']['uid'])] += 1
                else:
                    alluser_num_dict[str(item['_source']['uid'])] = 1
                # Influence: engagement volume scaled by sensitivity and user type.
                origin_influence_value = (1 + item['_source']['comment'] + item['_source']['share'] +
                                          item['_source']['favorite']) * (1 + item['_source']['sensitive'])
                followers_value = judge_user_type(item['_source']['uid'], followers_list)
                item['_source']['twitter_influence_value'] = origin_influence_value * followers_value
                # Resolve the author's nickname.
                item['_source']['nick_name'] = get_user_nickname(item['_source']['uid'])
                twitter_result.append(item['_source'])
                # Accumulate influence and timestamps.
                event_influence_sum += item['_source']['twitter_influence_value']
                event_time_sum += item['_source']['timestamp']
            # Representative tweets, most influential first.
            twitter_result.sort(key=lambda k: (k.get('twitter_influence_value', 0)), reverse=True)
            event_warming_content['main_twitter_info'] = json.dumps(twitter_result)
            # Event influence and average event time.
            number = len(event_results)
            event_warming_content['event_influence'] = event_influence_sum / number
            event_warming_content['event_time'] = event_time_sum / number
            # Rank users by activity count.
            alluser_num_dict = sorted(alluser_num_dict.items(), key=lambda d: d[1], reverse=True)
            main_userid_list = []
            for i in xrange(0, len(alluser_num_dict)):
                main_userid_list.append(alluser_num_dict[i][0])
            # Profiles of the main participating users.
            main_user_info = []
            user_es_result = es_xnr.mget(index=twitter_user_index_name, doc_type=twitter_user_index_type,
                                         body={'ids': main_userid_list})['docs']
            for item in user_es_result:
                user_dict = dict()
                user_dict['uid'] = item['_id']
                if item['found']:
                    user_dict['username'] = item['_source']['username']
                    user_dict['profileimageurl'] = item['_source'].get('profileimageurl', '')
                    user_dict['statuscount'] = item['_source'].get('statuscount', 0)
                    user_dict['followerscount'] = item['_source'].get('followerscount', 0)
                    user_dict['friendscount'] = item['_source'].get('friendscount', 0)
                else:
                    user_dict['username'] = ''
                    user_dict['profileimageurl'] = ''
                    user_dict['statuscount'] = 0
                    user_dict['followerscount'] = 0
                    user_dict['friendscount'] = 0
                main_user_info.append(user_dict)
            event_warming_content['main_user_info'] = json.dumps(main_user_info)
            event_warming_content['xnr_user_no'] = xnr_user_no
            event_warming_content['validity'] = 0
            event_warming_content['timestamp'] = today_datetime
            now_time = int(time.time())
            task_id = xnr_user_no + '_' + str(now_time)
            # Persist or return, depending on write_mark.
            if write_mark:
                mark = write_envent_warming(today_datetime, event_warming_content, task_id)
                event_warming_list.append(mark)
            else:
                event_warming_list.append(event_warming_content)
    return event_warming_list

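# Hedged usage sketch for create_event_warning (the id and date are assumed
# values): with write_mark=False the per-event dicts are returned for
# inspection instead of being persisted through write_envent_warming.
def demo_create_event_warning():
    xnr_user_no = 'WXNR0004'                    # hypothetical virtual-user id
    today_datetime = datetime2ts('2016-11-17')  # assumed test date
    warnings = create_event_warning(xnr_user_no, today_datetime, write_mark=False)
    for w in warnings:
        print w['event_name'], w['event_influence']
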
def get_hot_sensitive_recommend_at_user(sort_item):
    if S_TYPE == 'test':
        now_ts = datetime2ts(S_DATE_TW)
    else:
        now_ts = int(time.time())
    datetime = ts2datetime(now_ts - 24 * 3600)
    sort_item_2 = 'timestamp'
    index_name = twitter_flow_text_index_name_pre + datetime
    query_body = {
        'query': {
            'match_all': {}
        },
        'sort': {
            sort_item: {
                'order': 'desc'
            }
        },
        'size': HOT_EVENT_TOP_USER,
        '_source': ['uid', 'user_fansnum', 'retweeted', 'timestamp']
    }
    es_results = es.search(index=index_name, doc_type=twitter_flow_text_index_type,
                           body=query_body)['hits']['hits']
    uid_fansnum_dict = dict()
    uid_nick_name_dict = dict()
    if es_results:
        for result in es_results:
            result = result['_source']
            uid = result['uid']
            uid_fansnum_dict[uid] = {}
            uid_fansnum_dict[uid][sort_item_2] = result[sort_item_2]
        uid_fansnum_dict_sort_top = sorted(uid_fansnum_dict.items(),
                                           key=lambda x: x[1][sort_item_2], reverse=True)
        uid_set = set()
        for item in uid_fansnum_dict_sort_top:
            uid_set.add(item[0])
        uid_list = list(uid_set)
        # Resolve nick_name for each uid from the twitter user index:
        # the uid never changes, while nick_name may change over time.
        es_results_user = es.mget(index=twitter_user_index_name, doc_type=twitter_user_index_type,
                                  body={'ids': uid_list})['docs']
        i = 0
        for result in es_results_user:
            if result['found'] == True:
                result = result['_source']
                uid = result['uid']
                nick_name = result['username']
                if nick_name:
                    i += 1
                    uid_nick_name_dict[uid] = nick_name
            if i >= HOT_AT_RECOMMEND_USER_TOP:
                break
    return uid_nick_name_dict

def detect_by_seed_users(seed_users):
    retweet_mark = 1  # only partial retweet data is available at present
    comment_mark = 0  # no comment data yet
    group_uid_list = set()
    all_union_result_dict = {}
    # Determine the retweet/comment ES db number for the current time.
    now_ts = time.time()
    db_number = get_db_num(now_ts)
    # Step 1: mget retweet and be_retweet relations.
    if retweet_mark == 1:
        # retweet_index_name = retweet_index_name_pre + str(db_number)
        be_retweet_index_name = be_retweet_index_name_pre + str(db_number)
        # mget retweet (currently disabled)
        '''
        try:
            retweet_result = es.mget(index=retweet_index_name, doc_type=retweet_index_type,
                                     body={'ids': seed_users}, _source=True)['docs']
        except:
            retweet_result = []
        '''
        # mget be_retweet
        try:
            be_retweet_result = es.mget(index=be_retweet_index_name, doc_type=be_retweet_index_type,
                                        body={'ids': seed_users}, _source=True)['docs']
        except:
            be_retweet_result = []
    # Step 2: mget comment and be_comment relations (currently disabled).
    '''
    if comment_mark == 1:
        comment_index_name = comment_index_name_pre + str(db_number)
        be_comment_index_name = be_comment_index_name_pre + str(db_number)
        # mget comment
        try:
            comment_result = es.mget(index=comment_index_name, doc_type=comment_index_type,
                                     body={'ids': seed_users}, _source=True)['docs']
        except:
            comment_result = []
        # mget be_comment
        try:
            be_comment_result = es.mget(index=be_comment_index_name, doc_type=be_comment_index_type,
                                        body={'ids': seed_users}, _source=True)['docs']
        except:
            be_comment_result = []
    '''
    # Step 3: union the retweet/be_retweet/comment/be_comment results. The
    # disabled result lists are undefined here, so each json.loads falls back
    # to {} through the bare except.
    union_count = 0
    for iter_search_uid in seed_users:
        try:
            uid_retweet_dict = json.loads(retweet_result[union_count]['_source']['uid_retweet'])
        except:
            uid_retweet_dict = {}
        try:
            uid_be_retweet_dict = json.loads(be_retweet_result[union_count]['_source']['uid_be_retweet'])
        except:
            uid_be_retweet_dict = {}
        try:
            uid_comment_dict = json.loads(comment_result[union_count]['_source']['uid_comment'])
        except:
            uid_comment_dict = {}
        try:
            uid_be_comment_dict = json.loads(be_comment_result[union_count]['_source']['uid_be_comment'])
        except:
            uid_be_comment_dict = {}
        # Union the four interaction dicts for this seed user.
        union_result = union_dict(uid_retweet_dict, uid_be_retweet_dict,
                                  uid_comment_dict, uid_be_comment_dict)
        all_union_result_dict[iter_search_uid] = union_result
        # The original never advanced this index, so every seed reused result 0.
        union_count += 1
    # Extract all uids from all_union_result_dict.
    for seeder_uid, inter_dict in all_union_result_dict.iteritems():
        for uid, inter_count in inter_dict.iteritems():
            group_uid_list.add(uid)
    group_uid_list = list(group_uid_list)
    return group_uid_list

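# union_dict is imported from elsewhere in the project; the reference sketch
# below (an assumption, not the project's actual code) shows the behavior
# step 3 relies on: merging {uid: interaction_count} dicts by summing the
# counts per uid across all four interaction types.
def union_dict_sketch(*dicts):
    merged = {}
    for d in dicts:
        for uid, count in d.iteritems():
            merged[uid] = merged.get(uid, 0) + int(count)
    return merged
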
def my_domain_classfiy(uid_list, datetime_list):
    domain_results = {}
    # Results are cached in ES; before computing, check whether a previously
    # stored result already exists for each uid, to avoid repeated work.
    uids = uid_list
    unresolved_uids = []
    res = es.mget(index=fb_portrait_index_name, doc_type=fb_portrait_index_type, body={'ids': uids})['docs']
    for r in res:
        uid = r['_id']
        if r.has_key('found'):
            found = r['found']
            if found and r['_source'].has_key('domain'):
                domain = r['_source']['domain']
                domain_results[uid] = domain
            else:
                unresolved_uids.append(uid)
        else:
            # No record in the ES index yet.
            unresolved_uids.append(uid)
    # Compute and store domains for uids not found in the cache.
    user_domain = {}
    user_domain_temp = {}
    if unresolved_uids:
        fb_flow_text_index_list = []
        for datetime in datetime_list:
            fb_flow_text_index_list.append(flow_text_index_name_pre + datetime)
        user_domain_data = {}
        # Load the number of texts per uid.
        count_result = count_text_num(unresolved_uids, fb_flow_text_index_list)
        # Load base profile info.
        fb_user_query_body = {
            'query': {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {"terms": {"uid": unresolved_uids}},
                            ]
                        }
                    }
                }
            },
            'size': MAX_SEARCH_SIZE,
            "fields": ["bio", "about", "description", "quotes", "category", "uid"]
        }
        try:
            search_results = es.search(index=facebook_user_index_name,
                                       doc_type=facebook_user_index_type,
                                       body=fb_user_query_body)['hits']['hits']
            for item in search_results:
                content = item['fields']
                uid = content['uid'][0]
                if not uid in user_domain_data:
                    text_num = count_result[uid]
                    user_domain_data[uid] = {
                        'bio_str': '',
                        'bio_list': [],
                        'category': '',
                        'number_of_text': text_num
                    }
                # For long text, Goslate splits the input at punctuation and line
                # breaks into sub-texts of roughly 2000 bytes each, translates them
                # one by one, and concatenates the results; this is how it works
                # around the query length limit.
                if content.has_key('category'):
                    category = content.get('category')[0]
                else:
                    category = ''
                if content.has_key('description'):
                    # Some descriptions run to 3000+ characters, add little, and
                    # cause translation problems, so keep only a prefix.
                    description = content.get('description')[0][:1000]
                else:
                    description = ''
                if content.has_key('quotes'):
                    quotes = content.get('quotes')[0][:1000]
                else:
                    quotes = ''
                if content.has_key('bio'):
                    bio = content.get('bio')[0][:1000]
                else:
                    bio = ''
                if content.has_key('about'):
                    about = content.get('about')[0][:1000]
                else:
                    about = ''
                user_domain_data[uid]['bio_list'] = [quotes, bio, about, description]
                user_domain_data[uid]['category'] = category
        except Exception, e:
            print e
        # Translating one user per request is too slow, so translate in batches.
        trans_uid_list = []
        untrans_bio_data = []
        cut = 100
        n = len(user_domain_data) / cut
        for uid, content in user_domain_data.items():
            trans_uid_list.append(uid)
            untrans_bio_data.extend(content['bio_list'])
            content.pop('bio_list')
            if n:
                if len(trans_uid_list) % cut == 0:
                    temp_trans_bio_data = trans_bio_data(untrans_bio_data)
                    for i in range(len(trans_uid_list)):
                        uid = trans_uid_list[i]
                        user_domain_data[uid]['bio_str'] = '_'.join(
                            temp_trans_bio_data[4 * i:4 * i + 4])
                    trans_uid_list = []
                    untrans_bio_data = []
                    n = n - 1
            else:
                if len(trans_uid_list) == (len(user_domain_data) % cut):
                    temp_trans_bio_data = trans_bio_data(untrans_bio_data)
                    for i in range(len(trans_uid_list)):
                        uid = trans_uid_list[i]
                        user_domain_data[uid]['bio_str'] = '_'.join(
                            temp_trans_bio_data[4 * i:4 * i + 4])
                    trans_uid_list = []
                    untrans_bio_data = []
        # Domain classification.
        user_domain_temp = domain_main(user_domain_data)
        for uid in unresolved_uids:
            if uid in user_domain_temp:
                user_domain[uid] = {'domain': user_domain_temp[uid]}
            else:
                user_domain_temp[uid] = 'other'
                user_domain[uid] = {'domain': 'other'}
        save_data2es(user_domain)
    # Merge cached and freshly computed results (mirrors my_topic_classfiy;
    # the original ended without a return, which looks truncated).
    user_domain_temp.update(domain_results)
    return user_domain_temp

def my_domain_classfiy(uid_list, datetime_list):
    domain_results = {}
    # Results are cached in ES; before computing, check whether a previously
    # stored result already exists for each uid, to avoid repeated work.
    uids = uid_list
    unresolved_uids = []
    res = es.mget(index=tw_portrait_index_name, doc_type=tw_portrait_index_type, body={'ids': uids})['docs']
    for r in res:
        uid = r['_id']
        if r.has_key('found'):
            found = r['found']
            if found and r['_source'].has_key('domain'):
                domain = r['_source']['domain']
                domain_results[uid] = domain
            else:
                unresolved_uids.append(uid)
        else:
            # No record in the ES index yet.
            unresolved_uids.append(uid)
    # Compute and store domains for uids not found in the cache.
    user_domain = {}
    user_domain_temp = {}
    if unresolved_uids:
        tw_flow_text_index_list = []
        for datetime in datetime_list:
            tw_flow_text_index_list.append(flow_text_index_name_pre + datetime)
        user_domain_data = {}
        # Load the number of texts per uid.
        count_result = count_text_num(unresolved_uids, tw_flow_text_index_list)
        # Load base profile info.
        tw_user_query_body = {
            'query': {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {"terms": {"uid": unresolved_uids}},
                            ]
                        }
                    }
                }
            },
            'size': MAX_SEARCH_SIZE,
            "fields": ["location", "username", "description", "uid"]
        }
        try:
            search_results = es.search(index=twitter_user_index_name,
                                       doc_type=twitter_user_index_type,
                                       body=tw_user_query_body)['hits']['hits']
            for item in search_results:
                content = item['fields']
                uid = content['uid'][0]
                if not uid in user_domain_data:
                    text_num = count_result[uid]
                    user_domain_data[uid] = {
                        'location': '',
                        'username': '',
                        'description': '',
                        'number_of_text': text_num
                    }
                if content.has_key('location'):
                    location = content.get('location')[0]
                else:
                    location = ''
                if content.has_key('description'):
                    description = content.get('description')[0][:1000]
                else:
                    description = ''
                if content.has_key('username'):
                    username = content.get('username')[0]
                else:
                    username = ''
                user_domain_data[uid]['location'] = location
                user_domain_data[uid]['username'] = username
                user_domain_data[uid]['description'] = description
        except Exception, e:
            print e
        # Translating one user per request is too slow, so translate in batches.
        trans_uid_list = []
        untrans_bio_data = []
        cut = 100
        n = len(user_domain_data) / cut
        for uid, content in user_domain_data.items():
            trans_uid_list.append(uid)
            untrans_bio_data.extend([content['location'], content['description']])
            if n:
                if len(trans_uid_list) % cut == 0:
                    temp_trans_bio_data = trans_bio_data(untrans_bio_data)
                    for i in range(len(trans_uid_list)):
                        uid = trans_uid_list[i]
                        # Each element is already a translated string; the original
                        # applied '_'.join() to it, which would interleave '_'
                        # between every character.
                        user_domain_data[uid]['location'] = temp_trans_bio_data[2 * i]
                        user_domain_data[uid]['description'] = temp_trans_bio_data[2 * i + 1]
                    trans_uid_list = []
                    untrans_bio_data = []
                    n = n - 1
            else:
                if len(trans_uid_list) == (len(user_domain_data) % cut):
                    temp_trans_bio_data = trans_bio_data(untrans_bio_data)
                    for i in range(len(trans_uid_list)):
                        uid = trans_uid_list[i]
                        user_domain_data[uid]['location'] = temp_trans_bio_data[2 * i]
                        user_domain_data[uid]['description'] = temp_trans_bio_data[2 * i + 1]
                    trans_uid_list = []
                    untrans_bio_data = []
        # Domain classification.
        user_domain_temp = domain_main(user_domain_data)
        for uid in unresolved_uids:
            if uid in user_domain_temp:
                user_domain[uid] = {'domain': user_domain_temp[uid]}
            else:
                user_domain_temp[uid] = 'other'
                user_domain[uid] = {'domain': 'other'}
        save_data2es(user_domain)
    # Merge cached and freshly computed results (mirrors my_topic_classfiy;
    # the original ended without a return, which looks truncated).
    user_domain_temp.update(domain_results)
    return user_domain_temp

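# Hedged sketch (an assumption, not project code) of the batching idea both
# my_domain_classfiy variants implement: flatten k fields per user into one
# list, translate `cut` users' worth per trans_bio_data call, then slice the
# translated list back out k-at-a-time per user. This avoids the interleaved
# counter/remainder bookkeeping above.
def translate_in_batches(per_user_fields, cut=100):
    # per_user_fields: {uid: [field1, field2, ...]} with a fixed field count k
    translated = {}
    uids = list(per_user_fields)
    k = len(per_user_fields[uids[0]]) if uids else 0
    for start in range(0, len(uids), cut):
        batch_uids = uids[start:start + cut]
        flat = []
        for uid in batch_uids:
            flat.extend(per_user_fields[uid])
        trans_flat = trans_bio_data(flat)  # one translation call per batch
        for i, uid in enumerate(batch_uids):
            translated[uid] = trans_flat[k * i:k * i + k]
    return translated
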