def get_user_ip(uid):
    # search the last 7 days of flow_text indices and return the ip of one of the user's recent weibos
    flow_text_index_list = []
    now_timestamp = datetime2ts(ts2datetime(time.time()))
    if RUN_TYPE == 0:
        now_timestamp = datetime2ts(RUN_TEST_TIME)
    for i in range(7, 0, -1):
        iter_date = ts2datetime(now_timestamp - DAY * i)
        flow_text_index_list.append(flow_text_index_name_pre + iter_date)
    weibo_all = es_flow_text.search(index=flow_text_index_list, doc_type=flow_text_index_type,
                                    body={'query': {'filtered': {'filter': {'term': {'uid': uid}}}},
                                          'size': 10})['hits']['hits']
    ip = weibo_all[0]['_source']['ip']
    return ip
def new_get_influence_trend(uid, time_segment):
    results = {}
    try:
        influence_history = ES_COPY_USER_PORTRAIT.get(index=COPY_USER_PORTRAIT_INFLUENCE,
                                                       doc_type=COPY_USER_PORTRAIT_INFLUENCE_TYPE,
                                                       id=uid)['_source']
        print ES_COPY_USER_PORTRAIT, COPY_USER_PORTRAIT_INFLUENCE, COPY_USER_PORTRAIT_INFLUENCE_TYPE, uid
        print influence_history
    except:
        influence_history = {}
    if influence_history:
        results = get_evaluate_trend(influence_history, 'bci')
    else:
        results = {}
    print results
    #deal results for situation---server power off
    new_time_list = []
    new_count_list = []
    new_results = {}
    now_time_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_time_ts))
    if RUN_TYPE == 0:
        now_date_ts = datetime2ts(RUN_TEST_TIME)
    for i in range(time_segment, 0, -1):
        iter_date_ts = now_date_ts - i * DAY
        try:
            date_count = results[iter_date_ts]
        except:
            date_count = 0
        new_time_list.append(iter_date_ts)
        new_count_list.append(date_count)
    new_results = {'timeline': new_time_list, 'evaluate_index': new_count_list}
    return new_results
def get_influence_content(uid, timestamp_from, timestamp_to):
    weibo_list = []
    #split timestamp range to new_range_dict_list
    from_date_ts = datetime2ts(ts2datetime(timestamp_from))
    to_date_ts = datetime2ts(ts2datetime(timestamp_to))
    new_range_dict_list = []
    if from_date_ts != to_date_ts:
        iter_date_ts = from_date_ts
        while iter_date_ts < to_date_ts:
            iter_next_date_ts = iter_date_ts + DAY
            new_range_dict_list.append({'range': {'timestamp': {'gte': iter_date_ts, 'lt': iter_next_date_ts}}})
            iter_date_ts = iter_next_date_ts
        if new_range_dict_list[0]['range']['timestamp']['gte'] < timestamp_from:
            new_range_dict_list[0]['range']['timestamp']['gte'] = timestamp_from
        if new_range_dict_list[-1]['range']['timestamp']['lt'] > timestamp_to:
            new_range_dict_list[-1]['range']['timestamp']['lt'] = timestamp_to
    else:
        new_range_dict_list = [{'range': {'timestamp': {'gte': timestamp_from, 'lt': timestamp_to}}}]
    #iter date to search flow_text
    iter_result = []
    for range_item in new_range_dict_list:
        range_from_ts = range_item['range']['timestamp']['gte']
        range_from_date = ts2datetime(range_from_ts)
        flow_text_index_name = flow_text_index_name_pre + range_from_date
        query = []
        query.append({'term': {'uid': uid}})
        query.append(range_item)
        try:
            flow_text_exist = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                                  body={'query': {'bool': {'must': query}},
                                                        'sort': [{'timestamp': 'asc'}]})['hits']['hits']
        except:
            flow_text_exist = []
        iter_result.extend(flow_text_exist)
        # get weibo list
        for item in flow_text_exist:
            source = item['_source']
            weibo = {}
            weibo['timestamp'] = ts2date(source['timestamp'])
            weibo['ip'] = source['ip']
            weibo['text'] = source['text']
            weibo['geo'] = '\t'.join(source['geo'].split('&'))
            weibo_list.append(weibo)
    return weibo_list
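# Illustrative sketch (not part of the original module): how get_influence_content() splits an
# arbitrary [timestamp_from, timestamp_to) range into per-day sub-ranges, one per daily flow_text
# index.  Dates are placeholders; flow_text_index_name_pre is assumed to look like 'flow_text_'.
#
#   timestamp_from = noon of 2013-09-01, timestamp_to = noon of 2013-09-03
#   -> range 1: [timestamp_from, start of 09-02)   queried against flow_text_2013-09-01
#      range 2: [start of 09-02, start of 09-03)   queried against flow_text_2013-09-02
#      range 3: [start of 09-03, timestamp_to)     queried against flow_text_2013-09-03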
def get_db_num(timestamp):
    date = ts2datetime(timestamp)
    date_ts = datetime2ts(date)
    r_beigin_ts = datetime2ts(R_BEGIN_TIME)
    db_number = ((date_ts - r_beigin_ts) / (DAY * 7)) % 2 + 1
    # run_type
    if RUN_TYPE == 0:
        db_number = 1
    return db_number
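# Worked example (illustrative only): get_db_num() alternates between db 1 and db 2 week by week,
# counting whole weeks since R_BEGIN_TIME.
#   weeks_elapsed = (date_ts - r_beigin_ts) / (DAY * 7)   # integer division in Python 2
#   db_number    = weeks_elapsed % 2 + 1
# e.g. during the 1st week after R_BEGIN_TIME weeks_elapsed == 0 -> db 1, during the 2nd week
# weeks_elapsed == 1 -> db 2, then back to db 1, and so on.  In test mode (RUN_TYPE == 0) the
# rotation is bypassed and db 1 is always used.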
def get_social_inter_content(uid1, uid2, type_mark):
    weibo_list = []
    #get two type relation about uid1 and uid2
    #search weibo list
    now_ts = int(time.time())
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts(RUN_TEST_TIME)
    #uid2uname
    uid2uname = {}
    try:
        portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                                body={'ids': [uid1, uid2]}, _source=False,
                                                fields=['uid', 'uname'])['docs']
    except:
        portrait_result = []
    for item in portrait_result:
        uid = item['_id']
        if item['found'] == True:
            uname = item['fields']['uname'][0]
            uid2uname[uid] = uname
        else:
            uid2uname[uid] = 'unknown'
    #iter date to search weibo list
    for i in range(7, 0, -1):
        iter_date_ts = now_date_ts - i * DAY
        iter_date = ts2datetime(iter_date_ts)
        flow_text_index_name = flow_text_index_name_pre + str(iter_date)
        query = []
        query.append({'bool': {'must': [{'term': {'uid': uid1}}, {'term': {'directed_uid': int(uid2)}}]}})
        if type_mark == 'out':
            query.append({'bool': {'must': [{'term': {'uid': uid2}}, {'term': {'directed_uid': int(uid1)}}]}})
        try:
            flow_text_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                                   body={'query': {'bool': {'should': query}},
                                                         'sort': [{'timestamp': {'order': 'asc'}}],
                                                         'size': MAX_VALUE})['hits']['hits']
        except:
            flow_text_result = []
        for flow_text in flow_text_result:
            source = flow_text['_source']
            weibo = {}
            weibo['timestamp'] = source['timestamp']
            weibo['ip'] = source['ip']
            weibo['geo'] = source['geo']
            weibo['text'] = '\t'.join(source['text'].split('&'))
            weibo['uid'] = source['uid']
            weibo['uname'] = uid2uname[weibo['uid']]
            weibo['directed_uid'] = str(source['directed_uid'])
            weibo['directed_uname'] = uid2uname[str(source['directed_uid'])]
            weibo_list.append(weibo)
    return weibo_list
def new_get_user_location(uid):
    results = {}
    now_date = ts2datetime(time.time())
    now_date_ts = datetime2ts(now_date)
    #jln
    #now_date_ts = 1378310400
    #run type
    if RUN_TYPE == 0:
        now_date_ts = datetime2ts(RUN_TEST_TIME) - DAY
        now_date = ts2datetime(now_date_ts)
    #now ip
    try:
        ip_time_string = r_cluster.hget('new_ip_' + str(now_date_ts), uid)
    except Exception, e:
        raise e
def new_get_activeness_trend(uid, time_segment):
    results = {}
    try:
        activeness_history = ES_COPY_USER_PORTRAIT.get(index=COPY_USER_PORTRAIT_ACTIVENESS,
                                                        doc_type=COPY_USER_PORTRAIT_ACTIVENESS_TYPE,
                                                        id=uid)['_source']
    except:
        activeness_history = {}
    if activeness_history:
        results = get_evaluate_trend(activeness_history, 'activeness')
    else:
        results = {}
    #deal results for situation---server power off
    new_time_list = []
    new_count_list = []
    new_results = {}
    now_time_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_time_ts))
    for i in range(time_segment, 0, -1):
        iter_date_ts = now_date_ts - i * DAY
        try:
            date_count = results[iter_date_ts]
        except:
            date_count = 0
        new_time_list.append(iter_date_ts)
        new_count_list.append(date_count)
    new_results = {'timeline': new_time_list, 'evaluate_index': new_count_list}
    return new_results
def get_group_user_track(uid):
    results = []
    #step1:get user_portrait activity_geo_dict
    try:
        portrait_result = es_user_portrait.get(index=portrait_index_name, doc_type=portrait_index_type,
                                               id=uid, _source=False, fields=['activity_geo_dict'])
    except:
        portrait_result = {}
    if portrait_result == {}:
        return 'uid is not in user_portrait'
    activity_geo_dict = json.loads(portrait_result['fields']['activity_geo_dict'][0])
    now_date_ts = datetime2ts(ts2datetime(int(time.time())))
    start_ts = now_date_ts - DAY * len(activity_geo_dict)
    #step2: iter date to get month track
    for geo_item in activity_geo_dict:
        iter_date = ts2datetime(start_ts)
        sort_day_dict = sorted(geo_item.items(), key=lambda x: x[1], reverse=True)
        if sort_day_dict:
            results.append([iter_date, sort_day_dict[0][0]])
        else:
            results.append([iter_date, ''])
        start_ts = start_ts + DAY
    return results
def get_user_geo(uid, dropped_geos=u"中国&美国"):
    """
    :param uid: the user's id
    :param dropped_geos: '&'-separated locations to drop, since every geo string contains 中国
    :return: set of geo locations
    """
    dropped_geos = set(dropped_geos.split("&"))
    # get the user's preference from the portrait index
    try:
        user_portrait_result = es_user_portrait. \
            get_source(index=portrait_index_name, doc_type=profile_index_type, id=uid)
    except NotFoundError:
        user_portrait_result = None
    # geo information exists in the portrait index
    if user_portrait_result and len(user_portrait_result["activity_geo"]) > 0:
        geos = user_portrait_result["activity_geo"] - dropped_geos
    # otherwise extract geo from the weibos posted during the last week
    else:
        flow_text_index_list = []
        now_timestamp = datetime2ts(ts2datetime(time.time()))
        if RUN_TYPE == 0:
            now_timestamp = datetime2ts(RUN_TEST_TIME)
        for i in range(7, 0, -1):
            iter_date = ts2datetime(now_timestamp - DAY * i)
            flow_text_index_list.append(flow_text_index_name_pre + iter_date)
        weibo_all = es_flow_text.search(index=flow_text_index_list, doc_type=flow_text_index_type,
                                        body={'query': {'filtered': {'filter': {'term': {'uid': uid}}}},
                                              'size': 2000})['hits']['hits']
        geos = set()
        for temp in weibo_all:
            geos |= set(temp["_source"]["geo"].split("&"))
    return geos
def search_fans_new(uid, top_count):
    results = {}
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    db_number = get_db_num(now_date_ts)
    fan_result_new = es_fans.get(index=fans_index_name, doc_type=fans_index_type, id=uid)['_source']
    fan_result_new = json.loads(fan_result_new['uid_be_retweet'])
    out_portrait_list = []
    # print fan_result_new
    i = 1
    for key in fan_result_new:
        # print key
        fansnum = 0
        user_friendsnum = 0
        user_weibo_count = 0
        fans_count = 0
        uid = fan_result_new[key]['uid']
        if fan_result_new[key]['photo_url']:
            photo_url = fan_result_new[key]['photo_url']
        else:
            photo_url = "http://tp2.sinaimg.cn/1878376757/50/0/1"
        if fan_result_new[key]['nick_name']:
            uname = fan_result_new[key]['nick_name']
        else:
            uname = uid
        if fan_result_new[key]['times']:
            fans_count = fan_result_new[key]['times']
        else:
            fans_count = 0
        if fan_result_new[key]['fansnum']:
            fansnum = fan_result_new[key]['fansnum']
        else:
            fansnum = 0
        if fan_result_new[key]['friendsnum']:
            user_friendsnum = fan_result_new[key]['friendsnum']
        else:
            user_friendsnum = 0
        if fan_result_new[key]['statusnum']:
            user_weibo_count = fan_result_new[key]['statusnum']
        else:
            user_weibo_count = 0
        out_portrait_list.append({'uid': uid, 'photo_url': photo_url, 'uname': uname, 'count': fans_count,
                                  'fansnum': fansnum, 'friendsnum': user_friendsnum, 'weibo_count': user_weibo_count})
        if i > 100:
            break
        i = i + 1
    return out_portrait_list
def cctv_video_rec(uid, k=10):
    flow_text_index_list = []
    now_timestamp = datetime2ts(ts2datetime(time.time()))
    if RUN_TYPE == 0:
        now_timestamp = datetime2ts(RUN_TEST_TIME)
    for i in range(7, 0, -1):
        iter_date = ts2datetime(now_timestamp - DAY * i)
        flow_text_index_list.append(flow_text_index_name_pre + iter_date)
    weibo_all = es_flow_text.search(index=flow_text_index_list, doc_type=flow_text_index_type,
                                    body={'query': {'filtered': {'filter': {'term': {'uid': uid}}}},
                                          'size': 100})['hits']['hits']
    user_words = set()
    for weibo in weibo_all:
        weibo_text = weibo["_source"]["text"]
        user_words |= set(jieba.cut(weibo_text))
    rio_dict = load_topic_video_dict(RIO_VIDEO_INFO_FILE)
    tiger_videos = load_videos(TIGER_VIDEO_INFO_FILE)
    ret_dict = dict()
    ret_dict["tiger"] = random.sample(tiger_videos, k)
    user_pref_topic = set(rio_dict.keys()) & user_words
    # if no topic matches the user's words, fall back to random topics
    if len(user_pref_topic) == 0:
        user_pref_topic = set(random.sample(rio_dict.keys(), k))
    ret_dict["rio"] = list()
    for topic in user_pref_topic:
        ret_dict["rio"].extend(rio_dict[topic])
        if len(ret_dict["rio"]) >= k:
            ret_dict["rio"] = ret_dict["rio"][:k]
            break
    return ret_dict
def get_text_index(date):
    now_ts = datetime2ts(date)
    index_list = []
    for i in range(7):
        ts = now_ts - i * DAY
        tmp_index = pre_text_index + ts2datetime(ts)
        index_list.append(tmp_index)
    return index_list
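# Usage sketch (illustrative only, assuming pre_text_index is a daily prefix such as 'flow_text_'):
#   get_text_index('2013-09-07')
#   -> ['flow_text_2013-09-07', 'flow_text_2013-09-06', ..., 'flow_text_2013-09-01']
# i.e. the seven daily indices ending on the given date, newest first.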
def search_weibo(root_uid, uid, mtype):
    query_body = {
        #'query':{
        'filter': {
            'bool': {
                'must': [{'term': {'uid': uid}},
                         {'term': {'message_type': mtype}}],
                'should': [{'term': {'root_uid': root_uid}},
                           {'term': {'directed_uid': root_uid}}],
            }
        }
        #}
    }
    index_list = []
    for i in range(7, 0, -1):
        if RUN_TYPE == 1:
            # now_date is expected to be defined at module level
            iter_date = ts2datetime(datetime2ts(now_date) - i * DAY)
        else:
            iter_date = ts2datetime(datetime2ts(RUN_TEST_TIME) - i * DAY)
        index_list.append(flow_text_index_name_pre + iter_date)
    results = es_flow_text.search(index=index_list, doc_type=flow_text_index_type, body=query_body)['hits']['hits']
    weibo = {}
    f_result = []
    if len(results) > 0:
        for result in results:
            #print type(result),result
            weibo['last_text'] = [result['_source']['text'], result['_source']['text'], result['_source']['timestamp']]
            mid = result['_source']['root_mid']
            # print mid
            len_pre = len(flow_text_index_name_pre)
            index = result['_index'][len_pre:]
            root_index = []
            for j in range(0, 7):  # one week; use range(0, 30) for a month
                iter_date = ts2datetime(datetime2ts(index) - j * DAY)
                root_index.append(flow_text_index_name_pre + iter_date)
            results0 = es_flow_text.search(index=root_index, doc_type=flow_text_index_type,
                                           body={'query': {'term': {'mid': mid}}})['hits']['hits']
            if len(results0) > 0:
                for result0 in results0:
                    weibo['ori_text'] = [result0['_source']['text'], result0['_source']['timestamp']]
                f_result.append(weibo)
                weibo = {}
    return f_result
def search_mention(uid):
    now_date_ts = datetime2ts(ts2datetime(time.time()))
    #run type
    if RUN_TYPE == 0:
        now_date_ts = datetime2ts(RUN_TEST_TIME)
    day_result_dict_list = []
    for i in range(7, 0, -1):
        iter_ts = now_date_ts - i * DAY
        try:
            result_string = r_cluster.hget('at_' + str(iter_ts), str(uid))
        except:
            result_string = ''
        if not result_string:
            continue
        day_result_dict = json.loads(result_string)
        day_result_dict_list.append(day_result_dict)
    if day_result_dict_list:
        week_result_dict = union_dict(day_result_dict_list)
    else:
        week_result_dict = {}
    return week_result_dict
def conclusion_on_activeness(uid):
    # test
    index_name = copy_portrait_index_name
    index_type = copy_portrait_index_type
    try:
        influ_result = es.get(index=index_name, doc_type=index_type, id=uid)['_source']
    except:
        influ_result = {}
        result = activeness_conclusion_dict['0']
        return result
    # generate time series---keys
    now_ts = time.time()
    now_ts = datetime2ts('2013-09-12')
    activeness_set = set()
    for i in range(N):
        ts = ts2datetime(now_ts - i * 3600 * 24)
        activeness_set.add(pre_activeness + ts)
    # separate the activeness keys from the influence keys
    keys_set = set(influ_result.keys())
    activeness_keys = keys_set & activeness_set
    if activeness_keys:
        activeness_value = []
        for key in activeness_keys:
            activeness_value.append(influ_result[key])
        mean, std_var = level(activeness_value)
        if mean < activeness_level[0]:
            result = activeness_conclusion_dict['1']
        elif mean >= activeness_level[0] and mean < activeness_level[1]:
            result = activeness_conclusion_dict['2']
        elif mean >= activeness_level[1] and mean < activeness_level[2]:
            result = activeness_conclusion_dict['3']
        elif mean >= activeness_level[2] and mean < activeness_level[3]:
            result = activeness_conclusion_dict['4']
        else:
            result = activeness_conclusion_dict['5']
    else:
        result = activeness_conclusion_dict['0']
    return result
def ajax_specified_user_active():
    date = request.args.get('date', '')  # '2013-09-01'
    uid = request.args.get('uid', '')  # 123456,123456
    date = str(date)
    results = []
    if date and uid:
        if RUN_TYPE == 0:
            timetemp = datetime2ts(RUN_TEST_TIME) - DAY
            date = ts2datetime(timetemp)
        print date
        index_name = pre_influence_index + date.replace('-', '')
        list_1 = []
        uid_list = [item for item in uid.split(',')]
        result = search_influence_detail(uid_list, index_name, "bci")
        description = influence_description(result)
        results.append(result)
        results.append(description)
    return json.dumps(results)
def search_fans(uid, top_count):
    results = {}
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    db_number = get_db_num(now_date_ts)
    be_comment_index_name = be_comment_index_name_pre + str(db_number)
    be_retweet_index_name = be_retweet_index_name_pre + str(db_number)
    result = {}
    be_retweet_inter_dict = {}
    be_comment_inter_dict = {}
    center_uid = uid
    try:
        be_retweet_result = es_retweet.get(index=be_retweet_index_name, doc_type=be_retweet_index_type, id=uid)['_source']
    except:
        be_retweet_result = {}
    if be_retweet_result:
        be_retweet_uid_dict = json.loads(be_retweet_result['uid_be_retweet'])
    else:
        be_retweet_uid_dict = {}
    # print "be_retweet_uid_dict", be_retweet_uid_dict
    try:
        be_comment_result = es_be_comment.get(index=be_comment_index_name, doc_type=be_comment_index_type, id=uid)['_source']
    except:
        be_comment_result = {}
    if be_comment_result:
        be_comment_uid_dict = json.loads(be_comment_result['uid_be_comment'])
    else:
        be_comment_uid_dict = {}
    # print "be_comment_uid_dict", be_comment_uid_dict
    fans_result = union_dict(be_retweet_uid_dict, be_comment_uid_dict)
    fans_user_set = set(fans_result.keys())
    fans_list = list(fans_user_set)
    # print "fans_list", fans_list
    all_fans_dict = {}
    for fans_user in fans_list:
        if fans_user != center_uid:
            all_fans_dict[fans_user] = fans_result[fans_user]
    sort_all_fans_dict = sorted(all_fans_dict.items(), key=lambda x: x[1], reverse=True)
    all_fans_uid_list = []
    all_fans_uid_list_all = [item[0] for item in sort_all_fans_dict]
    count = 0
    for i in all_fans_uid_list_all:
        count += 1
        all_fans_uid_list.append(i)
        if count == 1000:
            break
    # print all_fans_uid_list
    out_portrait_list = all_fans_uid_list
    #use to get user information from user profile
    out_portrait_result = {}
    try:
        out_user_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                               body={'ids': out_portrait_list})['docs']
    except:
        out_user_result = []
    #add index from bci_history
    try:
        bci_history_result = es_bci_history.mget(index=bci_history_index_name, doc_type=bci_history_index_type,
                                                 body={'ids': out_portrait_list}, fields=fields)['docs']
    except:
        bci_history_result = []
    iter_count = 0
    out_portrait_list = []
    for out_user_item in out_user_result:
        uid = out_user_item['_id']
        if out_user_item['found'] == True:
            source = out_user_item['_source']
            uname = source['nick_name']
            photo_url = source['photo_url']
            if uname == '':
                uname = u'未知'
            location = source['user_location']
            friendsnum = source['friendsnum']
        else:
            uname = u'未知'
            location = ''
            friendsnum = ''
            photo_url = 'unknown'
        #add index from bci_history
        try:
            bci_history_item = bci_history_result[iter_count]
        except:
            bci_history_item = {'found': False}
        # print bci_history_item
        if bci_history_item['found'] == True:
            fansnum = bci_history_item['fields'][fields[0]][0]
            user_weibo_count = bci_history_item['fields'][fields[1]][0]
            user_friendsnum = bci_history_item['fields'][fields[2]][0]
            influence = bci_history_item['fields'][fields[3]][0]
        else:
            fansnum = ''
            user_weibo_count = ''
            user_friendsnum = ''
        fans_count = int(all_fans_dict[uid])
        out_portrait_list.append({'uid': uid, 'photo_url': photo_url, 'uname': uname, 'count': fans_count,
                                  'fansnum': fansnum, 'friendsnum': user_friendsnum, 'weibo_count': user_weibo_count})
        iter_count += 1
    return out_portrait_list
def search_mention(now_ts, uid, top_count):
    date = ts2datetime(now_ts)
    #evaluate_max_dict = get_evaluate_max()
    ts = datetime2ts(date)
    stat_results = dict()
    results = dict()
    uid_dict = {}
    for i in range(1, 8):
        ts = ts - DAY
        try:
            result_string = r_cluster.hget('at_' + str(ts), str(uid))
        except:
            result_string = ''
        if not result_string:
            continue
        result_dict = json.loads(result_string)
        for at_uname in result_dict:
            try:
                stat_results[at_uname] += result_dict[at_uname]
            except:
                stat_results[at_uname] = result_dict[at_uname]
    sort_stat_results = sorted(stat_results.items(), key=lambda x: x[1], reverse=True)
    # print sort_stat_results
    out_portrait_list = []
    out_list = stat_results.keys()
    #use to get user information from user profile
    out_query_list = [{'match': {'uname': item}} for item in out_list]
    if len(out_query_list) != 0:
        query = [{'bool': {'should': out_query_list}}]
        try:
            out_profile_result = es_user_profile.search(index=profile_index_name, doc_type=profile_index_type,
                                                        body={'query': {'bool': {'must': query}},
                                                              'size': 100})['hits']['hits']
        except:
            out_profile_result = []
    else:
        out_profile_result = []
    out_in_profile_list = []
    bci_search_id_list = []
    for out_item in out_profile_result:
        source = out_item['_source']
        uname = source['nick_name']
        uid = source['uid']
        location = source['location']
        friendsnum = source['friendsnum']
        out_portrait_list.append([uid, uname, stat_results[uname], '', location, friendsnum, ''])
        out_in_profile_list.append(uname)
        #use to search bci history
        bci_search_id_list.append(uid)
    out_out_profile_list = list(set(out_list) - set(out_in_profile_list))
    for out_out_item in out_out_profile_list:
        out_portrait_list.append(['', out_out_item, stat_results[out_out_item], '', '', '', ''])
    #add index from bci_history
    new_out_portrait_list = []
    try:
        bci_history_result = es_bci_history.mget(index=bci_history_index_name, doc_type=bci_history_index_type,
                                                 body={'ids': bci_search_id_list},
                                                 fields=['user_fansnum', 'weibo_month_sum', 'user_friendsnum'])['docs']
    except:
        bci_history_result = []
    iter_count = 0
    for out_portrait_item in out_portrait_list:
        append_dict = {}
        try:
            bci_history_item = bci_history_result[iter_count]
        except:
            bci_history_item = {}
        new_out_portrait_item = out_portrait_item
        append_dict['uid'] = out_portrait_item[0]
        append_dict['uname'] = out_portrait_item[1]
        append_dict['count'] = out_portrait_item[2]
        if bci_history_item:
            if bci_history_item['found'] == True:
                fansnum = bci_history_item['fields']['user_fansnum'][0]
                user_weibo_count = bci_history_item['fields']['weibo_month_sum'][0]
                user_friendsnum = bci_history_item['fields']['user_friendsnum'][0]
            else:
                fansnum = ''
                user_weibo_count = ''
                user_friendsnum = ''
        else:
            fansnum = ''
            user_weibo_count = ''
            user_friendsnum = ''
        append_dict['fansnum'] = fansnum
        append_dict['weibo_count'] = user_weibo_count
        append_dict['friendsnum'] = user_friendsnum
        # new_out_portrait_item[3] = fansnum
        # new_out_portrait_item[6] = user_weibo_count
        # new_out_portrait_item[-2] = user_friendsnum
        #new_out_portrait_list.append(new_out_portrait_item)
        new_out_portrait_list.append(append_dict)
        iter_count += 1
        # print append_dict
    # each item: uid, name, mention count, fans count, registration location, friends count, weibo count
    return new_out_portrait_list
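# Data-layout note (inferred from the code above, not documented in the original source):
# r_cluster is assumed to hold one hash per day, keyed 'at_<midnight timestamp>', where each field
# is a uid and each value is a JSON object mapping mentioned unames to daily counts, e.g.
#   HGET at_<day_ts> <uid>  ->  '{"some_uname": 3, "another_uname": 1}'
# search_mention() sums these per-uname counts over the previous seven days before ranking them.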
def localRec(uid, queryInterval=HOUR*25*7, k=200):
    # run mode:
    # 0 -> the current time is fixed at 2016-11-28 00:00:00
    # 1 -> use the real current time
    now_timestamp = datetime2ts(ts2datetime(time.time()))
    if RUN_TYPE == 0:
        now_timestamp = datetime2ts(RUN_TEST_TIME)
    flow_text_index_list = []
    for i in range(7, 0, -1):
        iter_date = ts2datetime(now_timestamp - DAY * i)
        flow_text_index_list.append(flow_text_index_name_pre + iter_date)
    # get the user's geo locations
    # user_geos = get_user_geo(uid)
    # # query weibos by location
    # weibo_all = es_flow_text.search(index=flow_text_index_list, doc_type=ads_weibo_index_type,
    #                                 body={"query": {"bool": {"must":
    #                                       [{"match": {"keywords_string": "新闻"}},
    #                                        {"match": {"geo": "合肥"}}]}},
    #                                       "size": 200})["hits"]["hits"]
    '''texts longer than 100 characters can be queried directly, but it is slow:
    {"query":{"filtered":{"query":{"bool":{"must":[{"match":{"keywords_string":"新闻"}},{"match":{"geo":"合肥"}}]}},"filter":{"regexp":{"text":{"value":".{100,}"}}}}}}
    '''
    ip = get_user_ip(uid)
    ip = ".".join(ip.split(".")[:-2])
    print '326'
    weibo_all = es_flow_text.search(index=flow_text_index_list, doc_type=ads_weibo_index_type,
                                    body={"query": {"bool": {"must": [{"prefix": {"text.ip": ip}}]}},
                                          "size": 2000})["hits"]["hits"]
    local_weibo_rec = []
    weibo_user_uids = [weibo["_source"]["uid"] for weibo in weibo_all]
    print '332', len(weibo_all)
    # user_profiles = search_user_profile_by_user_ids(weibo_user_uids)
    exists_ip = set()
    topic_word_weight_dic = construct_topic_word_weight_dic(ADS_TOPIC_TFIDF_DIR)
    for weibo in weibo_all:
        weibo = weibo["_source"]
        weibo_text = weibo["text"]
        if weibo["ip"] in exists_ip:
            continue  # keep only one weibo per ip
        exists_ip.add(weibo["ip"])
        if not is_suit(weibo_text):
            continue
        weibo["len"] = len(weibo_text)
        try:
            mid = weibo["mid"]
            uid = weibo["uid"]
        except:
            continue
        weibo["weibo_url"] = weiboinfo2url(uid, mid)
        weibo["weibo_topic"] = judge_ads_topic(list(jieba.cut(weibo_text)), topic_word_weight_dic)
        # many user profiles may not be found
        # if uid in user_profiles:
        #     weibo["photo_url"] = user_profiles[uid]["photo_url"]
        #     weibo["nick_name"] = user_profiles[uid]["nick_name"]
        # else:
        #     weibo["photo_url"] = "None"
        #     weibo["nick_name"] = "None"
        # local_weibo_rec.append(weibo)
        local_weibo_rec.append(weibo)
    return local_weibo_rec
def search_bidirect_interaction(uid, top_count):
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    db_number = get_db_num(now_date_ts)
    retweet_index_name = retweet_index_name_pre + str(db_number)
    be_retweet_index_name = be_retweet_index_name_pre + str(db_number)
    comment_index_name = comment_index_name_pre + str(db_number)
    be_comment_index_name = be_comment_index_name_pre + str(db_number)
    results = {}
    retweet_inter_dict = {}
    comment_inter_dict = {}
    center_uid = uid
    #bidirect interaction in retweet and be_retweet
    try:
        retweet_result = es_retweet.get(index=retweet_index_name, doc_type=retweet_index_type, id=uid)['_source']
    except:
        retweet_result = {}
    if retweet_result:
        retweet_uid_dict = json.loads(retweet_result['uid_retweet'])
    else:
        retweet_uid_dict = {}
    retweet_uid_list = retweet_uid_dict.keys()
    try:
        be_retweet_result = es_retweet.get(index=be_retweet_index_name, doc_type=be_retweet_index_type, id=uid)['_source']
    except:
        be_retweet_result = {}
    if be_retweet_result:
        be_retweet_uid_dict = json.loads(be_retweet_result['uid_be_retweet'])
    else:
        be_retweet_uid_dict = {}
    #bidirect interaction in comment and be_comment
    try:
        comment_result = es_comment.get(index=comment_index_name, doc_type=comment_index_type, id=uid)['_source']
    except:
        comment_result = {}
    if comment_result:
        comment_uid_dict = json.loads(comment_result['uid_comment'])
    else:
        comment_uid_dict = {}
    comment_uid_list = comment_uid_dict.keys()
    try:
        be_comment_result = es_comment.get(index=be_comment_index_name, doc_type=be_comment_index_type, id=uid)['_source']
    except:
        be_comment_result = {}
    if be_comment_result:
        be_comment_uid_dict = json.loads(be_comment_result['uid_be_comment'])
    else:
        be_comment_uid_dict = {}
    #get bidirect_interaction dict
    #all_interaction_dict = union_dict(retweet_inter_dict, comment_inter_dict)
    retweet_comment_result = union_dict(retweet_uid_dict, comment_uid_dict)
    be_retweet_comment_result = union_dict(be_retweet_uid_dict, be_comment_uid_dict)
    interaction_user_set = set(retweet_comment_result.keys()) & set(be_retweet_comment_result.keys())
    interaction_user_list = list(interaction_user_set)
    all_interaction_dict = {}
    for interaction_user in interaction_user_list:
        if interaction_user != center_uid:
            all_interaction_dict[interaction_user] = retweet_comment_result[interaction_user] + \
                                                     be_retweet_comment_result[interaction_user]
    sort_all_interaction_dict = sorted(all_interaction_dict.items(), key=lambda x: x[1], reverse=True)
    #get in_portrait_list, in_portrait_results and out_portrait_list
    all_interaction_uid_list = [item[0] for item in sort_all_interaction_dict]
    #print all_interaction_uid_list
    # if RUN_TYPE == 0:
    #     all_interaction_dict = {'2029036025':3,'1282005885':2,'2549228714':2,'1809833450':1}
    #     all_interaction_uid_list = ['2029036025', '1282005885', '2549228714', '1809833450']
    out_portrait_list = all_interaction_uid_list
    #use to get user information from user profile
    out_portrait_result = {}
    try:
        out_user_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                               body={'ids': out_portrait_list})['docs']
    except:
        out_user_result = []
    #add index from bci_history
    try:
        bci_history_result = es_bci_history.mget(index=bci_history_index_name, doc_type=bci_history_index_type,
                                                 body={'ids': out_portrait_list}, fields=fields)['docs']
    except:
        bci_history_result = []
    iter_count = 0
    out_portrait_list = []
    for out_user_item in out_user_result:
        uid = out_user_item['_id']
        if out_user_item['found'] == True:
            source = out_user_item['_source']
            uname = source['nick_name']
            photo_url = source['photo_url']
            if uname == '':
                uname = u'未知'
            location = source['user_location']
            friendsnum = source['friendsnum']
        else:
            uname = u'未知'
            location = ''
            friendsnum = ''
            photo_url = 'unknown'
        #add index from bci_history
        try:
            bci_history_item = bci_history_result[iter_count]
        except:
            bci_history_item = {'found': False}
        # print bci_history_item
        if bci_history_item['found'] == True:
            fansnum = bci_history_item['fields'][fields[0]][0]
            user_weibo_count = bci_history_item['fields'][fields[1]][0]
            user_friendsnum = bci_history_item['fields'][fields[2]][0]
            influence = bci_history_item['fields'][fields[3]][0]
        else:
            fansnum = ''
            user_weibo_count = ''
            user_friendsnum = ''
        interaction_count = int(all_interaction_dict[uid])
        out_portrait_list.append({'uid': uid, 'photo_url': photo_url, 'uname': uname, 'count': interaction_count,
                                  'fansnum': fansnum, 'friendsnum': user_friendsnum, 'weibo_count': user_weibo_count})
        iter_count += 1
    return out_portrait_list
def group_user_weibo(task_name, submit_user, sort_type):
    weibo_list = []
    now_date = ts2datetime(time.time())
    if sort_type == 'retweet':
        sort_type = 'retweeted'
    #run_type
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'
    #step1: get group user
    task_id = submit_user + '-' + task_name
    try:
        group_exist_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,
                                                 id=task_id)['_source']
    except:
        group_exist_result = {}
    if not group_exist_result:
        return 'group no exist'
    #step2: get user weibo list
    uid_list = group_exist_result['uid_list']
    for i in range(6, -1, -1):
        iter_date = ts2datetime(datetime2ts(now_date) - i * DAY)
        index_name = flow_text_index_name_pre + iter_date
        try:
            weibo_result = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,
                                               body={'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}},
                                                     'sort': [{sort_type: {'order': 'desc'}}],
                                                     'size': 100})['hits']['hits']
        except:
            weibo_result = []
        if weibo_result:
            weibo_list.extend(weibo_result)
    #sort_weibo_list = sorted(weibo_list, key=lambda x:x['_source'][sort_type], reverse=True)[:100]
    sort_weibo_list = weibo_list
    #step3: get user name
    try:
        portrait_exist_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                                      body={'ids': uid_list})['docs']
    except:
        portrait_exist_result = []
    uid2uname_dict = {}
    for portrait_item in portrait_exist_result:
        uid = portrait_item['_id']
        if portrait_item['found'] == True:
            source = portrait_item['_source']
            uname = source['uname']
        else:
            uname = 'unknown'
        uid2uname_dict[uid] = uname
    weibo_list = []
    for weibo_item in sort_weibo_list:
        source = weibo_item['_source']
        mid = source['mid']
        uid = source['uid']
        uname = uid2uname_dict[uid]
        text = source['text']
        ip = source['geo']
        timestamp = source['timestamp']
        date = ts2date(timestamp)
        sentiment = source['sentiment']
        weibo_url = weiboinfo2url(uid, mid)
        #run_type:
        if RUN_TYPE == 1:
            try:
                retweet_count = source['retweeted']
            except:
                retweet_count = 0
            try:
                comment_count = source['comment']
            except:
                comment_count = 0
            try:
                sensitive_score = source['sensitive']
            except:
                sensitive_score = 0
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        city = ip2city(ip)
        weibo_list.append([mid, uid, uname, text, ip, city, timestamp, date,
                           retweet_count, comment_count, sensitive_score, weibo_url])
    if sort_type == 'timestamp':
        new_weibo_list = sorted(weibo_list, key=lambda x: x[6], reverse=True)
    elif sort_type == 'retweeted':
        new_weibo_list = sorted(weibo_list, key=lambda x: x[8], reverse=True)
    elif sort_type == 'comment':
        new_weibo_list = sorted(weibo_list, key=lambda x: x[9], reverse=True)
    elif sort_type == 'sensitive':
        new_weibo_list = sorted(weibo_list, key=lambda x: x[10], reverse=True)
    return new_weibo_list
def new_get_user_weibo(uid, sort_type):
    results = []
    weibo_list = []
    now_date = ts2datetime(time.time())
    #run_type
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'
    #step1:get user name
    print '708'
    try:
        user_profile_result = es_user_profile.get(index=profile_index_name, doc_type=profile_index_type,
                                                  id=uid, _source=False, fields=['nick_name'])
    except:
        user_profile_result = {}
    print '714', len(user_profile_result)
    if user_profile_result:
        uname = user_profile_result['fields']['nick_name'][0]
    else:
        uname = ''
    #step2:get user weibo
    for i in range(7, 0, -1):
        if RUN_TYPE == 1:
            iter_date = ts2datetime(datetime2ts(now_date) - i * DAY)
        else:
            iter_date = '2013-09-01'
        index_name = flow_text_index_name_pre + iter_date
        print '726'
        try:
            weibo_result = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,
                                               body={'query': {'filtered': {'filter': {'term': {'uid': uid}}}},
                                                     'size': MAX_VALUE})['hits']['hits']
            #print weibo_result
        except:
            weibo_result = []
        print '732', len(weibo_result)
        if weibo_result:
            weibo_list.extend(weibo_result)
    #sort_weibo_list = sorted(weibo_list, key=lambda x:x['_source'][sort_type], reverse=True)[:100]
    mid_set = set()
    for weibo_item in weibo_list:
        source = weibo_item['_source']
        mid = source['mid']
        uid = source['uid']
        text = source['text']
        ip = source['ip']
        timestamp = source['timestamp']
        date = ts2date(timestamp)
        sentiment = source['sentiment']
        weibo_url = weiboinfo2url(uid, mid)
        #run_type
        if RUN_TYPE == 1:
            try:
                retweet_count = source['retweeted']
            except:
                retweet_count = 0
            try:
                comment_count = source['comment']
            except:
                comment_count = 0
            try:
                sensitive_score = source['sensitive']
            except:
                sensitive_score = 0
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        city = ip2city(ip)
        if mid not in mid_set:
            results.append([mid, uid, text, ip, city, timestamp, date,
                            retweet_count, comment_count, sensitive_score, weibo_url])
            mid_set.add(mid)
    if sort_type == 'timestamp':
        sort_results = sorted(results, key=lambda x: x[5], reverse=True)
    elif sort_type == 'retweet_count':
        sort_results = sorted(results, key=lambda x: x[7], reverse=True)
    elif sort_type == 'comment_count':
        sort_results = sorted(results, key=lambda x: x[8], reverse=True)
    elif sort_type == 'sensitive':
        sort_results = sorted(results, key=lambda x: x[9], reverse=True)
    print '778'
    return sort_results
def adsRec(uid, queryInterval=HOUR * 24):
    '''
    Read the ad weibos posted within queryInterval before the current time from the ads index,
    then rank them for the user according to the user's keywords.
    :param uid: user id
    :param queryInterval: how far back in time to query for ads
    :return: list of ad weibos sorted by relevance (estimated interest)
    '''
    # run mode:
    # 0 -> the current time is fixed at 2013-9-8 00:00:00
    # 1 -> use the real current time
    now_date = ts2datetime(time.time()) if RUN_TYPE == 1 else ts2datetime(datetime2ts(RUN_TEST_TIME) - DAY)
    # get the user's preference
    try:
        print uid
        user_portrait_result = es_user_portrait. \
            get_source(index=portrait_index_name, doc_type=profile_index_type, id=uid)
    except:
        return None
    user_key_words = set(user_portrait_result["keywords_string"].split("&"))
    # read directly from the ads index and score
    ads_weibo_all = es_ads_weibo.search(index=ads_weibo_index_name, doc_type=ads_weibo_index_type,
                                        body={'query': {"filtered": {"filter": {
                                                  "range": {"timestamp": {"gte": datetime2ts(now_date) - queryInterval}}}}},
                                              'size': 2000})['hits']['hits']
    random.shuffle(ads_weibo_all)
    ads_weibo_all = ads_weibo_all[:800]
    # per-topic word weights (TFIDF)
    topic_word_weight_dic = construct_topic_word_weight_dic(ADS_TOPIC_TFIDF_DIR)
    # distribution of the user over the ad topics, derived from the keywords of the user's weibos,
    # because the existing portrait topics do not fit the ad categories well
    user_topic_dic = construct_topic_feature_dic(user_key_words, topic_word_weight_dic)
    test_user_topic = {
        "3069348215": {u'\u5a31\u4e50': 10.0, u'\u751f\u6d3b': 14.0, u'\u4f53\u80b2': 1.0, u'\u8d22\u7ecf': 0,
                       u'\u6821\u56ed': 0, u'IT': 9.0, u'\u6c7d\u8f66': 1.0, u'\u6e38\u620f\u52a8\u6f2b': 6.0,
                       u'\u6559\u80b2': 2.0},
        "2218894100": {u'\u5a31\u4e50': 0, u'\u751f\u6d3b': 5.0, u'\u4f53\u80b2': 16.0, u'\u8d22\u7ecf': 1.0,
                       u'\u6821\u56ed': 0, u'IT': 0, u'\u6c7d\u8f66': 0, u'\u6e38\u620f\u52a8\u6f2b': 0,
                       u'\u6559\u80b2': 0},
        "1035933493": {u'\u5a31\u4e50': 0, u'\u751f\u6d3b': 10.0, u'\u4f53\u80b2': 17.0, u'\u8d22\u7ecf': 0,
                       u'\u6821\u56ed': 0, u'IT': 9.0, u'\u6c7d\u8f66': 8.0, u'\u6e38\u620f\u52a8\u6f2b': 0,
                       u'\u6559\u80b2': 2.0},
    }
    if uid in test_user_topic:
        user_topic_dic = test_user_topic[uid]
    # topics = [u"IT", u"体育", u"娱乐", u"教育", u"游戏动漫", u"生活", u"校园", u"生活", u"财经"]
    # for topic in topics:
    #     user_topic_dic[topic] = 1.0
    print user_topic_dic
    ads_weibo_prefer = adsPreferred(user_topic_dic, ads_weibo_all, topic_word_weight_dic, uid, 30)
    return ads_weibo_prefer
from ruman.global_utils import ES_COPY_USER_PORTRAIT, COPY_USER_PORTRAIT_INFLUENCE, COPY_USER_PORTRAIT_INFLUENCE_TYPE,\
    COPY_USER_PORTRAIT_IMPORTANCE, COPY_USER_PORTRAIT_IMPORTANCE_TYPE, COPY_USER_PORTRAIT_ACTIVENESS,\
    COPY_USER_PORTRAIT_ACTIVENESS_TYPE, COPY_USER_PORTRAIT_SENSITIVE, COPY_USER_PORTRAIT_SENSITIVE_TYPE
from ruman.global_utils import es_bci_history, bci_history_index_name, bci_history_index_type
from ruman.parameter import verified_num2ch_dict, IP_TIME_SEGMENT, DAY, MAX_VALUE
from ruman.parameter import RUN_TYPE, RUN_TEST_TIME
from ruman.global_config import R_BEGIN_TIME
from ruman.time_utils import ts2datetime, datetime2ts, ts2date
from ruman.keyword_filter import keyword_filter

evaluate_index_dict = {'bci': [COPY_USER_PORTRAIT_INFLUENCE, COPY_USER_PORTRAIT_INFLUENCE_TYPE],
                       'importance': [COPY_USER_PORTRAIT_IMPORTANCE, COPY_USER_PORTRAIT_IMPORTANCE_TYPE],
                       'activeness': [COPY_USER_PORTRAIT_ACTIVENESS, COPY_USER_PORTRAIT_ACTIVENESS_TYPE],
                       'sensitive': [COPY_USER_PORTRAIT_SENSITIVE, COPY_USER_PORTRAIT_SENSITIVE_TYPE]}

r_beigin_ts = datetime2ts(R_BEGIN_TIME)
FILTER_ITER_COUNT = 100

#use to get user profile information
def new_get_user_profile(uid):
    try:
        #print 'trying',es_user_profile,profile_index_name
        results = es_user_profile.get(index=profile_index_name, doc_type=profile_index_type,
                                      id=uid)['_source']
        #print es_user_profile,profile_index_name
    except:
        results = {}
    #get new fansnum and statusnum
    try:
        bci_history_result = es_bci_history.get(index=bci_history_index_name, doc_type=bci_history_index_type,
                                                id=uid)['_source']
    except:
def adsRec(uid, queryInterval=HOUR * 24):
    '''
    Read the ad weibos posted within queryInterval before the current time from the ads index,
    then use the user's key_word information to pick the recommended ads.
    :param uid: user id
    :param queryInterval: how far back in time to query for ads
    :return: list of ad weibos, sorted by relevance (degree of interest)
    '''
    # run type:
    # 0 -> time is fixed at 2013-9-8 00:00:00
    # 1 -> use the actual current time
    now_date = ts2datetime(time.time()) if RUN_TYPE == 1 else ts2datetime(datetime2ts(RUN_TEST_TIME) - DAY)
    # get the user's preferences
    try:
        print uid
        user_portrait_result = es_user_portrait. \
            get_source(index=portrait_index_name, doc_type=profile_index_type, id=uid)
    except:
        return None
    user_key_words = set(user_portrait_result["keywords_string"].split("&"))
    # read directly from the ads index and compute
    ads_weibo_all = es_ads_weibo.search(index=ads_weibo_index_name, doc_type=ads_weibo_index_type,
                                        body={'query': {"filtered": {"filter": {
                                                  "range": {"timestamp": {"gte": datetime2ts(now_date) - queryInterval}}}}},
                                              'size': 2000,
                                              })['hits']['hits']
    random.shuffle(ads_weibo_all)
    ads_weibo_all = ads_weibo_all[:800]
    # TFIDF word weights for each ad category
    topic_word_weight_dic = construct_topic_word_weight_dic(ADS_TOPIC_TFIDF_DIR)
    # derive the user's distribution over ad topics from the keywords of their weibos,
    # because the existing portrait topics do not fit the ad categories well
    user_topic_dic = construct_topic_feature_dic(user_key_words, topic_word_weight_dic)
    test_user_topic = {
        "3069348215": {u'\u5a31\u4e50': 10.0, u'\u751f\u6d3b': 14.0, u'\u4f53\u80b2': 1.0, u'\u8d22\u7ecf': 0,
                       u'\u6821\u56ed': 0, u'IT': 9.0, u'\u6c7d\u8f66': 1.0, u'\u6e38\u620f\u52a8\u6f2b': 6.0,
                       u'\u6559\u80b2': 2.0},
        "2218894100": {u'\u5a31\u4e50': 0, u'\u751f\u6d3b': 5.0, u'\u4f53\u80b2': 16.0, u'\u8d22\u7ecf': 1.0,
                       u'\u6821\u56ed': 0, u'IT': 0, u'\u6c7d\u8f66': 0, u'\u6e38\u620f\u52a8\u6f2b': 0,
                       u'\u6559\u80b2': 0},
        "1035933493": {u'\u5a31\u4e50': 0, u'\u751f\u6d3b': 10.0, u'\u4f53\u80b2': 17.0, u'\u8d22\u7ecf': 0,
                       u'\u6821\u56ed': 0, u'IT': 9.0, u'\u6c7d\u8f66': 8.0, u'\u6e38\u620f\u52a8\u6f2b': 0,
                       u'\u6559\u80b2': 2.0},
    }
    if uid in test_user_topic:
        user_topic_dic = test_user_topic[uid]
    # topics = [u"IT", u"体育", u"娱乐", u"教育", u"游戏动漫", u"生活", u"校园", u"生活", u"财经"]
    # for topic in topics:
    #     user_topic_dic[topic] = 1.0
    print user_topic_dic
    ads_weibo_prefer = adsPreferred(user_topic_dic, ads_weibo_all, topic_word_weight_dic, uid, 30)
    return ads_weibo_prefer
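# Hedged usage sketch for adsRec: "3069348215" is one of the test uids hard-coded
# in test_user_topic above. The exact item structure depends on adsPreferred, so
# this demo only counts the recommendations and assumes a list (or None on failure).
def demo_ads_rec():
    ads = adsRec("3069348215")
    if ads is None:
        print 'user portrait not found'
    else:
        print len(ads)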
def search_bidirect_interaction(uid, top_count):
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    db_number = get_db_num(now_date_ts)
    retweet_index_name = retweet_index_name_pre + str(db_number)
    be_retweet_index_name = be_retweet_index_name_pre + str(db_number)
    comment_index_name = comment_index_name_pre + str(db_number)
    be_comment_index_name = be_comment_index_name_pre + str(db_number)
    results = {}
    retweet_inter_dict = {}
    comment_inter_dict = {}
    center_uid = uid
    #bidirect interaction in retweet and be_retweet
    try:
        retweet_result = es_retweet.get(index=retweet_index_name, doc_type=retweet_index_type, id=uid)['_source']
    except:
        retweet_result = {}
    if retweet_result:
        retweet_uid_dict = json.loads(retweet_result['uid_retweet'])
    else:
        retweet_uid_dict = {}
    retweet_uid_list = retweet_uid_dict.keys()
    try:
        be_retweet_result = es_retweet.get(index=be_retweet_index_name, doc_type=be_retweet_index_type, id=uid)['_source']
    except:
        be_retweet_result = {}
    if be_retweet_result:
        be_retweet_uid_dict = json.loads(be_retweet_result['uid_be_retweet'])
    else:
        be_retweet_uid_dict = {}
    #bidirect interaction in comment and be_comment
    try:
        comment_result = es_comment.get(index=comment_index_name, doc_type=comment_index_type, id=uid)['_source']
    except:
        comment_result = {}
    if comment_result:
        comment_uid_dict = json.loads(comment_result['uid_comment'])
    else:
        comment_uid_dict = {}
    comment_uid_list = comment_uid_dict.keys()
    try:
        be_comment_result = es_comment.get(index=be_comment_index_name, doc_type=be_comment_index_type, id=uid)['_source']
    except:
        be_comment_result = {}
    if be_comment_result:
        be_comment_uid_dict = json.loads(be_comment_result['uid_be_comment'])
    else:
        be_comment_uid_dict = {}
    #get bidirect_interaction dict
    #all_interaction_dict = union_dict(retweet_inter_dict, comment_inter_dict)
    retweet_comment_result = union_dict(retweet_uid_dict, comment_uid_dict)
    be_retweet_comment_result = union_dict(be_retweet_uid_dict, be_comment_uid_dict)
    interaction_user_set = set(retweet_comment_result.keys()) & set(be_retweet_comment_result.keys())
    interaction_user_list = list(interaction_user_set)
    all_interaction_dict = {}
    for interaction_user in interaction_user_list:
        if interaction_user != center_uid:
            all_interaction_dict[interaction_user] = retweet_comment_result[interaction_user] + be_retweet_comment_result[interaction_user]
    sort_all_interaction_dict = sorted(all_interaction_dict.items(), key=lambda x: x[1], reverse=True)
    #get in_portrait_list, in_portrait_results and out_portrait_list
    all_interaction_uid_list = [item[0] for item in sort_all_interaction_dict]
    ## print all_interaction_uid_list
    # if RUN_TYPE == 0:
    #     all_interaction_dict = {'2029036025': 3, '1282005885': 2, '2549228714': 2, '1809833450': 1}
    #     all_interaction_uid_list = ['2029036025', '1282005885', '2549228714', '1809833450']
    out_portrait_list = all_interaction_uid_list
    #use to get user information from user profile
    out_portrait_result = {}
    try:
        out_user_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                               body={'ids': out_portrait_list})['docs']
    except:
        out_user_result = []
    #add index from bci_history
    try:
        bci_history_result = es_bci_history.mget(index=bci_history_index_name, doc_type=bci_history_index_type,
                                                 body={'ids': out_portrait_list}, fields=fields)['docs']
    except:
        bci_history_result = []
    iter_count = 0
    out_portrait_list = []
    for out_user_item in out_user_result:
        uid = out_user_item['_id']
        if out_user_item['found'] == True:
            source = out_user_item['_source']
            uname = source['nick_name']
            photo_url = source['photo_url']
            if uname == '':
                uname = u'未知'
            location = source['user_location']
            friendsnum = source['friendsnum']
        else:
            uname = u'未知'
            location = ''
            friendsnum = ''
            photo_url = 'unknown'
        #add index from bci_history
        try:
            bci_history_item = bci_history_result[iter_count]
        except:
            bci_history_item = {'found': False}
        # # print bci_history_item
        if bci_history_item['found'] == True:
            fansnum = bci_history_item['fields'][fields[0]][0]
            user_weibo_count = bci_history_item['fields'][fields[1]][0]
            user_friendsnum = bci_history_item['fields'][fields[2]][0]
            influence = bci_history_item['fields'][fields[3]][0]
        else:
            fansnum = ''
            user_weibo_count = ''
            user_friendsnum = ''
        interaction_count = int(all_interaction_dict[uid])
        out_portrait_list.append({
            'uid': uid,
            'photo_url': photo_url,
            'uname': uname,
            'count': interaction_count,
            'fansnum': fansnum,
            'friendsnum': user_friendsnum,
            'weibo_count': user_weibo_count
        })
        iter_count += 1
    return out_portrait_list
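# Hedged usage sketch for search_bidirect_interaction: the uid is a made-up
# placeholder, and top_count is accepted but not used for truncation inside the
# function above, so the caller slices the result itself here.
def demo_bidirect_interaction():
    interaction_users = search_bidirect_interaction('1234567890', 20)
    for user_item in interaction_users[:20]:
        print user_item['uid'], user_item['uname'], user_item['count']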
def search_task(task_name, submit_date, state, status, submit_user):
    results = []
    query = []
    condition_num = 0
    if task_name:
        task_name_list = task_name.split(' ')
        for item in task_name_list:
            query.append({'wildcard': {'task_name': '*' + item + '*'}})
        condition_num += 1
    if submit_date:
        submit_date_ts = datetime2ts(submit_date)
        submit_date_start = submit_date_ts
        submit_date_end = submit_date_ts + DAY
        query.append({'range': {'submit_date': {'gte': submit_date_start, 'lt': submit_date_end}}})
        condition_num += 1
    if state:
        state_list = state.split(' ')
        for item in state_list:
            query.append({'wildcard': {'state': '*' + item + '*'}})
        condition_num += 1
    if status:
        query.append({'match': {'status': status}})
        condition_num += 1
    if submit_user:
        query.append({'term': {'submit_user': submit_user}})
        condition_num += 1
    print es_group_result, group_index_name, group_index_type
    if condition_num > 0:
        query.append({'term': {'task_type': 'analysis'}})
        try:
            source = es_group_result.search(
                index=group_index_name,
                doc_type=group_index_type,
                body={
                    'query': {
                        'bool': {
                            'must': query
                        }
                    },
                    'sort': [{'count': {'order': 'desc'}}],
                    'size': MAX_VALUE
                })
        except Exception as e:
            raise e
    else:
        query.append({'term': {'task_type': 'analysis'}})
        source = es.search(
            index=group_index_name,
            doc_type=group_index_type,
            body={
                'query': {
                    'bool': {
                        'must': query
                    }
                },
                'sort': [{'count': {'order': 'desc'}}],
                'size': MAX_VALUE
            })
    try:
        task_dict_list = source['hits']['hits']
    except:
        return None
    result = []
    for task_dict in task_dict_list:
        try:
            state = task_dict['_source']['state']
        except:
            state = ''
        try:
            status = task_dict['_source']['status']
        except:
            status = 0
        #result.append([task_dict['_source']['task_name'], task_dict['_source']['submit_date'], task_dict['_source']['count'], state, status])
        result.append({'task_name': task_dict['_source']['task_name'],
                       'submit_date': ts2date(task_dict['_source']['submit_date']),
                       'group_count': task_dict['_source']['count'],
                       'status': status})
    return result
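# Hedged illustration of the bool/must body search_task assembles above: for a
# call like search_task('event monitor', '2016-11-27', '', '', 'admin'), the
# request sent to es_group_result would look roughly like the dict returned
# here (the task words, date and user are made-up examples).
def example_search_task_body():
    return {
        'query': {
            'bool': {
                'must': [
                    {'wildcard': {'task_name': '*event*'}},
                    {'wildcard': {'task_name': '*monitor*'}},
                    {'range': {'submit_date': {'gte': datetime2ts('2016-11-27'),
                                               'lt': datetime2ts('2016-11-27') + DAY}}},
                    {'term': {'submit_user': 'admin'}},
                    {'term': {'task_type': 'analysis'}}
                ]
            }
        },
        'sort': [{'count': {'order': 'desc'}}],
        'size': MAX_VALUE
    }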
from ruman.global_config import R_BEGIN_TIME
from ruman.parameter import DAY, WEEK, MAX_VALUE, HALF_HOUR, FOUR_HOUR, GEO_COUNT_THRESHOLD, PATTERN_THRESHOLD
from ruman.parameter import PSY_DESCRIPTION_FIELD, psy_en2ch_dict, psy_description_dict
from ruman.search_user_profile import search_uid2uname
from ruman.filter_uid import all_delete_uid
from ruman.parameter import IP_TIME_SEGMENT, IP_TOP, DAY, IP_CONCLUSION_TOP, domain_en2ch_dict, topic_en2ch_dict
from ruman.parameter import INFLUENCE_TREND_SPAN_THRESHOLD, INFLUENCE_TREND_AVE_MIN_THRESHOLD,\
     INFLUENCE_TREND_AVE_MAX_THRESHOLD, INFLUENCE_TREND_DESCRIPTION_TEXT
from ruman.parameter import ACTIVENESS_TREND_SPAN_THRESHOLD, ACTIVENESS_TREND_AVE_MIN_THRESHOLD,\
     ACTIVENESS_TREND_AVE_MAX_THRESHOLD, ACTIVENESS_TREND_DESCRIPTION_TEXT
from ruman.parameter import SENTIMENT_DICT, ACTIVENESS_TREND_TAG_VECTOR
from ruman.parameter import SENTIMENT_SECOND
from ruman.parameter import RUN_TYPE, RUN_TEST_TIME
from ruman.keyword_filter import keyword_filter

r_beigin_ts = datetime2ts(R_BEGIN_TIME)
WEEK = 7
emotion_mark_dict = {'126': 'positive', '127': 'negative', '128': 'anxiety', '129': 'angry'}
link_ratio_threshold = [0, 0.5, 1]
if RUN_TYPE == 0:
    fields = ['bci_week_sum', 'bci_month_ave', 'bci_month_sum', 'bci_week_ave']
else:
    fields = ['user_fansnum', 'weibo_month_sum', 'user_friendsnum', 'bci_week_ave']


def search_follower(uid, top_count):
    results = {}
    now_ts = time.time()
def group_user_weibo(task_name, submit_user, sort_type):
    weibo_list = []
    now_date = ts2datetime(time.time())
    if sort_type == 'retweet':
        sort_type = 'retweeted'
    #run_type
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'
    #step1: get group user
    task_id = submit_user + '-' + task_name
    try:
        group_exist_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,
                                                 id=task_id)['_source']
    except:
        group_exist_result = {}
    if not group_exist_result:
        return 'group no exist'
    #step2: get user weibo list
    uid_list = group_exist_result['uid_list']
    for i in range(6, -1, -1):
        iter_date = ts2datetime(datetime2ts(now_date) - i * DAY)
        index_name = flow_text_index_name_pre + iter_date
        try:
            weibo_result = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,
                                               body={'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}},
                                                     'sort': [{sort_type: {'order': 'desc'}}],
                                                     'size': 100})['hits']['hits']
        except:
            weibo_result = []
        if weibo_result:
            weibo_list.extend(weibo_result)
    #sort_weibo_list = sorted(weibo_list, key=lambda x:x['_source'][sort_type], reverse=True)[:100]
    sort_weibo_list = weibo_list
    #step3: get user name
    try:
        portrait_exist_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                                      body={'ids': uid_list})['docs']
    except:
        portrait_exist_result = []
    uid2uname_dict = {}
    for portrait_item in portrait_exist_result:
        uid = portrait_item['_id']
        if portrait_item['found'] == True:
            source = portrait_item['_source']
            uname = source['uname']
        else:
            uname = 'unknown'
        uid2uname_dict[uid] = uname
    weibo_list = []
    for weibo_item in sort_weibo_list:
        source = weibo_item['_source']
        mid = source['mid']
        uid = source['uid']
        uname = uid2uname_dict[uid]
        text = source['text']
        ip = source['geo']
        timestamp = source['timestamp']
        date = ts2date(timestamp)
        sentiment = source['sentiment']
        weibo_url = weiboinfo2url(uid, mid)
        #run_type:
        if RUN_TYPE == 1:
            try:
                retweet_count = source['retweeted']
            except:
                retweet_count = 0
            try:
                comment_count = source['comment']
            except:
                comment_count = 0
            try:
                sensitive_score = source['sensitive']
            except:
                sensitive_score = 0
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        city = ip2city(ip)
        weibo_list.append([
            mid, uid, uname, text, ip, city, timestamp, date, retweet_count,
            comment_count, sensitive_score, weibo_url
        ])
    if sort_type == 'timestamp':
        new_weibo_list = sorted(weibo_list, key=lambda x: x[6], reverse=True)
    elif sort_type == 'retweeted':
        new_weibo_list = sorted(weibo_list, key=lambda x: x[8], reverse=True)
    elif sort_type == 'comment':
        new_weibo_list = sorted(weibo_list, key=lambda x: x[9], reverse=True)
    elif sort_type == 'sensitive':
        new_weibo_list = sorted(weibo_list, key=lambda x: x[10], reverse=True)
    return new_weibo_list
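# Hedged usage sketch for group_user_weibo: task_name and submit_user are made
# up. Each returned item is the flat list built above, so index 2 is uname,
# index 3 is text and index 7 is the formatted date.
def demo_group_user_weibo():
    group_weibos = group_user_weibo('test_task', 'admin', 'timestamp')
    if group_weibos == 'group no exist':
        print group_weibos
        return
    for weibo_item in group_weibos[:10]:
        print weibo_item[2], weibo_item[7], weibo_item[3]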
def search_task(task_name, submit_date, state, status, submit_user):
    results = []
    query = []
    condition_num = 0
    if task_name:
        task_name_list = task_name.split(' ')
        for item in task_name_list:
            query.append({'wildcard': {'task_name': '*' + item + '*'}})
        condition_num += 1
    if submit_date:
        submit_date_ts = datetime2ts(submit_date)
        submit_date_start = submit_date_ts
        submit_date_end = submit_date_ts + DAY
        query.append({
            'range': {
                'submit_date': {
                    'gte': submit_date_start,
                    'lt': submit_date_end
                }
            }
        })
        condition_num += 1
    if state:
        state_list = state.split(' ')
        for item in state_list:
            query.append({'wildcard': {'state': '*' + item + '*'}})
        condition_num += 1
    if status:
        query.append({'match': {'status': status}})
        condition_num += 1
    if submit_user:
        query.append({'term': {'submit_user': submit_user}})
        condition_num += 1
    if condition_num > 0:
        query.append({'term': {'task_type': 'analysis'}})
        try:
            source = es_group_result.search(index=group_index_name,
                                            doc_type=group_index_type,
                                            body={
                                                'query': {
                                                    'bool': {
                                                        'must': query
                                                    }
                                                },
                                                'sort': [{
                                                    'count': {
                                                        'order': 'desc'
                                                    }
                                                }],
                                                'size': MAX_VALUE
                                            })
        except Exception as e:
            raise e
    else:
        query.append({'term': {'task_type': 'analysis'}})
        source = es.search(index=group_index_name,
                           doc_type=group_index_type,
                           body={
                               'query': {
                                   'bool': {
                                       'must': query
                                   }
                               },
                               'sort': [{
                                   'count': {
                                       'order': 'desc'
                                   }
                               }],
                               'size': MAX_VALUE
                           })
    try:
        task_dict_list = source['hits']['hits']
    except:
        return None
    result = []
    for task_dict in task_dict_list:
        try:
            state = task_dict['_source']['state']
        except:
            state = ''
        try:
            status = task_dict['_source']['status']
        except:
            status = 0
        result.append([
            task_dict['_source']['task_name'],
            task_dict['_source']['submit_date'],
            task_dict['_source']['count'], state, status
        ])
    return result
    new_get_influence_trend, new_get_sensitive_words
#from search_mid import index_mid
from ruman.search_user_profile import es_get_source
from ruman.global_utils import es_user_portrait as es
from ruman.parameter import SOCIAL_DEFAULT_COUNT, SENTIMENT_TREND_DEFAULT_TYPE
from ruman.parameter import DEFAULT_SENTIMENT, DAY
from ruman.parameter import RUN_TYPE, RUN_TEST_TIME
from ruman.time_utils import ts2datetime, datetime2ts
#from personal_influence import get_user_influence, influenced_detail, influenced_people, influenced_user_detail, statistics_influence_people, tag_vector, comment_on_influence, detail_weibo_influence, influence_summary

# use to test 13-09-08
test_time = datetime2ts(RUN_TEST_TIME)

# custom_attribute
attribute_index_name = 'custom_attribute'
attribute_index_type = 'attribute'

mod = Blueprint('info_person_social', __name__, url_prefix='/info_person_social')

#use to get user be_retweet from es:be_retweet_1 or be_retweet_2
#write in version:15-12-08
#input: uid, top_count
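# Hedged sketch of registering a view on the Blueprint defined above: the route
# path and parameter names are illustrative only (not an existing endpoint of
# this module), and it assumes flask's request and the json module are imported
# at the top of the file, as is usual for these Blueprint view modules.
# new_get_influence_trend is the function imported at the top of this file.
@mod.route('/influence_trend/')
def ajax_influence_trend():
    uid = request.args.get('uid', '')
    time_segment = int(request.args.get('time_segment', 7))
    return json.dumps(new_get_influence_trend(uid, time_segment))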
def influenced_people(uid, mid, influence_style, date, default_number=20):
    # uid
    # which weibo----mid, retweeted weibo ---seek for root_mid
    # influence_style: retweeted(0) or comment(1)
    date1 = ts2datetime(datetime2ts(date)).replace('-', '')
    index_name = pre_index + date1
    index_flow_text = pre_text_index + date
    text_result = es.get(index=index_flow_text, doc_type=flow_text_index_type, id=mid)["_source"]
    temp_mid = text_result.get("root_mid", '')
    # determine whether the weibo is an original weibo
    if temp_mid:
        mid_type = 1  # not an original weibo (a retweet)
    else:
        mid_type = 0  # original weibo
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                        ]
                    }
                }
            }
        },
        "size": 30000
    }
    if mid_type == 0:
        if int(influence_style) == 0:  # origin weibo, all retweeted people
            query_body["query"]["filtered"]["filter"]["bool"]["must"].extend([
                {"term": {"root_uid": uid}},
                {"term": {"message_type": 3}},
                {"term": {"root_mid": mid}}])
        else:  # commented people
            query_body["query"]["filtered"]["filter"]["bool"]["must"].extend([
                {"term": {"directed_uid": uid}},
                {"term": {"message_type": 2}},
                {"term": {"root_mid": mid}}])
    else:
        if int(influence_style) == 0:  # origin weibo, all retweeted people
            query_body["query"]["filtered"]["filter"]["bool"]["must"].extend([
                {"term": {"directed_uid": uid}},
                {"term": {"message_type": 3}},
                {"term": {"root_mid": temp_mid}}])
        else:  # commented people
            query_body["query"]["filtered"]["filter"]["bool"]["must"].extend([
                {"term": {"directed_uid": uid}},
                {"term": {"message_type": 2}},
                {"term": {"root_mid": temp_mid}}])
    search_results = es.search(index=index_flow_text, doc_type=flow_text_index_type, body=query_body,
                               fields=["uid"], timeout=30)["hits"]["hits"]
    results = []  # uid_list
    if search_results:
        for item in search_results:
            if int(item["fields"]["uid"][0]) == int(uid):
                pass
            else:
                results.append(item["fields"]["uid"][0])
        results = list(set(results))
    else:
        results = []
    bci_index = "bci_" + date.replace('-', '')
    if results:
        portrait_results = es_user_portrait.mget(index=user_portrait, doc_type=portrait_index_type,
                                                 body={"ids": results},
                                                 fields=["domain", "topic_string", "activity_geo_dict", "importance", "influence"])["docs"]
        bci_results = es_user_portrait.mget(index=bci_index, doc_type='bci', body={"ids": results},
                                            fields=['user_index'])['docs']
    else:
        portrait_results = {}
        bci_results = {}
    in_portrait = []
    out_portrait = []
    in_portrait_info = []
    retweeted_domain = {}
    retweeted_topic = {}
    retweeted_geo = {}
    average_influence = 0
    total_influence = 0
    count = 0
    if bci_results:
        total_influence = 0
        for item in bci_results:
            if item['found']:
                total_influence += item['fields']['user_index'][0]
        try:
            average_influence = total_influence / len(results)
        except:
            average_influence = 0
    if portrait_results:
        for item in portrait_results:
            if item["found"]:
                temp = []
                count += 1
                temp.append(item['_id'])
                temp.append(item["fields"]["importance"][0])
                in_portrait.append(temp)
                temp_domain = item["fields"]["domain"][0].split('&')
                temp_topic = item["fields"]["topic_string"][0].split('&')
                temp_geo = json.loads(item["fields"]["activity_geo_dict"][0])[-1].keys()
                #total_influence += item["fields"]["influence"][0]
                retweeted_domain = aggregation(temp_domain, retweeted_domain)
                retweeted_topic = aggregation(temp_topic, retweeted_topic)
                retweeted_geo = aggregation(temp_geo, retweeted_geo)
            else:
                out_portrait.append(item['_id'])
        retweeted_domain = proportion(retweeted_domain)
        retweeted_topic = proportion(retweeted_topic)
        retweeted_geo = proportion(retweeted_geo)
    #try:
    #    average_influence = total_influence/count
    #except:
    #    average_influence = 0
    sorted_retweeted_domain = sorted(retweeted_domain.items(), key=lambda x: x[1], reverse=True)
    sorted_retweeted_topic = sorted(retweeted_topic.items(), key=lambda x: x[1], reverse=True)
    sorted_retweeted_geo = sorted(retweeted_geo.items(), key=lambda x: x[1], reverse=True)
    retweeted_results = dict()
    retweeted_results["domian"] = sorted_retweeted_domain[:5]
    retweeted_results["topic"] = sorted_retweeted_topic[:5]
    retweeted_results["geo"] = sorted_retweeted_geo[:5]
    retweeted_results["influence"] = average_influence
    in_portrait = sorted(in_portrait, key=lambda x: x[1], reverse=True)
    temp_list = []
    for item in in_portrait:
        temp_list.append(item[0])
    retweeted_results['in_portrait_number'] = len(temp_list)
    retweeted_results['out_portrait_number'] = len(out_portrait)
    in_portrait_url = get_user_url(temp_list[:default_number])
    out_portrait_url = get_user_url(out_portrait[:default_number])
    return_results = dict()
    return_results["influence_users"] = [in_portrait_url, out_portrait_url]
    return_results["influence_distribution"] = retweeted_results
    return return_results
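# Hedged usage sketch for influenced_people: uid, mid and date are placeholders;
# influence_style '0' asks for retweeters and '1' for commenters, as handled
# above. Only keys that the function itself builds are read here.
def demo_influenced_people():
    influence_result = influenced_people('1234567890', 'abcdefg', '0', '2013-09-07')
    distribution = influence_result['influence_distribution']
    print distribution['influence'], distribution['in_portrait_number'], distribution['out_portrait_number']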