def get_text(top_list, date, style):
    """Attach weibo content and author info to a ranked mid list.

    :param top_list: list of [mid, count] pairs, e.g. [[mid1, n1], [mid2, n2]]
    :param date: date string appended to ``pre_text_index`` to pick the
                 daily flow-text index
    :param style: unused here; kept for interface compatibility with callers
    :return: list of rows; each row is the original [mid, count] extended with
             [text, geo, date, sentiment, weibo_url, user_url, uid, nick_name]
             (eight empty strings when the mid is not found in ES)
    """
    results = []
    index_flow_text = pre_text_index + date
    if not top_list:  # no one
        return results
    mid_list = [item[0] for item in top_list]
    search_result = es.mget(index=index_flow_text, doc_type=flow_text_index_type,
                            body={"ids": mid_list})["docs"]
    # mget preserves request order, so pair each ranked item with its doc
    for item, doc in zip(top_list, search_result):
        temp = list(item)
        if doc['found']:
            source = doc['_source']
            temp.append(source["text"])
            temp.append(source["geo"])
            temp.append(ts2date(source["timestamp"]))
            temp.append(source["sentiment"])
            temp.append(weiboinfo2url(source['uid'], source['mid']))
            temp.append(uid_url + source['uid'])
            temp.append(source['uid'])
            # best-effort nick-name lookup; profile may be missing
            try:
                uname = es_profile.get(index=profile_index_name,
                                       doc_type=profile_index_type,
                                       id=source['uid'])["_source"]["nick_name"]
            except Exception:
                uname = "unknown"
            temp.append(uname)
        else:
            temp.extend(["", "", "", "", "", "", "", ""])
        results.append(temp)
    return results
def new_get_user_weibo(uid, sort_type):
    """Return up to 100 of a user's weibos from the last 7 daily indices.

    :param uid: user id to query
    :param sort_type: ES sort field (forced to 'timestamp' when
                      RUN_TYPE == 0, i.e. test mode)
    :return: list of rows [mid, uid, text, ip, city, timestamp, date,
             retweet_count, comment_count, sensitive_score]
    """
    results = []
    weibo_list = []
    now_date = ts2datetime(time.time())
    # run_type: test mode pins the date and the sort field
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'
    # step1: get user name
    # NOTE(review): uname is computed but never used below — confirm intent
    try:
        user_profile_result = es_user_profile.get(index=profile_index_name,
                                                  doc_type=profile_index_type,
                                                  id=uid, _source=False,
                                                  fields=['nick_name'])
    except Exception:
        user_profile_result = {}
    if user_profile_result:
        uname = user_profile_result['fields']['nick_name'][0]
    else:
        uname = ''
    # step2: collect the user's weibo over the previous 7 daily indices
    for i in range(7, 0, -1):
        iter_date = ts2datetime(datetime2ts(now_date) - i * DAY)
        index_name = flow_text_index_name_pre + iter_date
        try:
            weibo_result = es_flow_text.search(
                index=index_name, doc_type=flow_text_index_type,
                body={'query': {'filtered': {'filter': {'term': {'uid': uid}}}},
                      'sort': sort_type, 'size': 100})['hits']['hits']
        except Exception:
            weibo_result = []
        if weibo_result:
            weibo_list.extend(weibo_result)
    # fix: removed debug "print weibo_list[0]" which raised IndexError
    # whenever the user had no weibo in the whole window
    sort_weibo_list = sorted(weibo_list, key=lambda x: x['_source'][sort_type],
                             reverse=True)[:100]
    for weibo_item in sort_weibo_list:
        source = weibo_item['_source']
        mid = source['mid']
        uid = source['uid']
        text = source['text']
        ip = source['geo']
        timestamp = source['timestamp']
        date = ts2date(timestamp)
        # run_type: interaction counters only exist in production data
        if RUN_TYPE == 1:
            retweet_count = source['retweet_count']
            comment_count = source['comment_count']
            sensitive_score = source['sensitive']
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        city = ip2city(ip)
        results.append([mid, uid, text, ip, city, timestamp, date,
                        retweet_count, comment_count, sensitive_score])
    return results
def new_get_user_weibo(uid, sort_type):
    """Return up to 100 of a user's weibos from the last 7 daily indices.

    :param uid: user id to query
    :param sort_type: ES sort field (forced to 'timestamp' when
                      RUN_TYPE == 0, i.e. test mode)
    :return: list of rows [mid, uid, text, ip, city, timestamp, date,
             retweet_count, comment_count, sensitive_score]
    """
    results = []
    weibo_list = []
    now_date = ts2datetime(time.time())
    # run_type: test mode pins the date and the sort field
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'
    # step1: get user name
    # NOTE(review): uname is computed but never used below — confirm intent
    try:
        user_profile_result = es_user_profile.get(index=profile_index_name,
                                                  doc_type=profile_index_type,
                                                  id=uid, _source=False,
                                                  fields=['nick_name'])
    except Exception:
        user_profile_result = {}
    if user_profile_result:
        uname = user_profile_result['fields']['nick_name'][0]
    else:
        uname = ''
    # step2: collect the user's weibo over the previous 7 daily indices
    for i in range(7, 0, -1):
        iter_date = ts2datetime(datetime2ts(now_date) - i * DAY)
        index_name = flow_text_index_name_pre + iter_date
        try:
            weibo_result = es_flow_text.search(
                index=index_name, doc_type=flow_text_index_type,
                body={'query': {'filtered': {'filter': {'term': {'uid': uid}}}},
                      'sort': sort_type, 'size': 100})['hits']['hits']
        except Exception:
            weibo_result = []
        if weibo_result:
            weibo_list.extend(weibo_result)
    # fix: removed debug "print weibo_list[0]" which raised IndexError
    # whenever the user had no weibo in the whole window
    sort_weibo_list = sorted(weibo_list, key=lambda x: x['_source'][sort_type],
                             reverse=True)[:100]
    for weibo_item in sort_weibo_list:
        source = weibo_item['_source']
        mid = source['mid']
        uid = source['uid']
        text = source['text']
        ip = source['geo']
        timestamp = source['timestamp']
        date = ts2date(timestamp)
        # run_type: interaction counters only exist in production data
        if RUN_TYPE == 1:
            retweet_count = source['retweet_count']
            comment_count = source['comment_count']
            sensitive_score = source['sensitive']
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        city = ip2city(ip)
        results.append([mid, uid, text, ip, city, timestamp, date,
                        retweet_count, comment_count, sensitive_score])
    return results
def get_text(top_list, date, user_info, style):
    """Attach weibo content, counters and author info to a ranked mid list.

    :param top_list: list of [mid, count] pairs
    :param date: date string appended to ``pre_text_index`` to pick the
                 daily flow-text index
    :param user_info: dict holding JSON-encoded per-mid detail counters
    :param style: 0/1 select the origin-weibo detail dicts, 2/other the
                  retweeted-weibo ones; also decides column order of the
                  two counters (retweeted first, then comment)
    :return: rows of [mid, retweeted, comment, text, geo, date, sentiment,
             weibo_url, user_url, uid, nick_name]; missing mids get eight
             empty strings after the two counters
    """
    results = []
    detail_list = ["origin_weibo_retweeted_detail", "origin_weibo_comment_detail",
                   "retweeted_weibo_retweeted_detail", "retweeted_weibo_comment_detail"]
    index_flow_text = pre_text_index + date
    if not top_list:  # no one
        return results
    mid_list = [item[0] for item in top_list]
    search_result = es.mget(index=index_flow_text, doc_type=flow_text_index_type,
                            body={"ids": mid_list})["docs"]
    # perf fix: parse the needed detail dict once, not once per item
    style = int(style)
    if style == 0:
        detail = json.loads(user_info[detail_list[1]])
        count_first = True
    elif style == 1:
        detail = json.loads(user_info[detail_list[0]])
        count_first = False
    elif style == 2:
        detail = json.loads(user_info[detail_list[3]])
        count_first = True
    else:
        detail = json.loads(user_info[detail_list[2]])
        count_first = False
    for item, doc in zip(top_list, search_result):
        mid, count = item[0], item[1]
        temp = [mid]
        if count_first:
            temp.append(count)
            temp.append(detail.get(mid, 0))
        else:
            temp.append(detail.get(mid, 0))
            temp.append(count)
        if doc["found"]:
            source = doc["_source"]
            temp.append(source["text"])
            temp.append(source["geo"])
            temp.append(ts2date(source["timestamp"]))
            temp.append(source["sentiment"])
            temp.append(weiboinfo2url(source["uid"], source["mid"]))
            temp.append(uid_url + source["uid"])
            temp.append(source["uid"])
            # best-effort nick-name lookup; profile may be missing
            try:
                uname = es_profile.get(index=profile_index_name,
                                       doc_type=profile_index_type,
                                       id=source["uid"])["_source"]["nick_name"]
            except Exception:
                uname = "unknown"
            temp.append(uname)
        else:
            temp.extend(["", "", "", "", "", "", "", ""])
        results.append(temp)
    return results
def get_text(top_list, date, user_info, style):
    """Attach weibo content, counters and author info to a ranked mid list.

    :param top_list: list of [mid, count] pairs
    :param date: date string appended to ``pre_text_index`` to pick the
                 daily flow-text index
    :param user_info: dict holding JSON-encoded per-mid detail counters
    :param style: 0/1 select the origin-weibo detail dicts, 2/other the
                  retweeted-weibo ones; also decides column order of the
                  two counters (retweeted first, then comment)
    :return: rows of [mid, retweeted, comment, text, geo, date, sentiment,
             weibo_url, user_url, uid, nick_name]; missing mids get eight
             empty strings after the two counters
    """
    results = []
    detail_list = ["origin_weibo_retweeted_detail", "origin_weibo_comment_detail",
                   "retweeted_weibo_retweeted_detail", "retweeted_weibo_comment_detail"]
    index_flow_text = pre_text_index + date
    if not top_list:  # no one
        return results
    mid_list = [item[0] for item in top_list]
    search_result = es.mget(index=index_flow_text, doc_type=flow_text_index_type,
                            body={"ids": mid_list})["docs"]
    # perf fix: parse the needed detail dict once, not once per item
    style = int(style)
    if style == 0:
        detail = json.loads(user_info[detail_list[1]])
        count_first = True
    elif style == 1:
        detail = json.loads(user_info[detail_list[0]])
        count_first = False
    elif style == 2:
        detail = json.loads(user_info[detail_list[3]])
        count_first = True
    else:
        detail = json.loads(user_info[detail_list[2]])
        count_first = False
    for item, doc in zip(top_list, search_result):
        mid, count = item[0], item[1]
        temp = [mid]
        if count_first:
            temp.append(count)
            temp.append(detail.get(mid, 0))
        else:
            temp.append(detail.get(mid, 0))
            temp.append(count)
        if doc["found"]:
            source = doc["_source"]
            temp.append(source["text"])
            temp.append(source["geo"])
            temp.append(ts2date(source["timestamp"]))
            temp.append(source["sentiment"])
            temp.append(weiboinfo2url(source['uid'], source['mid']))
            temp.append(uid_url + source['uid'])
            temp.append(source['uid'])
            # best-effort nick-name lookup; profile may be missing
            try:
                uname = es_profile.get(index=profile_index_name,
                                       doc_type=profile_index_type,
                                       id=source['uid'])["_source"]["nick_name"]
            except Exception:
                uname = "unknown"
            temp.append(uname)
        else:
            temp.extend(["", "", "", "", "", "", "", ""])
        results.append(temp)
    return results
def new_get_user_profile(uid):
    """Fetch a user's profile document, topped up with fresher counters.

    Falls back to empty-string fields when the profile is missing, and
    overrides fansnum/friendsnum/statusnum from the BCI history index
    when that data is available.

    :param uid: user id
    :return: dict of profile fields (always contains 'uid')
    """
    try:
        results = es_user_profile.get(index=profile_index_name,
                                      doc_type=profile_index_type, id=uid)['_source']
    except Exception:
        results = {}
    # get new fansnum and statusnum from BCI history (best effort)
    try:
        bci_history_result = es_bci_history.get(index=bci_history_index_name,
                                                doc_type=bci_history_index_type,
                                                id=uid)['_source']
    except Exception:
        bci_history_result = {}
    if not results:
        results = {
            'uid': uid,
            'photo_url': '',
            'nick_name': '',
            'verified_type': '',
            'verified_type_ch': '',
            'fansnum': '',
            'friendsnum': '',
            'statusnum': '',
            'user_location': '',
            'description': '',
        }
    else:
        # map the numeric verified type to its Chinese label when known
        verified_num_type = results['verified_type']
        try:
            verified_ch_type = verified_num2ch_dict[verified_num_type]
        except Exception:
            verified_ch_type = ''
        results['verified_type_ch'] = verified_ch_type
    if bci_history_result:
        # counters may be missing/malformed; keep the profile values then
        try:
            results['fansnum'] = int(bci_history_result['user_fansnum'])
            results['friendsnum'] = int(bci_history_result['user_friendsnum'])
            results['statusnum'] = int(bci_history_result['weibo_month_sum'])
        except Exception:
            pass
    return results
def new_get_user_profile(uid):
    """Fetch a user's profile document from ES.

    Returns empty-string fields when the profile is missing.

    :param uid: user id
    :return: dict of profile fields (always contains 'uid')
    """
    try:
        results = es_user_profile.get(index=profile_index_name,
                                      doc_type=profile_index_type, id=uid)['_source']
    except Exception:
        results = {}
    if not results:
        results = {
            'uid': uid,
            'photo_url': '',
            'nick_name': '',
            'verified_type': '',
            'verified_type_ch': '',
            'fansnum': '',
            'friendsnum': '',
            'statusnum': '',
            'user_location': '',
            'description': '',
        }
    else:
        # fix: an unknown verified type used to raise KeyError here;
        # default to '' like the sibling variants of this function
        verified_num_type = results['verified_type']
        results['verified_type_ch'] = verified_num2ch_dict.get(verified_num_type, '')
    return results
def new_get_user_profile(uid):
    """Fetch a user's profile document from ES.

    Returns empty-string fields when the profile is missing.

    :param uid: user id
    :return: dict of profile fields (always contains 'uid')
    """
    try:
        results = es_user_profile.get(index=profile_index_name,
                                      doc_type=profile_index_type, id=uid)['_source']
    except Exception:
        results = {}
    if not results:
        results = {
            'uid': uid,
            'photo_url': '',
            'nick_name': '',
            'verified_type': '',
            'verified_type_ch': '',
            'fansnum': '',
            'friendsnum': '',
            'statusnum': '',
            'user_location': '',
            'description': '',
        }
    else:
        # fix: an unknown verified type used to raise KeyError here;
        # default to '' like the sibling variants of this function
        verified_num_type = results['verified_type']
        results['verified_type_ch'] = verified_num2ch_dict.get(verified_num_type, '')
    return results
def new_get_user_profile(uid):
    """Fetch a user's profile document, topped up with fresher counters.

    Falls back to empty-string fields when the profile is missing, and
    overrides fansnum/friendsnum/statusnum from the BCI history index
    when that data is available.

    :param uid: user id
    :return: dict of profile fields (always contains 'uid')
    """
    try:
        results = es_user_profile.get(index=profile_index_name,
                                      doc_type=profile_index_type, id=uid)['_source']
    except Exception:
        results = {}
    # get new fansnum and statusnum from BCI history (best effort)
    try:
        bci_history_result = es_bci_history.get(index=bci_history_index_name,
                                                doc_type=bci_history_index_type,
                                                id=uid)['_source']
    except Exception:
        bci_history_result = {}
    if not results:
        results = {
            'uid': uid,
            'photo_url': '',
            'nick_name': '',
            'verified_type': '',
            'verified_type_ch': '',
            'fansnum': '',
            'friendsnum': '',
            'statusnum': '',
            'user_location': '',
            'description': '',
        }
    else:
        # map the numeric verified type to its Chinese label when known
        verified_num_type = results['verified_type']
        try:
            verified_ch_type = verified_num2ch_dict[verified_num_type]
        except Exception:
            verified_ch_type = ''
        results['verified_type_ch'] = verified_ch_type
    if bci_history_result:
        # fix: missing/malformed history counters used to raise here;
        # now best effort, matching the other variant of this function
        try:
            results['fansnum'] = int(bci_history_result['user_fansnum'])
            results['friendsnum'] = int(bci_history_result['user_friendsnum'])
            results['statusnum'] = int(bci_history_result['weibo_month_sum'])
        except Exception:
            pass
    return results
def new_get_user_weibo(uid, sort_type):
    """Return a user's deduplicated weibos from the last 7 days, sorted.

    :param uid: user id to query
    :param sort_type: one of 'timestamp', 'retweet_count', 'comment_count',
                      'sensitive'; anything else falls back to timestamp
                      ordering (forced to 'timestamp' when RUN_TYPE == 0)
    :return: rows [mid, uid, text, ip, city, timestamp, date, retweet_count,
             comment_count, sensitive_score, weibo_url], sorted descending
             by the requested field
    """
    results = []
    weibo_list = []
    now_date = ts2datetime(time.time())
    # run_type: test mode pins the date and the sort field
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'
    # step1: get user name
    # NOTE(review): uname is computed but never used below — confirm intent
    try:
        user_profile_result = es_user_profile.get(index=profile_index_name,
                                                  doc_type=profile_index_type,
                                                  id=uid, _source=False,
                                                  fields=['nick_name'])
    except Exception:
        user_profile_result = {}
    if user_profile_result:
        uname = user_profile_result['fields']['nick_name'][0]
    else:
        uname = ''
    # step2: collect weibos from the daily flow-text indices
    for i in range(7, 0, -1):
        if RUN_TYPE == 1:
            iter_date = ts2datetime(datetime2ts(now_date) - i * DAY)
        else:
            # test mode queries the same fixed index each pass; the
            # mid_set below removes the resulting duplicates
            iter_date = '2013-09-01'
        index_name = flow_text_index_name_pre + iter_date
        try:
            weibo_result = es_flow_text.search(
                index=index_name, doc_type=flow_text_index_type,
                body={'query': {'filtered': {'filter': {'term': {'uid': uid}}}},
                      'size': MAX_VALUE})['hits']['hits']
        except Exception:
            weibo_result = []
        if weibo_result:
            weibo_list.extend(weibo_result)
    mid_set = set()
    for weibo_item in weibo_list:
        source = weibo_item['_source']
        mid = source['mid']
        uid = source['uid']
        text = source['text']
        ip = source['ip']
        timestamp = source['timestamp']
        date = ts2date(timestamp)
        weibo_url = weiboinfo2url(uid, mid)
        # run_type: interaction counters only exist in production data,
        # and even there may be absent on a given document
        if RUN_TYPE == 1:
            try:
                retweet_count = source['retweeted']
            except Exception:
                retweet_count = 0
            try:
                comment_count = source['comment']
            except Exception:
                comment_count = 0
            try:
                sensitive_score = source['sensitive']
            except Exception:
                sensitive_score = 0
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        city = ip2city(ip)
        if mid not in mid_set:
            results.append([mid, uid, text, ip, city, timestamp, date,
                            retweet_count, comment_count, sensitive_score,
                            weibo_url])
            mid_set.add(mid)
    # fix: an unrecognized sort_type used to leave sort_results unbound
    # (NameError); unknown values now fall back to timestamp ordering.
    # Leftover numbered debug prints ('708', '714', ...) were removed.
    sort_index = {'timestamp': 5, 'retweet_count': 7,
                  'comment_count': 8, 'sensitive': 9}.get(sort_type, 5)
    sort_results = sorted(results, key=lambda x: x[sort_index], reverse=True)
    return sort_results
def search_attribute_portrait(uid):
    """Load a user's portrait document and post-process every attribute.

    JSON-encoded counter fields (keywords, geo, hashtags, emotions, topic,
    emoticon, online_pattern, psycho_status) are parsed and sorted by count
    descending (mostly truncated to top 5); underscore-joined string fields
    (domain, psycho_feature) are split into lists; importance / activeness /
    influence are turned into ranks by counting higher-scoring portraits.

    :param uid: user id to look up in the user_portrait index
    :return: the enriched portrait dict, or None when the portrait is missing
    """
    results = dict()
    index_name = 'user_portrait'
    index_type = 'user'
    try:
        results = es_user_portrait.get(index=index_name, doc_type=index_type, id=uid)['_source']
    except:
        # portrait not found (or ES error): caller receives None
        results = None
        return None
    keyword_list = []  # NOTE(review): unused local, kept as-is
    # keywords: JSON dict -> list of (word, count) sorted by count desc
    if results['keywords']:
        keywords_dict = json.loads(results['keywords'])
        sort_word_list = sorted(keywords_dict.items(), key=lambda x:x[1], reverse=True)
        #print 'sort_word_list:', sort_word_list
        results['keywords'] = sort_word_list
    else:
        results['keywords'] = []
    #print 'keywords:', results
    # activity geo: JSON dict -> sorted (place, count) list under a new key
    geo_top = []
    if results['activity_geo_dict']:
        geo_dict = json.loads(results['activity_geo_dict'])
        sort_geo_dict = sorted(geo_dict.items(), key=lambda x:x[1], reverse=True)
        geo_top = sort_geo_dict
        results['activity_geo'] = geo_top
    else:
        results['activity_geo'] = []
    # hashtags: top-5 sorted list plus a generated description text
    if results['hashtag_dict']:
        hashtag_dict = json.loads(results['hashtag_dict'])
        sort_hashtag_dict = sorted(hashtag_dict.items(), key=lambda x:x[1], reverse=True)
        results['hashtag_dict'] = sort_hashtag_dict[:5]
        descriptions = hashtag_description(hashtag_dict)
        results['hashtag_description'] = descriptions
    else:
        results['hashtag_dict'] = []
        results['hashtag_description'] = ''
    # emotion words: per emotion type, top-5 sorted word list; types
    # '126'/'127' are additionally collected for the overall conclusion
    emotion_result = {}
    emotion_conclusion_dict = {}
    if results['emotion_words']:
        emotion_words_dict = json.loads(results['emotion_words'])
        for word_type in emotion_mark_dict:
            try:
                word_dict = emotion_words_dict[word_type]
                if word_type=='126' or word_type=='127':
                    emotion_conclusion_dict[word_type] = word_dict
                sort_word_dict = sorted(word_dict.items(), key=lambda x:x[1], reverse=True)
                #print 'sort_word_dict:', sort_word_dict
                word_list = sort_word_dict[:5]
            except:
                # emotion type absent for this user
                word_list = []
            emotion_result[emotion_mark_dict[word_type]] = word_list
        #print 'emotion_words:', type(emotion_result)
        results['emotion_words'] = emotion_result
    #emotion_conclusion
    results['emotion_conclusion'] = get_emotion_conclusion(emotion_conclusion_dict)
    #topic: JSON dict -> top-5 sorted (topic, weight) list
    if results['topic']:
        topic_dict = json.loads(results['topic'])
        sort_topic_dict = sorted(topic_dict.items(), key=lambda x:x[1], reverse=True)
        results['topic'] = sort_topic_dict[:5]
    else:
        results['topic'] = []
    #domain: underscore-joined string -> list
    if results['domain']:
        domain_string = results['domain']
        domain_list = domain_string.split('_')
        results['domain'] = domain_list
    else:
        results['domain'] = []
    #emoticon: JSON dict -> top-5 sorted list
    if results['emoticon']:
        emoticon_dict = json.loads(results['emoticon'])
        sort_emoticon_dict = sorted(emoticon_dict.items(), key=lambda x:x[1], reverse=True)
        results['emoticon'] = sort_emoticon_dict[:5]
    else:
        results['emoticon'] = []
    #online_pattern: JSON dict -> top-5 sorted list
    if results['online_pattern']:
        online_pattern_dict = json.loads(results['online_pattern'])
        sort_online_pattern_dict = sorted(online_pattern_dict.items(), key=lambda x:x[1], reverse=True)
        results['online_pattern'] = sort_online_pattern_dict[:5]
    else:
        results['online_pattern'] = []
    #psycho_status: JSON dict -> top-5 sorted list
    if results['psycho_status']:
        psycho_status_dict = json.loads(results['psycho_status'])
        sort_psycho_status_dict = sorted(psycho_status_dict.items(), key=lambda x:x[1], reverse=True)
        results['psycho_status'] = sort_psycho_status_dict[:5]
    else:
        results['psycho_status'] = []
    #psycho_feature: underscore-joined string -> list
    if results['psycho_feature']:
        psycho_feature_list = results['psycho_feature'].split('_')
        results['psycho_feature'] = psycho_feature_list
    else:
        results['psycho_feature'] = []
    #state: pull the user's description text from the profile index
    if results['uid']:
        uid = results['uid']
        try:
            profile_result = es_user_profile.get(index='weibo_user', doc_type='user', id=uid)
        except:
            profile_result = None
        try:
            user_state = profile_result['_source']['description']
            results['description'] = user_state
        except:
            results['description'] = ''
    else:
        results['uid'] = ''
        results['description'] = ''
    # importance rank: count portraits with an importance score at least
    # as high as this user's (range query from score to 1000000)
    if results['importance']:
        #print results['importance']
        query_body = {
            'query':{
                "range":{
                    "importance":{
                        "from": results['importance'],
                        "to": 1000000
                    }
                }
            }
        }
        importance_rank = es_user_portrait.count(index=index_name, doc_type=index_type, body=query_body)
        if importance_rank['_shards']['successful'] != 0:
            #print 'importance_rank:', importance_rank
            results['importance_rank'] = importance_rank['count']
        else:
            print 'es_importance_rank error'
            results['importance_rank'] = 0
    else:
        results['importance_rank'] = 0
    # activeness rank, same range-count approach
    # NOTE(review): unlike importance, a falsy activeness leaves
    # 'activeness_rank' unset — confirm callers tolerate the missing key
    if results['activeness']:
        query_body = {
            'query':{
                "range":{
                    "activeness":{
                        "from":results['activeness'],
                        "to": 1000000
                    }
                }
            }
        }
        activeness_rank = es_user_portrait.count(index=index_name, doc_type=index_type, body=query_body)
        if activeness_rank['_shards']['successful'] != 0:
            results['activeness_rank'] = activeness_rank['count']
        else:
            print 'es_activess_rank error'
            results['activeness_rank'] = 0
    # influence rank, same range-count approach
    # NOTE(review): 'influence_rank' is also unset when influence is falsy
    if results['influence']:
        query_body = {
            'query':{
                'range':{
                    'influence':{
                        'from':results['influence'],
                        'to': 1000000
                    }
                }
            }
        }
        influence_rank = es_user_portrait.count(index=index_name, doc_type=index_type, body=query_body)
        if influence_rank['_shards']['successful'] != 0:
            results['influence_rank'] = influence_rank['count']
        else:
            print 'es_influence_rank error'
            results['influence_rank'] = 0
    #total count in user_portrait
    query_body ={
        'query':{
            'match_all':{}
        }
    }
    all_count_results = es_user_portrait.count(index=index_name, doc_type=index_type, body=query_body)
    if all_count_results['_shards']['successful'] != 0:
        results['all_count'] = all_count_results['count']
    else:
        print 'es_user_portrait error'
        results['all_count'] = 0
    #link conclusion
    link_ratio = results['link']
    results['link_conclusion'] = get_link_conclusion(link_ratio)
    return results