def get_text(top_list, date, style):
    """Attach weibo text and user metadata to ranked [mid, count] pairs.

    Input rows look like [[mid1, no.1], [mid2, no.2], ...]; each output row is
    [mid, count, text, geo, date, sentiment, weibo_url, user_url, uid, uname],
    with eight empty strings appended when the weibo document is not found.
    NOTE: the `style` parameter is accepted but not used by this variant.
    """
    rows = []
    text_index = pre_text_index + date
    if top_list:
        mids = [entry[0] for entry in top_list]
        docs = es.mget(index=text_index, doc_type=flow_text_index_type,
                       body={"ids": mids})["docs"]
        for entry, doc in zip(top_list, docs):
            row = list(entry)
            if doc['found']:
                src = doc['_source']
                row.append(src["text"])
                row.append(src["geo"])
                row.append(ts2date(src["timestamp"]))
                row.append(src["sentiment"])
                row.append(weiboinfo2url(src['uid'], src['mid']))
                row.append(uid_url + src['uid'])
                row.append(src['uid'])
                try:
                    profile = es_profile.get(index=profile_index_name,
                                             doc_type=profile_index_type,
                                             id=src['uid'])
                    row.append(profile["_source"]["nick_name"])
                except:
                    # profile missing or lookup failed: fall back to a placeholder
                    row.append("unknown")
            else:
                row.extend([""] * 8)
            rows.append(row)
    return rows
def deal_show_weibo_list(flow_text_result):
    """Convert ES flow-text hits into display rows and collect the uids seen.

    Returns (show_weibo_list, user_set) where each row is
    [mid, uid, text, geo, timestamp, date, retweet_count, comment_count,
    sensitive_score, weibo_url]. The count fields are only read in
    production mode (RUN_TYPE == 1) and default to 0 when absent.
    """
    show_weibo_list = []
    user_set = set()
    for weibo_item in flow_text_result:
        source = weibo_item['_source']
        mid = source['mid']
        uid = source['uid']
        text = source['text']
        geo = source['geo']
        timestamp = source['timestamp']
        date = ts2date(timestamp)
        weibo_url = weiboinfo2url(uid, mid)
        if RUN_TYPE == 1:
            # BUG FIX: field name was misspelled 'retwet' (the sibling
            # deal_show_weibo_list definition reads 'retweeted'), so the
            # bare except silently forced retweet_count to 0 on every hit.
            try:
                retweet_count = source['retweeted']
            except:
                retweet_count = 0
            try:
                comment_count = source['comment']
            except:
                comment_count = 0
            try:
                sensitive_score = source['sensitive']
            except:
                sensitive_score = 0
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        show_weibo_list.append([mid, uid, text, geo, timestamp, date,
                                retweet_count, comment_count, sensitive_score,
                                weibo_url])
        user_set.add(uid)
    return show_weibo_list, user_set
def deal_show_weibo_list(flow_text_result):
    """Build display rows from ES flow-text hits.

    Returns (rows, uids): rows are
    [mid, uid, text, geo, timestamp, date, retweet_count, comment_count,
    sensitive_score, weibo_url]; uids is the set of authors seen. Count
    fields are read only when RUN_TYPE == 1 and default to 0 when missing.
    """
    rows = []
    uids = set()
    for hit in flow_text_result:
        src = hit['_source']
        mid = src['mid']
        uid = src['uid']
        ts = src['timestamp']
        if RUN_TYPE == 1:
            # missing fields default to 0
            retweets = src.get('retweeted', 0)
            comments = src.get('comment', 0)
            sensitive = src.get('sensitive', 0)
        else:
            retweets = comments = sensitive = 0
        rows.append([mid, uid, src['text'], src['geo'], ts, ts2date(ts),
                     retweets, comments, sensitive, weiboinfo2url(uid, mid)])
        uids.add(uid)
    return rows, uids
def get_text(top_list, date, user_info, style):
    """Decorate ranked weibo [mid, count] pairs with text and user metadata.

    `style` selects which interaction the ranking count represents and which
    stored detail dict in `user_info` supplies the complementary count:
      0 -> ranked count first, complement from origin_weibo_comment_detail
      1 -> complement from origin_weibo_retweeted_detail, ranked count second
      2 -> ranked count first, complement from retweeted_weibo_comment_detail
      other -> complement from retweeted_weibo_retweeted_detail, ranked second
    Each output row: [mid, retweet_count, comment_count, text, geo, date,
    sentiment, weibo_url, user_url, uid, uname]; empty fields when not found.
    """
    results = []
    detail_keys = [
        "origin_weibo_retweeted_detail",
        "origin_weibo_comment_detail",
        "retweeted_weibo_retweeted_detail",
        "retweeted_weibo_comment_detail",
    ]
    index_flow_text = pre_text_index + date
    if not top_list:
        return results
    mid_list = [pair[0] for pair in top_list]
    docs = es.mget(index=index_flow_text, doc_type=flow_text_index_type,
                   body={"ids": mid_list})["docs"]
    # Hoist the per-style JSON decode out of the loop: the same detail dict
    # was decoded once per item in the original formulation.
    style = int(style)
    if style == 0:
        rank_first, detail = True, json.loads(user_info[detail_keys[1]])
    elif style == 1:
        rank_first, detail = False, json.loads(user_info[detail_keys[0]])
    elif style == 2:
        rank_first, detail = True, json.loads(user_info[detail_keys[3]])
    else:
        rank_first, detail = False, json.loads(user_info[detail_keys[2]])
    for pair, doc in zip(top_list, docs):
        mid = pair[0]
        rank_count = pair[1]
        other_count = detail.get(mid, 0)
        if rank_first:
            row = [mid, rank_count, other_count]
        else:
            row = [mid, other_count, rank_count]
        if doc["found"]:
            src = doc["_source"]
            row.append(src["text"])
            row.append(src["geo"])
            row.append(ts2date(src["timestamp"]))
            row.append(src["sentiment"])
            row.append(weiboinfo2url(src["uid"], src["mid"]))
            row.append(uid_url + src["uid"])
            row.append(src["uid"])
            try:
                profile = es_profile.get(index=profile_index_name,
                                         doc_type=profile_index_type,
                                         id=src["uid"])
                row.append(profile["_source"]["nick_name"])
            except:
                row.append("unknown")
        else:
            row.extend([""] * 8)
        results.append(row)
    return results
def new_get_user_weibo(uid, sort_type):
    """Fetch up to 100 of a user's weibos from the last 7 daily indices.

    Rows are [mid, uid, text, ip, city, timestamp, date, retweet_count,
    comment_count, sensitive_score, weibo_url], sorted descending by the
    `sort_type` field of the source documents.
    """
    results = []
    collected = []
    now_date = ts2datetime(time.time())
    # run_type: test mode pins the reference date and forces time ordering
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'
    # step1: look up the user's nick name
    try:
        profile = es_user_profile.get(index=profile_index_name,
                                      doc_type=profile_index_type,
                                      id=uid, _source=False,
                                      fields=['nick_name'])
    except:
        profile = {}
    # NOTE(review): uname is computed but never used in the returned rows
    uname = profile['fields']['nick_name'][0] if profile else ''
    # step2: gather the user's weibos over the trailing week
    base_ts = datetime2ts(now_date)
    for offset in range(7, 0, -1):
        index_name = flow_text_index_name_pre + ts2datetime(base_ts - offset * DAY)
        try:
            hits = es_flow_text.search(
                index=index_name, doc_type=flow_text_index_type,
                body={'query': {'filtered': {'filter': {'term': {'uid': uid}}}},
                      'sort': sort_type, 'size': 100})['hits']['hits']
        except:
            hits = []
        if hits:
            collected.extend(hits)
    # keep the global top 100 across all days
    top_hits = sorted(collected, key=lambda h: h['_source'][sort_type],
                      reverse=True)[:100]
    for hit in top_hits:
        src = hit['_source']
        mid = src['mid']
        owner = src['uid']
        text = src['text']
        ip = src['geo']
        timestamp = src['timestamp']
        date = ts2date(timestamp)
        sentiment = src['sentiment']  # read for parity; not returned
        weibo_url = weiboinfo2url(owner, mid)
        # run_type: interaction counts only exist in production data
        if RUN_TYPE == 1:
            retweet_count = src['retweet_count']
            comment_count = src['comment_count']
            sensitive_score = src['sensitive']
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        city = ip2city(ip)
        results.append([mid, owner, text, ip, city, timestamp, date,
                        retweet_count, comment_count, sensitive_score,
                        weibo_url])
    return results
def get_text(top_list, date, order):
    """Attach weibo text/metadata to ranked entries and sort by `order`.

    Each input entry starts with a mid; its remaining columns are kept as-is.
    Found documents contribute [text, geo, date, sentiment, weibo_url,
    user_url, uid, uname, int(timestamp)]; missing ones contribute eight
    empty strings plus the current time as the timestamp column.
    order: 1 sorts by column 3 desc, 2 by column 1 desc, 3 by column 2 desc,
    anything else ascending by the trailing timestamp column.
    """
    results = []
    index_flow_text = pre_text_index + date
    if len(top_list) != 0:
        mids = [entry[0] for entry in top_list]
        docs = es.mget(index=index_flow_text, doc_type=flow_text_index_type,
                       body={"ids": mids})["docs"]
        for entry, doc in zip(top_list, docs):
            row = list(entry)
            if doc['found']:
                src = doc['_source']
                owner = src['uid']
                try:
                    uname = es_profile.get(index=profile_index_name,
                                           doc_type=profile_index_type,
                                           id=owner)["_source"]["nick_name"]
                except:
                    # no profile: fall back to the raw uid
                    uname = owner
                row += [src["text"], src["geo"], ts2date(src["timestamp"]),
                        src["sentiment"], weiboinfo2url(owner, src['mid']),
                        uid_url + owner, owner, uname, int(src["timestamp"])]
            else:
                row += [""] * 8 + [time.time()]
            results.append(row)
    if results:
        order_flag = int(order)
        if order_flag == 1:    # sensitive
            results = sorted(results, key=lambda r: r[3], reverse=True)
        elif order_flag == 2:  # retweet
            results = sorted(results, key=lambda r: r[1], reverse=True)
        elif order_flag == 3:  # comment
            results = sorted(results, key=lambda r: r[2], reverse=True)
        else:                  # default: ascending by the timestamp column
            results = sorted(results, key=lambda r: r[-1], reverse=False)
    return results
def get_text(top_list, date, user_info, style):
    """Decorate ranked weibo [mid, count] pairs with text and user metadata.

    The ranked count and a complementary count pulled from one of the
    user_info detail dicts fill the two count columns; which dict is used
    and the column order depend on `style` (0/1 origin, 2/other retweeted).
    Each output row: [mid, count_a, count_b, text, geo, date, sentiment,
    weibo_url, user_url, uid, uname]; empty fields when the doc is missing.
    """
    results = []
    detail_fields = ("origin_weibo_retweeted_detail",
                     "origin_weibo_comment_detail",
                     "retweeted_weibo_retweeted_detail",
                     "retweeted_weibo_comment_detail")
    index_flow_text = pre_text_index + date
    if len(top_list) == 0:
        return results
    mid_list = [item[0] for item in top_list]
    docs = es.mget(index=index_flow_text, doc_type=flow_text_index_type,
                   body={"ids": mid_list})["docs"]
    for item, doc in zip(top_list, docs):
        mid = item[0]
        row = [mid]
        style_i = int(style)
        if style_i == 0:
            row.append(item[1])
            row.append(json.loads(user_info[detail_fields[1]]).get(mid, 0))
        elif style_i == 1:
            row.append(json.loads(user_info[detail_fields[0]]).get(mid, 0))
            row.append(item[1])
        elif style_i == 2:
            row.append(item[1])
            row.append(json.loads(user_info[detail_fields[3]]).get(mid, 0))
        else:
            row.append(json.loads(user_info[detail_fields[2]]).get(mid, 0))
            row.append(item[1])
        if doc["found"]:
            src = doc["_source"]
            row.extend([src["text"], src["geo"], ts2date(src["timestamp"]),
                        src["sentiment"],
                        weiboinfo2url(src['uid'], src['mid']),
                        uid_url + src['uid'], src['uid']])
            try:
                nick = es_profile.get(index=profile_index_name,
                                      doc_type=profile_index_type,
                                      id=src['uid'])["_source"]["nick_name"]
            except:
                nick = "unknown"
            row.append(nick)
        else:
            row.extend([""] * 8)
        results.append(row)
    return results
def key_words_search(task_id, search_type, pre, during, start_time, keyword_list,
                     search_key='', sort_norm='', sort_scope='', time=1,
                     isall=False, number=100):
    """Search the flow-text indices for keywords, rank the matching users and
    persist the outcome on the task document identified by task_id.

    Returns "1" on completion; results and sample weibo rows are stored in
    the task document (status=1) rather than returned.
    """
    # NOTE: the `time` parameter shadows the stdlib module name; kept as-is
    # so existing keyword-argument callers do not break.
    number = int(number)
    should = []
    for key in keyword_list:
        if search_type == "hashtag":
            should.append({"prefix": {"text": "#" + key + "#"}})
        else:
            should.append({"wildcard": {"text": "*" + key + "*"}})
    # collect the daily indices that exist inside the requested window
    index_list = []
    date = ts2datetime(start_time)
    index_name = pre + date
    while during:
        if es_flow_text.indices.exists(index=index_name):
            index_list.append(index_name)
        start_time = start_time + DAY
        date = ts2datetime(start_time)
        index_name = pre + date
        during -= 1
    print(index_list)
    uid_set = set()
    text_results = []
    query_body = {
        "query": {"bool": {"must": should}},
        "sort": {"user_fansnum": {"order": "desc"}},
        "size": 5000
    }
    results = es_flow_text.search(index=index_list, doc_type='text',
                                  body=query_body, _source=False,
                                  fields=["uid", "user_fansnum", "text",
                                          "message_type", "sentiment",
                                          "timestamp", "geo", "retweeted",
                                          "comment"])["hits"]["hits"]
    # keep only the first hit per uid; index_list is repurposed to map each
    # unique uid to the offset of that hit in `results`
    id_index = 0
    index_list = []
    un_uid_list = []
    for item in results:
        hit_uid = item['fields']['uid'][0]
        if hit_uid not in uid_set:
            uid_set.add(hit_uid)
            un_uid_list.append(hit_uid)
            index_list.append(id_index)
        id_index += 1
    uid_list = []
    print("un_uid_list: %s" % len(un_uid_list))
    portrait_list = []
    count = 0
    in_index = 0
    if not isall and un_uid_list:  # 库内: restrict to users already in the portrait index
        portrait_results = es_user_portrait.mget(index=USER_INDEX_NAME,
                                                 doc_type=USER_INDEX_TYPE,
                                                 body={"ids": un_uid_list},
                                                 _source=False,
                                                 fields=['uname'])["docs"]
        for item in portrait_results:
            if item["found"]:
                portrait_list.append(item['_id'])
                nick_name = item['fields']['uname'][0]
                if nick_name == 'unknown':
                    nick_name = item['_id']
                index = index_list[in_index]
                hit_fields = results[index]['fields']
                weibo_url = weiboinfo2url(hit_fields['uid'][0], results[index]['_id'])
                # BUG FIX: was text_results.extend(...), which flattened each
                # row into individual scalars; append keeps one row per user,
                # matching the not-in-portrait branch below.
                text_results.append([hit_fields['uid'][0],
                                     hit_fields['user_fansnum'][0],
                                     hit_fields['text'][0],
                                     hit_fields['message_type'][0],
                                     hit_fields['sentiment'][0],
                                     ts2date(hit_fields['timestamp'][0]),
                                     hit_fields['geo'][0],
                                     hit_fields['retweeted'][0],
                                     hit_fields['comment'][0],
                                     nick_name, weibo_url])
                count += 1
                if count == number:
                    break
            # advance for every portrait doc so index_list stays aligned
            in_index += 1
        # hoisted out of the loop: this debug line printed once per iteration
        print("portrait_len, %s" % len(portrait_list))
    if portrait_list:
        uid_list = in_sort_filter(time, sort_norm, sort_scope, None,
                                  portrait_list, True, number)  # sort
    elif un_uid_list:
        profile_result = es_user_profile.mget(index="weibo_user", doc_type="user",
                                              body={"ids": un_uid_list},
                                              fields=['nick_name'])["docs"]
        for i in range(len(profile_result)):
            index = index_list[i]
            try:
                nick_name = profile_result[i]['fields']['nick_name'][0]
            except:
                nick_name = un_uid_list[i]
            item = results[index]
            weibo_url = weiboinfo2url(item['fields']['uid'][0], results[index]['_id'])
            text_results.append([item['fields']['uid'][0],
                                 item['fields']['user_fansnum'][0],
                                 item['fields']['text'][0],
                                 item['fields']['message_type'][0],
                                 item['fields']['sentiment'][0],
                                 ts2date(item['fields']['timestamp'][0]),
                                 item['fields']['geo'][0],
                                 item['fields']['retweeted'][0],
                                 item['fields']['comment'][0],
                                 nick_name, weibo_url])
            # BUG FIX: was `if i == number`, which collected number+1 rows
            if i + 1 == number:
                break
        uid_list = all_sort_filter(un_uid_list[:number], sort_norm, time,
                                   True, number)
    print("filter_uid_list: %s" % len(uid_list))
    if uid_list:
        results = make_up_user_info(uid_list, isall, time, sort_norm)
    else:
        results = []
    print("results: %s" % len(results))
    # update task status with the ranked users and their sample weibos
    task_detail = es_user_portrait.get(index=USER_RANK_KEYWORD_TASK_INDEX,
                                       doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                                       id=task_id)
    item = task_detail['_source']
    item['status'] = 1
    item['result'] = json.dumps(results)
    item['text_results'] = json.dumps(text_results)
    item['number'] = len(results)
    es_user_portrait.index(index=USER_RANK_KEYWORD_TASK_INDEX,
                           doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                           id=task_id, body=item)
    return "1"
def new_get_user_weibo(uid, sort_type): results = [] weibo_list = [] now_date = ts2datetime(time.time()) #run_type if RUN_TYPE == 0: now_date = RUN_TEST_TIME sort_type = 'timestamp' #step1:get user name print '708' try: user_profile_result = es_user_profile.get(index=profile_index_name, doc_type=profile_index_type,\ id=uid, _source=False, fields=['nick_name']) except: user_profile_result = {} print '714',len(user_profile_result) if user_profile_result: uname = user_profile_result['fields']['nick_name'][0] else: uname = '' #step2:get user weibo for i in range(7, 0, -1): if RUN_TYPE == 1: iter_date = ts2datetime(datetime2ts(now_date) - i * DAY) else: iter_date = '2013-09-01' index_name = flow_text_index_name_pre + iter_date print '726' try: weibo_result = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,\ body={'query':{'filtered':{'filter':{'term': {'uid': uid}}}}, 'size':MAX_VALUE})['hits']['hits'] #print weibo_result except: weibo_result = [] print '732',len(weibo_result) if weibo_result: weibo_list.extend(weibo_result) #sort_weibo_list = sorted(weibo_list, key=lambda x:x['_source'][sort_type], reverse=True)[:100] mid_set = set() for weibo_item in weibo_list: source = weibo_item['_source'] mid = source['mid'] uid = source['uid'] text = source['text'] ip = source['ip'] timestamp = source['timestamp'] date = ts2date(timestamp) sentiment = source['sentiment'] weibo_url = weiboinfo2url(uid, mid) #run_type if RUN_TYPE == 1: try: retweet_count = source['retweeted'] except: retweet_count = 0 try: comment_count = source['comment'] except: comment_count = 0 try: sensitive_score = source['sensitive'] except: sensitive_score = 0 else: retweet_count = 0 comment_count = 0 sensitive_score = 0 city = ip2city(ip) if mid not in mid_set: results.append([mid, uid, text, ip, city,timestamp, date, retweet_count, comment_count, sensitive_score, weibo_url]) mid_set.add(mid) if sort_type == 'timestamp': sort_results = sorted(results, key=lambda x:x[5], reverse=True) 
elif sort_type == 'retweet_count': sort_results = sorted(results, key=lambda x:x[7], reverse=True) elif sort_type == 'comment_count': sort_results = sorted(results, key=lambda x:x[8], reverse=True) elif sort_type == 'sensitive': sort_results = sorted(results, key=lambda x:x[9], reverse=True) print '778' return sort_results
def key_words_search(task_id, search_type , pre , during , start_time , keyword_list , search_key = '' , sort_norm = '', sort_scope = '' ,time = 7 , isall = False, number = 100): number = int(number) should = [] for key in keyword_list: if search_type == "hashtag": should.append({"prefix":{"text": "#" + key + "#"}}) else: should.append({"wildcard":{"text": "*" +key + "*"}}) index_list = [] date = ts2datetime(start_time) index_name = pre + date while during: if es_flow_text.indices.exists(index=index_name): index_list.append(index_name) start_time = start_time + DAY date = ts2datetime(start_time) index_name = pre + date during -= 1 print index_list uid_set = set() text_results = [] sorted_text_results = [] query_body = { "query":{ "bool":{ "must":should } }, "sort":{"user_fansnum":{"order":"desc"}}, "size":5000 } results = es_flow_text.search(index = index_list , doc_type = 'text' , body = query_body, _source=False, fields=["uid", "user_fansnum","text", "message_type", "sentiment","timestamp", "geo", "retweeted", "comment"])["hits"]["hits"] id_index = 0 index_list = [] un_uid_list = [] for item in results : if item['fields']['uid'][0] not in uid_set: uid_set.add(item['fields']['uid'][0]) un_uid_list.append(item['fields']['uid'][0]) index_list.append(id_index) id_index += 1 #get_all_filed(sort_norm , time) uid_list = [] print "un_uid_list: ", len(un_uid_list) portrait_list = [] count = 0 in_index = 0 if not isall and un_uid_list : # 库内 portrait_results = es_user_portrait.mget(index=USER_INDEX_NAME, doc_type=USER_INDEX_TYPE, body={"ids":un_uid_list}, _source=False, fields=['uname'])["docs"] for item in portrait_results: if item["found"]: portrait_list.append(item['_id']) nick_name = item['fields']['uname'][0] if nick_name == 'unknown': nick_name = item['_id'] index = index_list[in_index] weibo_url = weiboinfo2url(results[index]['fields']['uid'][0], results[index]['_id']) text_results.extend([results[index]['fields']['uid'][0], 
results[index]['fields']['user_fansnum'][0], results[index]['fields']['text'][0], results[index]['fields']['message_type'][0], results[index]['fields']['sentiment'][0], ts2date(results[index]['fields']['timestamp'][0]), results[index]['fields']['geo'][0], results[index]['fields']['retweeted'][0], results[index]['fields']['comment'][0], nick_name, weibo_url]) count += 1 if count == number: break print "portrait_len, ", len(portrait_list) in_index += 1 if portrait_list: uid_list = in_sort_filter(time,sort_norm ,sort_scope ,None , portrait_list , True, number) # sort for iter_uid in uid_list: iter_index = portrait_list.index(iter_uid) sorted_text_results.append(text_results[i]) elif un_uid_list: profile_result = es_user_profile.mget(index="weibo_user", doc_type="user", body={"ids":un_uid_list}, fields=['nick_name'])["docs"] for i in range(len(profile_result)): index = index_list[i] try: nick_name = profile_result[i]['fields']['nick_name'][0] except: nick_name = un_uid_list[i] item = results[index] weibo_url = weiboinfo2url(item['fields']['uid'][0], results[index]['_id']) text_results.append([item['fields']['uid'][0], item['fields']['user_fansnum'][0], item['fields']['text'][0], item['fields']['message_type'][0], item['fields']['sentiment'][0], ts2date(item['fields']['timestamp'][0]), results[index]['fields']['geo'][0], results[index]['fields']['retweeted'][0], results[index]['fields']['comment'][0], nick_name, weibo_url]) if i == number: break uid_list = all_sort_filter(un_uid_list[:number] , sort_norm , time ,True, number) sorted_text_results = [] f = open("small.txt", "wb") for iter_uid in uid_list: iter_index = un_uid_list.index(iter_uid) f.write(str(iter_uid)+"\n") sorted_text_results.append(text_results[iter_index]) f.close() print "filter_uid_list: ", len(uid_list) if uid_list: results = make_up_user_info(uid_list,isall,time,sort_norm) else: results = [] print "results: ", len(results) # 修改状态 task_detail = es_user_portrait.get(index=USER_RANK_KEYWORD_TASK_INDEX 
, doc_type=USER_RANK_KEYWORD_TASK_TYPE, id=task_id) item = task_detail['_source'] item['status'] = 1 item['result'] = json.dumps(results) item['text_results'] = json.dumps(sorted_text_results) item['number'] = len(results) es_user_portrait.index(index = USER_RANK_KEYWORD_TASK_INDEX , doc_type=USER_RANK_KEYWORD_TASK_TYPE , id=task_id, body=item) return "1"