def end_track_task(task_name):
    status = 0
    try:
        task_exist = es.get(index=index_name, doc_type=index_type, id=task_name)['_source']
    except:
        return 'task name not exist'
    task_status = task_exist['status']
    if task_status == 0:
        return 'task have end'
    else:
        task_exist['status'] = 0
        # make end time: round up to the next 15-minute segment of the current day
        now_ts = time.time()
        now_date = ts2datetime(now_ts)
        now_date_ts = datetime2ts(now_date)
        time_segment = int((now_ts - now_date_ts) / 900) + 1
        end_ts = now_date_ts + time_segment * 900
        end_date = ts2date(end_ts)
        task_exist['end_date'] = end_date
        task_user = task_exist['uid_list']
        status = change_user_count(task_user)
        if status == 0:
            return 'change user task count fail'
        else:
            es.index(index=index_name, doc_type=index_type, id=task_name, body=task_exist)
            status = delete_task_redis(task_name)
            if status == 0:
                return 'delete task from redis fail'
            else:
                return 'success change status to end'
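# Illustration (not part of the original module): end_track_task stamps the task's end
# time on the next 15-minute (900 s) boundary after "now". A minimal sketch of that
# rounding with hypothetical timestamps:
def _next_quarter_hour(now_ts, day_start_ts, segment=900):
    # count the segment now_ts falls in, then step to the start of the next one
    return day_start_ts + (int((now_ts - day_start_ts) / segment) + 1) * segment
# e.g. 3700 s past midnight (01:01:40) rounds up to 4500 s past midnight (01:15:00):
# _next_quarter_hour(day0 + 3700, day0) == day0 + 4500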
def get_activity_weibo(task_name, submit_user, start_ts, time_segment=FOUR_HOUR):
    results = []
    #step1: get task_name uid
    task_id = submit_user + task_name
    try:
        group_result = es_group_result.get(index=group_index_name, doc_type=group_index_type, \
                id=task_id, _source=False, fields=['uid_list'])
    except:
        group_result = {}
    if group_result == {}:
        return 'task name invalid'
    try:
        uid_list = group_result['fields']['uid_list']
    except:
        uid_list = []
    if uid_list == []:
        return 'task uid list null'
    #step2: get uid2uname
    uid2uname = {}
    try:
        user_portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, \
                body={'ids': uid_list}, _source=False, fields=['uname'])['docs']
    except:
        user_portrait_result = []
    for item in user_portrait_result:
        uid = item['_id']
        if item['found'] == True:
            uname = item['fields']['uname'][0]
            uid2uname[uid] = uname
    #step3: search time_segment weibo
    end_ts = start_ts + time_segment
    time_date = ts2datetime(start_ts)
    flow_text_index_name = flow_text_index_name_pre + time_date
    query = []
    query.append({'terms': {'uid': uid_list}})
    query.append({'range': {'timestamp': {'gte': start_ts, 'lt': end_ts}}})
    try:
        flow_text_es_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type, \
                body={'query': {'bool': {'must': query}}, 'sort': 'timestamp', 'size': MAX_VALUE})['hits']['hits']
    except:
        flow_text_es_result = []
    for item in flow_text_es_result:
        weibo = {}
        source = item['_source']
        weibo['timestamp'] = ts2date(source['timestamp'])
        weibo['ip'] = source['ip']
        weibo['text'] = source['text']
        if source['geo']:
            weibo['geo'] = '\t'.join(source['geo'])
        else:
            weibo['geo'] = ''
        results.append(weibo)
    return results
def get_influence_content(uid, timestamp_from, timestamp_to):
    weibo_list = []
    # split timestamp range to new_range_dict_list
    from_date_ts = datetime2ts(ts2datetime(timestamp_from))
    to_date_ts = datetime2ts(ts2datetime(timestamp_to))
    new_range_dict_list = []
    if from_date_ts != to_date_ts:
        iter_date_ts = from_date_ts
        while iter_date_ts < to_date_ts:
            iter_next_date_ts = iter_date_ts + DAY
            new_range_dict_list.append({'range': {'timestamp': {'gte': iter_date_ts, 'lt': iter_next_date_ts}}})
            iter_date_ts = iter_next_date_ts
        if new_range_dict_list[0]['range']['timestamp']['gte'] < timestamp_from:
            new_range_dict_list[0]['range']['timestamp']['gte'] = timestamp_from
        if new_range_dict_list[-1]['range']['timestamp']['lt'] > timestamp_to:
            new_range_dict_list[-1]['range']['timestamp']['lt'] = timestamp_to
    else:
        new_range_dict_list = [{'range': {'timestamp': {'gte': timestamp_from, 'lt': timestamp_to}}}]
    # iter date to search flow_text
    iter_result = []
    for range_item in new_range_dict_list:
        range_from_ts = range_item['range']['timestamp']['gte']
        range_from_date = ts2datetime(range_from_ts)
        flow_text_index_name = flow_text_index_name_pre + range_from_date
        query = []
        query.append({'term': {'uid': uid}})
        query.append(range_item)
        try:
            flow_text_exist = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type, \
                    body={'query': {'bool': {'must': query}}, 'sort': [{'timestamp': 'asc'}]})['hits']['hits']
        except:
            flow_text_exist = []
        iter_result.extend(flow_text_exist)
    # get weibo list from all searched days, not only the last batch
    for item in iter_result:
        source = item['_source']
        weibo = {}
        weibo['timestamp'] = ts2date(source['timestamp'])
        weibo['ip'] = source['ip']
        weibo['text'] = source['text']
        if source['geo']:
            weibo['geo'] = '\t'.join(source['geo'].split('&'))
        else:
            weibo['geo'] = ''
        weibo_list.append(weibo)
    return weibo_list
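# Illustration (not part of the original module): get_influence_content splits the
# [timestamp_from, timestamp_to) span into one range clause per day and clips the first
# and last clause back to the original endpoints, so each clause can be run against its
# own daily flow_text_<date> index. A minimal sketch, assuming DAY = 86400 and a
# hypothetical day_start_ts with day_start_ts <= ts_from < ts_to:
def _split_range_by_day(ts_from, ts_to, day_start_ts, DAY=86400):
    clauses = []
    iter_ts = day_start_ts
    while iter_ts < ts_to:
        clauses.append({'range': {'timestamp': {'gte': iter_ts, 'lt': iter_ts + DAY}}})
        iter_ts += DAY
    clauses[0]['range']['timestamp']['gte'] = ts_from
    if clauses[-1]['range']['timestamp']['lt'] > ts_to:
        clauses[-1]['range']['timestamp']['lt'] = ts_to
    return clauses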
def get_text(top_list, date, order):
    # input: [[mid1, no.1], [mid2, no.2], [mid3, no.3]]
    # output: [[text1, no.1], [text2, no.2], [text3, no.3]]
    # mid, retweeted, comment, text, geo, timestamp, sentiment, mid_url
    results = []
    index_flow_text = pre_text_index + date
    #index_list = get_text_index(date)
    if len(top_list) != 0: # no one
        mid_list = []
        for item in top_list:
            mid_list.append(item[0])
        search_result = es.mget(index=index_flow_text, doc_type=flow_text_index_type, body={"ids": mid_list})["docs"]
        for i in range(len(top_list)):
            temp = []
            temp.extend(top_list[i])
            if search_result[i]['found']:
                source = search_result[i]['_source']
                temp.append(source["text"])
                temp.append(source["geo"])
                temp.append(ts2date(source["timestamp"]))
                temp.append(source["sentiment"])
                temp.append(weiboinfo2url(source['uid'], source['mid']))
                temp.append(uid_url + source['uid'])
                temp.append(source['uid'])
                try:
                    uname = es_profile.get(index=profile_index_name, doc_type=profile_index_type, id=source['uid'])["_source"]["nick_name"]
                    temp.append(uname)
                except:
                    temp.append(source['uid'])
                #temp.append(source['sensitive_words_string'].replace("&", " "))
                temp.append(int(source["timestamp"]))
            else:
                temp.extend(["", "", "", "", "", "", "", "", time.time()])
            results.append(temp)
    if results:
        if int(order) == 1:   # sensitive
            results = sorted(results, key=lambda x: x[3], reverse=True)
        elif int(order) == 2: # retweet
            results = sorted(results, key=lambda x: x[1], reverse=True)
        elif int(order) == 3: # comment
            results = sorted(results, key=lambda x: x[2], reverse=True)
        else:
            results = sorted(results, key=lambda x: x[-1], reverse=False)
    return results
def full_text_search(keywords, uid, start_time, end_time, size):
    results = []
    uid_list = []
    user_profile_list = []
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": []
                    }
                }
            }
        },
        "size": size,
        "sort": {"timestamp": {"order": 'desc'}}
    }
    if RUN_TYPE:
        query_body["sort"] = {"user_fansnum": {"order": 'desc'}}
    if uid:
        query_body["query"]["filtered"]["filter"]["bool"]["must"].append({"term": {"uid": uid}})
    if keywords:
        keywords_list = keywords.split(',')
        for word in keywords_list:
            query_body["query"]["filtered"]["filter"]["bool"]["must"].append({'wildcard': {'text': {'wildcard': '*'+word+'*'}}})
    index_list = []
    exist_bool = es_flow_text.indices.exists(index="flow_text_"+end_time)
    if start_time:
        start_ts = datetime2ts(start_time)
        end_ts = datetime2ts(end_time)
        ts = end_ts
        while 1:
            index_name = "flow_text_" + ts2datetime(ts)
            exist_bool = es_flow_text.indices.exists(index=index_name)
            if exist_bool:
                index_list.append(index_name)
            if ts == start_ts:
                break
            else:
                ts -= 3600*24
    print index_list
    # no usable flow_text index
    if not index_list:
        return []
    search_results = es_flow_text.search(index=index_list, doc_type="text", body=query_body)["hits"]["hits"]
    for item in search_results:
        uid_list.append(item['_source']['uid'])
    history_max = get_history_max()
    personal_field = ["nick_name", "fansnum", "statusnum", "user_location"]
    user_info = get_user_profile(uid_list, personal_field)
    bci_results = ES_CLUSTER_FLOW1.mget(index="bci_history", doc_type="bci", body={"ids": uid_list}, _source=False, fields=["bci_day_last"])["docs"]
    sensitive_results = es_sensitive_history.mget(index="sensitive_history", doc_type="sensitive", body={"ids": uid_list}, _source=False, fields=["last_value"])["docs"]
    count = 0
    for item in search_results:
        item = item['_source']
        uid_list.append(item['uid'])
        iter_item = []
        iter_item.append(item['uid'])
        iter_item.append(user_info[count][1])
        iter_item.append(item['text'])
        iter_item.append(ts2date(item['timestamp']))
        iter_item.append(item['geo'])
        if item.get("sensitive_words_string", ''):
            iter_item.append(item['sensitive_words_string'].split('&'))
        else:
            iter_item.append([])
        iter_item.append(item.get('retweeted', 0))
        iter_item.append(item.get('comment', 0))
        count += 1
        results.append(iter_item)
    user_set = set()
    count = 0
    for item in user_info:
        if item[0] in user_set:
            continue
        else:
            user_set.add(item[0])
        if bci_results[count]["found"]:
            bci_value = bci_results[count]["fields"]["bci_day_last"][0]
            item.append(normalize_index(bci_value, history_max["max_bci"]))
        else:
            item.append(0)
        if sensitive_results[count]["found"]:
            sensitive_value = sensitive_results[count]['fields']['last_value'][0]
            item.append(normalize_index(sensitive_value, history_max["max_sensitive"]))
        else:
            item.append(0)
        user_profile_list.append(item)
    return results, user_profile_list
def full_text_search(keywords, uid, start_time, end_time, size):
    results = []
    uid_list = []
    user_profile_list = []
    query_body = {
        "query": {
            "bool": {
                "must": []
            }
        },
        "size": size,
        "sort": {"timestamp": {"order": 'desc'}}
    }
    if RUN_TYPE:
        query_body["sort"] = {"user_fansnum": {"order": 'desc'}}
    if uid:
        query_body["query"]["bool"]["must"].append({"term": {"uid": uid}})
    if keywords:
        keywords_list = keywords.split(',')
        for word in keywords_list:
            query_body["query"]["bool"]["must"].append({'wildcard': {'text': {'wildcard': '*'+word+'*'}}})
    index_list = []
    exist_bool = es_flow_text.indices.exists(index="flow_text_"+end_time)
    if start_time:
        start_ts = datetime2ts(start_time)
        end_ts = datetime2ts(end_time)
        ts = end_ts
        while 1:
            index_name = "flow_text_" + ts2datetime(ts)
            exist_bool = es_flow_text.indices.exists(index=index_name)
            if exist_bool:
                index_list.append(index_name)
            if ts == start_ts:
                break
            else:
                ts -= 3600*24
    print index_list
    # no usable flow_text index
    if not index_list:
        return [[], []]
    search_results = es_flow_text.search(index=index_list, doc_type="text", body=query_body)["hits"]["hits"]
    for item in search_results:
        uid_list.append(item['_source']['uid'])
    user_info = []
    if uid_list:
        history_max = get_history_max()
        personal_field = ["nick_name", "fansnum", "statusnum", "user_location"]
        user_info = get_user_profile(uid_list, personal_field)
        bci_results = ES_CLUSTER_FLOW1.mget(index="bci_history", doc_type="bci", body={"ids": uid_list}, _source=False, fields=["bci_day_last"])["docs"]
        in_portrait = es_user_portrait.mget(index="sensitive_user_portrait", doc_type="user", body={"ids": uid_list}, _source=False)["docs"]
        sensitive_results = es_sensitive_history.mget(index="sensitive_history", doc_type="sensitive", body={"ids": uid_list}, _source=False, fields=["last_value"])["docs"]
    print "len search: ", len(search_results)
    count = 0
    # uid, uname, text, date, geo, sensitive_words, retweeted, comment
    for item in search_results:
        item = item['_source']
        uid_list.append(item['uid'])
        iter_item = []
        iter_item.append(item['uid'])
        iter_item.append(user_info[count][1])
        iter_item.append(item['text'])
        iter_item.append(ts2date(item['timestamp']))
        iter_item.append(item['geo'])
        if item.get("sensitive_words_string", ''):
            iter_item.append(item['sensitive_words_string'].split('&'))
        else:
            iter_item.append([])
        iter_item.append(item.get('retweeted', 0))
        iter_item.append(item.get('comment', 0))
        count += 1
        results.append(iter_item)
    user_set = set()
    count = 0
    # uid, "nick_name", "fansnum", "statusnum", "user_location", bci, sensitive, in_portrait
    for item in user_info:
        if item[0] in user_set:
            continue
        else:
            user_set.add(item[0])
        if bci_results[count]["found"]:
            if bci_results[count].has_key("fields"):
                bci_value = bci_results[count]["fields"]["bci_day_last"][0]
            else:
                bci_value = 0
            item.append(normalize_index(bci_value, history_max["max_bci"]))
        else:
            item.append(0)
        if sensitive_results[count]["found"]:
            if sensitive_results[count].has_key("fields"):
                sensitive_value = sensitive_results[count]['fields']['last_value'][0]
            else:
                sensitive_value = 0
            item.append(normalize_index(sensitive_value, history_max["max_sensitive"]))
        else:
            item.append(0)
        if in_portrait[count]["found"]:
            item.append("1")
        else:
            item.append("0")
        user_profile_list.append(item)
    return results, user_profile_list
def compute_mid_result(task_name, task_submit_date):
    result = {'count_0':{}, 'count_1':{}, 'sentiment_0_126':{}, 'sentiment_0_127':{}, 'sentiment_0_128':{}, \
            'sentiment_0_129':{}, 'sentiment_0_130':{}, 'sensitive_score':{}, 'geo_0':{}, 'geo_1':{}, \
            'hashtag_0':{}, 'hashtag_1':{}, 'sentiment_1_126':{}, 'sentiment_1_127':{}, \
            'sentiment_1_128':{}, 'sentiment_1_129':{}, 'sentiment_1_130':{}}
    #geo & hashtag: day
    #other: 15min
    search_time_segment = 3600 * 4
    #start_ts = datetime2ts(task_submit_date)
    start_ts = date2ts(task_submit_date)
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    #test
    now_ts = datetime2ts('2013-09-08')
    date_ts = datetime2ts(now_date)
    segment = int((now_ts - date_ts) / 900) + 1
    end_ts = date_ts + segment * 900
    #every search time-range: 4 hour----bulk action to search
    begin_ts = start_ts
    while True:
        if begin_ts >= end_ts:
            break
        compute_ts = ts2date(begin_ts)
        query_body = {'range': {'timestamp': {'from': begin_ts, 'to': begin_ts+search_time_segment}}}
        try:
            mid_result_list = es.search(index=monitor_index_name, doc_type=task_name, body={'query': query_body, 'size': 100000, 'sort': [{'timestamp': {'order': 'asc'}}]})['hits']['hits']
        except Exception, e:
            raise e
        if mid_result_list:
            for mid_result_item in mid_result_list:
                result_item = mid_result_item['_source']
                timestamp = result_item['timestamp']
                #attr_count
                count_dict = json.loads(result_item['count'])
                for sensitive in count_dict:
                    count_key = 'count_' + sensitive
                    result[count_key][str(timestamp)] = count_dict[sensitive]
                #attr_sentiment
                sensitive_sentiment_dict = json.loads(result_item['sentiment'])
                for sensitive in sensitive_sentiment_dict:
                    sentiment_dict = sensitive_sentiment_dict[sensitive]
                    for sentiment in sentiment_dict:
                        sentiment_key = 'sentiment_' + sensitive + '_' + sentiment
                        result[sentiment_key][str(timestamp)] = sentiment_dict[sentiment]
                #attr_sensitive_score
                if 'sensitive_word' in result_item:
                    sensitive_word_dict = json.loads(result_item['sensitive_word'])
                else:
                    sensitive_word_dict = {}
                ts_word_score = 0
                for word in sensitive_word_dict:
                    search_word = word.encode('utf-8')
                    try:
                        word_identify = json.loads(word_r.hget('sensitive_words', search_word))
                    except:
                        word_identify = [2]
                    ts_word_score += sensitive_word_dict[word] * word_identify[0]
                result['sensitive_score'][str(timestamp)] = ts_word_score
                #attr_geo
                timestamp_date = ts2datetime(timestamp)
                sensitive_geo_dict = json.loads(result_item['geo'])
                for sensitive in sensitive_geo_dict:
                    if timestamp_date not in result['geo_'+sensitive]:
                        result['geo_'+sensitive][timestamp_date] = {}
                    geo_dict = sensitive_geo_dict[sensitive]
                    for geo in geo_dict:
                        try:
                            result['geo_'+sensitive][timestamp_date][geo] += geo_dict[geo]
                        except:
                            result['geo_'+sensitive][timestamp_date][geo] = geo_dict[geo]
                #attr_hashtag
                if 'hashtag' in result_item:
                    sensitive_hashtag_dict = json.loads(result_item['hashtag'])
                else:
                    sensitive_hashtag_dict = {}
                    result['hashtag_0'][timestamp_date] = {}
                    result['hashtag_1'][timestamp_date] = {}
                for sensitive in sensitive_hashtag_dict:
                    if timestamp_date not in result['hashtag_'+sensitive]:
                        result['hashtag_'+sensitive][timestamp_date] = {}
                    hashtag_dict = sensitive_hashtag_dict[sensitive]
                    for hashtag in hashtag_dict:
                        try:
                            result['hashtag_'+sensitive][timestamp_date][hashtag] += hashtag_dict[hashtag]
                        except:
                            result['hashtag_'+sensitive][timestamp_date][hashtag] = hashtag_dict[hashtag]
        begin_ts += search_time_segment
    return result
    query_body.append({'bool': {'should': nest_body_list}})
    # range search: timestamp ~ timestamp+900
    query_body.append({'range': {'timestamp': {'from': timestamp, 'to': timestamp+900}}})
    # match search: sensitive_status
    query_body.append({'term': {'sensitive': sensitive_status}})
    try:
        weibo_result = es.search(index=text_index_name, doc_type=text_index_type, \
                body={'query': {'bool': {'must': query_body}}, 'sort': [{'timestamp': {'order': 'asc'}}], 'size': 10000})['hits']['hits']
    except Exception, e:
        raise e
    for weibo_item in weibo_result:
        weibo_dict = weibo_item['_source']
        uid = weibo_dict['uid']
        uname = uid2uname(uid)
        timestamp = weibo_dict['timestamp']
        date = ts2date(timestamp)
        geo = weibo_dict['geo']
        text = weibo_dict['text']
        result.append([uid, uname, date, geo, text])
    return result

# show weibo when click sentiment node
def get_sentiment_weibo(task_name, sentiment, timestamp):
    result = []
    #step1: get task user
    #step2: search weibo by condition: task_user, sensitive_status, sentiment, timestamp
    task_exist = identify_task(task_name)
    if not task_exist:
        return 'the task is not exist'
    task_user = task_exist['uid_list']
def get_origin_weibo_detail(ts, user, task_name, size, order, message_type=1):
    _id = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']

    mid_value = json.loads(task_detail['mid_topic_value'])
    duplicate_dict = json.loads(task_detail['duplicate_dict'])
    tmp_duplicate_dict = dict()
    for k, v in duplicate_dict.iteritems():
        try:
            tmp_duplicate_dict[v].append(k)
        except:
            tmp_duplicate_dict[v] = [k, v]

    if message_type == 1:
        weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    elif message_type == 2:
        weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])
    else:
        weibo_detail = json.loads(task_detail['sensitive_weibo_detail'])
    weibo_detail_list = []
    if weibo_detail:
        for iter_mid, item in weibo_detail.iteritems():
            tmp = []
            tmp.append(iter_mid)
            tmp.append(item[iter_mid])
            tmp.append(item['retweeted'])
            tmp.append(item['comment'])
            weibo_detail_list.append(tmp)
    mid_list = weibo_detail.keys()

    results = []
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {"mid": mid_list}
                }
            }
        },
        "size": 1000,
        "sort": {"timestamp": {"order": "desc"}}
    }

    index_list = []
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts-DAY)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        index_list.append(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)
    if exist_es_1:
        index_list.append(index_name_1)

    if index_list and mid_list:
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []

    uid_list = []
    text_dict = dict()      # text info keyed by mid
    portrait_dict = dict()  # profile info keyed by uid
    sort_results = []
    mid_index_dict = dict()
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
            text_dict[item['_id']] = item['_source'] # _id is the mid
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids": uid_list}, fields=['nick_name', 'photo_url'])["docs"]
            for item in portrait_result:
                if item['found']:
                    portrait_dict[item['_id']] = {"nick_name": item["fields"]["nick_name"][0], "photo_url": item["fields"]["photo_url"][0]}
                else:
                    portrait_dict[item['_id']] = {"nick_name": item['_id'], "photo_url": ""}

        if order == "total":
            sorted_list = sorted(weibo_detail_list, key=lambda x: x[1], reverse=True)
        elif order == "retweeted":
            sorted_list = sorted(weibo_detail_list, key=lambda x: x[2], reverse=True)
        elif order == "comment":
            sorted_list = sorted(weibo_detail_list, key=lambda x: x[3], reverse=True)
        else:
            sorted_list = weibo_detail_list

        count_n = 0
        results_dict = dict()
        for item in sorted_list: # size
            mid = item[0]
            iter_text = text_dict.get(mid, {})
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, message_type,
            # retweeted, comment, sensitive, timestamp, topic value, mid
            if iter_text:
                uid = iter_text['uid']
                temp.append(uid)
                iter_portrait = portrait_dict.get(uid, {})
                if iter_portrait:
                    temp.append(iter_portrait['nick_name'])
                    temp.append(iter_portrait['photo_url'])
                else:
                    temp.extend([uid, ''])
                temp.append(iter_text["text"])
                temp.append(iter_text["sentiment"])
                temp.append(ts2date(iter_text['timestamp']))
                temp.append(iter_text['geo'])
                if message_type == 1:
                    temp.append(1)
                elif message_type == 2:
                    temp.append(3)
                else:
                    temp.append(iter_text['message_type'])
                temp.append(item[2])
                temp.append(item[3])
                temp.append(iter_text.get('sensitive', 0))
                temp.append(iter_text['timestamp'])
                temp.append(mid_value[mid])
                temp.append(mid)
                results.append(temp)
            count_n += 1

    # sort by sensitive flag, then topic value, then retweeted count
    results = sorted(results, key=operator.itemgetter(-4, -2, -6), reverse=True)
    sort_results = []
    count = 0
    for item in results:
        sort_results.append([item])
        mid_index_dict[item[-1]] = count
        count += 1

    if tmp_duplicate_dict:
        remove_list = []
        value_list = tmp_duplicate_dict.values() # [[mid, mid], ...]
        for item in value_list:
            tmp = []
            for mid in item:
                if mid_index_dict.get(mid, 0):
                    tmp.append(mid_index_dict[mid])
            if len(tmp) > 1:
                tmp_min = min(tmp)
            else:
                continue
            tmp.remove(tmp_min)
            for iter_count in tmp:
                sort_results[tmp_min].extend(sort_results[iter_count])
                remove_list.append(sort_results[iter_count])
        if remove_list:
            for item in remove_list:
                sort_results.remove(item)

    return sort_results
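# Illustration (not part of the original module): duplicate_dict maps each near-duplicate
# mid to a representative mid; get_origin_weibo_detail inverts it so that duplicate rows
# can be folded into the highest-ranked row of each group. A minimal sketch with
# hypothetical mids:
_duplicate_dict = {'mid_b': 'mid_a', 'mid_c': 'mid_a'}
_grouped = {}
for _k, _v in _duplicate_dict.items():
    _grouped.setdefault(_v, [_v]).append(_k)
# _grouped == {'mid_a': ['mid_a', 'mid_b', 'mid_c']}; in the function above, the rows
# for mid_b and mid_c are appended onto mid_a's row in sort_results and then removed.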
def get_positive_weibo_detail(ts, social_sensors, keywords_list, size, sentiment_type=1):
    former_mid_list = query_mid_list(ts-time_interval, keywords_list, time_segment, social_sensors) # weibo mid list from the previous time window
    current_mid_list = query_mid_list(ts, keywords_list, time_interval, social_sensors)
    mid_list = []
    mid_list.extend(former_mid_list)
    mid_list.extend(current_mid_list)

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"range": {
                                "timestamp": {
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }},
                        ],
                        "should": [
                            {"terms": {"root_mid": mid_list}},
                            {"terms": {"mid": mid_list}},
                            {"terms": {"keywords_string": keywords_list}}
                        ]
                    }
                }
            }
        },
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100
    }

    #if social_sensors and int(sentiment_type) == 1:
    #    query_body["query"]["filtered"]["filter"]["bool"]["must"].append({"terms": {"uid": social_sensors}})

    if int(sentiment_type) == 1 or int(sentiment_type) == 0:
        query_body["query"]["filtered"]["filter"]["bool"]["must"].append({"term": {"sentiment": sentiment_type}})
    else:
        query_body["query"]["filtered"]["filter"]["bool"]["must"] = [{"terms": {"sentiment": ["2", "3"]}}]

    # check whether ts and ts-time_interval fall on the same day, to decide which ES index to query
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts-time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)

    # 1. aggregate original weibo mid list
    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    elif datetime != datetime_1 and exist_es_1:
        search_results = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []

    uid_list = []
    results = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids": uid_list}, fields=['nick_name', 'photo_url'])["docs"]

        for i in range(len(uid_list)):
            item = search_results[i]['_source']
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, common_keywords, message_type
            temp.append(item['uid'])
            if portrait_result[i]['found']:
                temp.append(portrait_result[i]["fields"]["nick_name"][0])
                temp.append(portrait_result[i]["fields"]["photo_url"][0])
            else:
                temp.append("unknown")
                temp.append("")
            temp.append(item["text"])
            temp.append(item["sentiment"])
            temp.append(ts2date(item['timestamp']))
            temp.append(item['geo'])
            keywords_set = set(item['keywords_string'].split('&'))
            common_keywords = set(keywords_list) & keywords_set
            temp.append(list(common_keywords))
            temp.append(item['message_type'])
            results.append(temp)

    return results
def get_retweet_weibo_detail(ts, user, task_name, size, text_type, type_value):
    _id = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    origin_weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    retweeted_weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])

    mid_list = []
    mid_list.extend(origin_weibo_detail.keys())
    mid_list.extend(retweeted_weibo_detail.keys())

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"range": {
                                "timestamp": {
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }},
                            {"terms": {"root_mid": mid_list}}
                        ]
                    }
                }
            }
        },
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100
    }

    if text_type == "message_type":
        query_body['query']['filtered']['filter']['bool']['must'].append({"term": {text_type: type_value}})
    if text_type == "sentiment":
        #if isinstance(type_value, str):
        if len(type_value) == 1:
            query_body['query']['filtered']['filter']['bool']['must'].append({"term": {text_type: type_value}})
        else:
            query_body['query']['filtered']['filter']['bool']['must'].append({"terms": {text_type: type_value}})

    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts-time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)

    # 1. query weibo
    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    elif datetime != datetime_1 and exist_es_1:
        search_results = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []

    # 2. get weibo-related information
    results = []
    uid_list = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids": uid_list}, fields=['nick_name', 'photo_url'])["docs"]

        for i in range(len(uid_list)):
            item = search_results[i]['_source']
            temp = []
            # uid, nick_name, photo_url, text, sentiment, timestamp, geo, message_type
            temp.append(item['uid'])
            if portrait_result[i]['found']:
                temp.append(portrait_result[i]["fields"]["nick_name"][0])
                temp.append(portrait_result[i]["fields"]["photo_url"][0])
            else:
                temp.append(item['uid'])
                temp.append("")
            temp.append(item["text"])
            temp.append(item["sentiment"])
            temp.append(ts2date(item['timestamp']))
            temp.append(item['geo'])
            temp.append(item["message_type"])
            results.append(temp)

    return results