def get_opinions(task_source, task_id, xnr_user_no, opinion_keywords_list, opinion_type, intel_type): query_item = 'text' nest_query_list = [] tweets_list = [] if task_source == 'weibo': if S_TYPE == 'test': current_time = datetime2ts(S_DATE) else: current_time = int(time.time()) index_name_list = get_flow_text_index_list(current_time, days=5) sort_item = 'retweeted' for keyword in opinion_keywords_list: nest_query_list.append( {'wildcard': { query_item: '*' + keyword + '*' }}) uid_list = [] if len(nest_query_list) == 1: SHOULD_PERCENT = 1 else: SHOULD_PERCENT = 1 if intel_type == 'all': query_body = { 'query': { 'bool': { 'should': nest_query_list, 'minimum_should_match': SHOULD_PERCENT } }, 'sort': { sort_item: { 'order': 'desc' } }, 'size': MAX_SEARCH_SIZE } elif intel_type == 'follow': try: follow_results = es_xnr.get(index=weibo_xnr_fans_followers_index_name,doc_type=weibo_xnr_fans_followers_index_type,\ id=xnr_user_no)['_source'] if follow_results: for follow_result in follow_results: uid_list = follow_result['_source']['followers'] except: uid_list = [] query_body = { 'query': { 'bool': { 'should': nest_query_list, 'minimum_should_match': SHOULD_PERCENT, 'must': [{ 'terms': { 'uid': uid_list } }] } }, 'sort': { sort_item: { 'order': 'desc' } }, 'size': MAX_SEARCH_SIZE } elif intel_type == 'influence': date = ts2datetime(current_time) if S_TYPE == 'test': date = S_DATE_BCI weibo_bci_index_name = weibo_bci_index_name_pre + date[:4] + date[ 5:7] + date[8:10] query_body_bci = { 'query': { 'match_all': {} }, 'sort': { 'user_index': { 'order': 'desc' } }, 'size': 500 } weino_bci_results = es_user_portrait.search( index=weibo_bci_index_name, doc_type=weibo_bci_index_type, body=query_body_bci)['hits']['hits'] if weino_bci_results: for bci_result in weino_bci_results: uid = bci_result['_source']['user'] uid_list.append(uid) query_body = { 'query': { 'bool': { 'should': nest_query_list, 'minimum_should_match': SHOULD_PERCENT, 'must': [{ 'terms': { 'uid': uid_list } }] } }, 'sort': { sort_item: { 'order': 'desc' } }, 'size': MAX_SEARCH_SIZE } else: query_sensitive = { 'query': { 'match_all': {} }, "aggs": { "uids": { "terms": { "field": "uid", "order": { "avg_sensitive": "desc" } }, "aggs": { "avg_sensitive": { "avg": { "field": "sensitive" } } } } }, 'size': 500 } es_sensitive_result = es_flow_text.search(index=index_name_list,doc_type='text',\ body=query_sensitive)['aggregations']['uids']['buckets'] for item in es_sensitive_result: uid = item['key'] uid_list.append(uid) query_body = { 'query': { 'bool': { 'should': nest_query_list, 'minimum_should_match': SHOULD_PERCENT, 'must': [{ 'terms': { 'uid': uid_list } }] } }, 'sort': { sort_item: { 'order': 'desc' } }, 'size': MAX_SEARCH_SIZE } # 得到tweets_list tweets_results = es_flow_text.search(index=index_name_list, doc_type='text', body=query_body)['hits']['hits'] if tweets_results: for item in tweets_results: item = item['_source'] weibo = item['text'] tweets_list.append(weibo) else: if S_TYPE == 'test': current_time = datetime2ts(S_DATE_FB) else: current_time = int(time.time()) uid_list = [] sort_item = 'share' opinion_keywords_list = [ word.encode('utf-8') for word in opinion_keywords_list ] en_keywords_list = trans(opinion_keywords_list, target_language='en') for i in range(len(opinion_keywords_list)): keyword = opinion_keywords_list[i].decode('utf-8') traditional_keyword = simplified2traditional(keyword) if len(en_keywords_list) == len(opinion_keywords_list): #确保翻译没出错 en_keyword = en_keywords_list[i] nest_query_list.append( {'wildcard': { 
query_item: '*' + en_keyword + '*' }}) nest_query_list.append( {'wildcard': { query_item: '*' + keyword + '*' }}) nest_query_list.append( {'wildcard': { query_item: '*' + traditional_keyword + '*' }}) if len(nest_query_list) == 1: SHOULD_PERCENT = 1 else: SHOULD_PERCENT = 1 if task_source == 'facebook': index_name_list = fb_get_flow_text_index_list(current_time, days=5) if intel_type == 'all': query_body = { 'query': { 'bool': { 'should': nest_query_list, 'minimum_should_match': SHOULD_PERCENT } }, 'sort': { sort_item: { 'order': 'desc' } }, 'size': MAX_SEARCH_SIZE } elif intel_type == 'follow': try: follow_results = es_xnr.get(index=fb_xnr_fans_followers_index_name,doc_type=fb_xnr_fans_followers_index_type,\ id=xnr_user_no)['_source'] if follow_results: for follow_result in follow_results: uid_list = follow_result['_source']['fans_list'] except: uid_list = [] query_body = { 'query': { 'bool': { 'should': nest_query_list, 'minimum_should_match': SHOULD_PERCENT, 'must': [{ 'terms': { 'uid': uid_list } }] } }, 'sort': { sort_item: { 'order': 'desc' } }, 'size': MAX_SEARCH_SIZE } elif intel_type == 'influence': fb_bci_index_name = fb_bci_index_name_pre + ts2datetime( current_time) query_body_bci = { 'query': { 'match_all': {} }, 'sort': { 'influence': { 'order': 'desc' } }, 'size': 500 } fb_bci_results = es_xnr.search( index=fb_bci_index_name, doc_type=fb_bci_index_type, body=query_body_bci)['hits']['hits'] #print 'fb_bci_results...',len(fb_bci_results) if fb_bci_results: for bci_result in fb_bci_results: uid = bci_result['_source']['uid'] uid_list.append(uid) query_body = { 'query': { 'bool': { 'should': nest_query_list, 'minimum_should_match': SHOULD_PERCENT, 'must': [{ 'terms': { 'uid': uid_list } }] } }, 'sort': { sort_item: { 'order': 'desc' } }, 'size': MAX_SEARCH_SIZE } else: query_sensitive = { 'query': { 'match_all': {} }, "aggs": { "uids": { "terms": { "field": "uid", "order": { "avg_sensitive": "desc" } }, "aggs": { "avg_sensitive": { "avg": { "field": "sensitive" } } } } }, 'size': 500 } es_sensitive_result = es_xnr.search(index=index_name_list,doc_type='text',\ body=query_sensitive)['aggregations']['uids']['buckets'] #print 'es_sensitive_result...',len(es_sensitive_result) for item in es_sensitive_result: uid = item['key'] uid_list.append(uid) query_body = { 'query': { 'bool': { 'should': nest_query_list, 'minimum_should_match': SHOULD_PERCENT, 'must': [{ 'terms': { 'uid': uid_list } }] } }, 'sort': { sort_item: { 'order': 'desc' } }, 'size': MAX_SEARCH_SIZE } #print 'query_body...',query_body tweets_results = es_xnr.search(index=index_name_list, doc_type='text', body=query_body)['hits']['hits'] if tweets_results: for item in tweets_results: item = item['_source'] weibo = item['text'] tweets_list.append(weibo) else: index_name_list = tw_get_flow_text_index_list(current_time, days=5) if intel_type == 'all': query_body = { 'query': { 'bool': { 'should': nest_query_list, 'minimum_should_match': SHOULD_PERCENT } }, 'sort': { sort_item: { 'order': 'desc' } }, 'size': MAX_SEARCH_SIZE } elif intel_type == 'follow': try: follow_results = es_xnr.get(index=tw_xnr_fans_followers_index_name,doc_type=tw_xnr_fans_followers_index_type,\ id=xnr_user_no)['_source'] if follow_results: for follow_result in follow_results: uid_list = follow_result['_source'][ 'followers_list'] except: uid_list = [] query_body = { 'query': { 'bool': { 'should': nest_query_list, 'minimum_should_match': SHOULD_PERCENT, 'must': [{ 'terms': { 'uid': uid_list } }] } }, 'sort': { sort_item: { 'order': 'desc' } }, 'size': 
MAX_SEARCH_SIZE } elif intel_type == 'influence': tw_bci_index_name = tw_bci_index_name_pre + ts2datetime( current_time) query_body_bci = { 'query': { 'match_all': {} }, 'sort': { 'influence': { 'order': 'desc' } }, 'size': 500 } tw_bci_results = es_xnr.search( index=tw_bci_index_name, doc_type=tw_bci_index_type, body=query_body_bci)['hits']['hits'] if tw_bci_results: for bci_result in tw_bci_results: uid = bci_result['_source']['uid'] uid_list.append(uid) query_body = { 'query': { 'bool': { 'should': nest_query_list, 'minimum_should_match': SHOULD_PERCENT, 'must': [{ 'terms': { 'uid': uid_list } }] } }, 'sort': { sort_item: { 'order': 'desc' } }, 'size': MAX_SEARCH_SIZE } else: query_sensitive = { 'query': { 'match_all': {} }, "aggs": { "uids": { "terms": { "field": "uid", "order": { "avg_sensitive": "desc" } }, "aggs": { "avg_sensitive": { "avg": { "field": "sensitive" } } } } }, 'size': 500 } es_sensitive_result = es_xnr.search(index=index_name_list,doc_type='text',\ body=query_sensitive)['aggregations']['uids']['buckets'] for item in es_sensitive_result: uid = item['key'] uid_list.append(uid) query_body = { 'query': { 'bool': { 'should': nest_query_list, 'minimum_should_match': SHOULD_PERCENT, 'must': [{ 'terms': { 'uid': uid_list } }] } }, 'sort': { sort_item: { 'order': 'desc' } }, 'size': MAX_SEARCH_SIZE } print 'index_name_list...', index_name_list print 'query_body........', query_body tweets_results = es_xnr.search(index=index_name_list, doc_type='text', body=query_body)['hits']['hits'] if tweets_results: for item in tweets_results: item = item['_source'] weibo = item['text'] tweets_list.append(weibo) if tweets_list: opinion_name, word_result, text_list = opinion_main(tweets_list, k_cluster=5) sub_opinion_results = dict() topic_keywords_list = [] summary_text_list = [] for topic, text in text_list.iteritems(): topic_name = opinion_name[topic] sub_opinion_results[topic_name] = text[:SUB_OPINION_WEIBO_LIMIT] topic_keywords_list.extend(topic_name.split('&')) summary_text_list.extend(text) #try: print 'summary_text_list..', len(summary_text_list) print 'topic_keywords_list..', topic_keywords_list summary = text_generation_main(summary_text_list, topic_keywords_list) #summary = summary_main(summary_text_list) #except: # summary = '' else: sub_opinion_results = {} summary = '' print '开始保存子观点计算结果......' print 'summary....', summary mark = save_intelligent_opinion_results(task_id, sub_opinion_results, summary, intel_type) return mark
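# The keyword matching in get_opinions() builds one ES wildcard clause per keyword and
# puts them under a bool 'should' with 'minimum_should_match', so a post only needs to
# contain one keyword to be returned. A minimal, self-contained sketch of that pattern
# (the client endpoint, index name and field below are placeholders, not this project's
# config):
from elasticsearch import Elasticsearch

def build_keyword_query(keywords, field='text', size=100, sort_field='retweeted'):
    # one wildcard clause per keyword; '*kw*' means "text contains kw"
    should_clauses = [{'wildcard': {field: '*' + kw + '*'}} for kw in keywords]
    return {
        'query': {
            'bool': {
                'should': should_clauses,
                'minimum_should_match': 1,   # at least one keyword must match
            }
        },
        'sort': {sort_field: {'order': 'desc'}},
        'size': size,
    }

# usage sketch:
# es = Elasticsearch(['http://localhost:9200'])          # placeholder endpoint
# body = build_keyword_query([u'keyword1', u'keyword2'])
# hits = es.search(index='flow_text_2016-11-27', body=body)['hits']['hits']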
def read_tracing_followers_tweet(): if S_TYPE == 'test': query_body = { 'query': { 'term': { 'xnr_user_no': 'WXNR0004' } }, 'size': MAX_SEARCH_SIZE } else: query_body = {'query': {'match_all': {}}, 'size': MAX_SEARCH_SIZE} results = es_xnr.search(index=weibo_xnr_fans_followers_index_name,doc_type=weibo_xnr_fans_followers_index_type,\ body=query_body)['hits']['hits'] if results: for result in results: result = result['_source'] #print 'result..',result try: xnr_user_no = result['xnr_user_no'] except: xnr_user_no = result['xnr_use_no'] if not xnr_user_no: continue print 'result...', result trace_follow_list = result['trace_follow_list'] if S_TYPE == 'test': current_time = datetime2ts(S_DATE) #trace_follow_list = TRACE_FOLLOW_LIST else: current_time = int(time.time()) current_date = ts2datetime(current_time) flow_text_index_name = flow_text_index_name_pre + current_date query_body_flow = { 'query': { 'filtered': { 'filter': { 'terms': { 'uid': trace_follow_list } } } }, 'size': MAX_SEARCH_SIZE } results_flow = es_flow_text.search(index=flow_text_index_name,doc_type=flow_text_index_type,\ body=query_body_flow)['hits']['hits'] if results_flow: for result_flow in results_flow: result_flow = result_flow['_source'] mid = result_flow['mid'] #先判断 之前是否已经存过该mid task_id = xnr_user_no + '_' + mid try: # 如果已添加则跳过 es_xnr.get(index=weibo_xnr_retweet_timing_list_index_name,doc_type=\ weibo_xnr_retweet_timing_list_index_type,id=task_id)['_source'] continue except: # 如果未添加过则加入列表 task_detail = {} task_detail['xnr_user_no'] = xnr_user_no task_detail['mid'] = mid task_detail['text'] = result_flow['text'] task_detail['uid'] = result_flow['uid'] task_detail['nick_name'], task_detail[ 'photo_url'] = uid2nick_name_photo( result_flow['uid']) task_detail['timestamp'] = result_flow['timestamp'] task_detail['timestamp_set'] = result_flow[ 'timestamp'] + random.randint( RETWEET_START_TS, RETWEET_END_TS) task_detail['compute_status'] = 0 es_xnr.index(index=weibo_xnr_retweet_timing_list_index_name,doc_type=\ weibo_xnr_retweet_timing_list_index_type,body=task_detail,id=task_id)
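# read_tracing_followers_tweet() avoids queueing the same weibo twice by using a
# deterministic document id (xnr_user_no + '_' + mid) and probing es.get() before
# indexing. A compact sketch of that "insert only if absent" pattern (the index name,
# doc type and client endpoint are placeholders for illustration):
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import NotFoundError

def index_if_absent(es, index_name, doc_type, doc_id, doc):
    """Index doc under doc_id only if no document with that id exists yet."""
    try:
        es.get(index=index_name, doc_type=doc_type, id=doc_id)
        return False                      # already present, skip
    except NotFoundError:
        es.index(index=index_name, doc_type=doc_type, id=doc_id, body=doc)
        return True

# usage sketch:
# es = Elasticsearch(['http://localhost:9200'])
# task_id = 'WXNR0004' + '_' + 'some_mid'
# index_if_absent(es, 'weibo_xnr_retweet_timing_list', 'text', task_id,
#                 {'xnr_user_no': 'WXNR0004', 'mid': 'some_mid', 'compute_status': 0})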
def read_flow_text(flow_text_index_name, current_date):
    #flow_text_index_name = facebook_flow_text_index_name_pre + current_date
    i = 0
    label_count_dict = {}
    content_dict = {}
    print '!!!'
    while True:
        # original posts only, sensitive == 0
        query_body = {
            'query': {'bool': {'must': [{'term': {'sensitive': 0}}]}},
            'size': 1000,
            'from': i * 1000
        }
        search_results = es_xnr.search(index=flow_text_index_name, doc_type=facebook_flow_text_index_type,
                                       body=query_body)['hits']['hits']
        if not search_results:
            # no more pages to read, stop even if some labels are still short of the quota
            break
        weibo_list = []
        for result in search_results:
            result = result['_source']
            weibo_list.append(result['text'].encode('utf-8'))
        label_list = triple_classifier_new(weibo_list)
        for j in range(len(search_results)):
            label = label_list[j]
            try:
                if label_count_dict[label] < 20:
                    content_dict[label].append(search_results[j]['_source'])
                    label_count_dict[label] += 1
            except:
                content_dict[label] = [search_results[j]['_source']]
                label_count_dict[label] = 1
        i += 1
        if i % 1000 == 0:
            print 'i...', i
            print 'label_count_dict...', label_count_dict
        # stop once every label collected so far has at least 20 posts
        if label_count_dict and min(label_count_dict.values()) >= 20:
            break
    print 'label_count_dict::', label_count_dict
    for content_label, content_weibo in content_dict.iteritems():
        _id = content_label
        index_name = fb_daily_interest_index_name_pre + '_' + current_date
        fb_daily_inerests_flow_text_mappings(index_name)
        item_dict = {}
        item_dict['timestamp'] = datetime2ts(current_date)
        item_dict['content'] = json.dumps(content_weibo)
        print es_xnr.index(index=index_name, doc_type=fb_daily_interest_index_type, id=_id, body=item_dict)
        print content_label, '====', len(content_weibo)
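# read_flow_text() pages through results 1000 at a time and keeps at most 20 posts per
# predicted label. The core "quota per label" bookkeeping can be isolated like this
# (pure Python, no ES involved; 'classify' stands in for triple_classifier_new and
# 'batches' for the paged search results):
from collections import defaultdict

def collect_by_label(batches, classify, quota=20):
    """batches: iterable of lists of documents; classify: list of texts -> list of labels."""
    content = defaultdict(list)
    counts = defaultdict(int)
    for docs in batches:
        labels = classify([d['text'] for d in docs])
        for doc, label in zip(docs, labels):
            if counts[label] < quota:
                content[label].append(doc)
                counts[label] += 1
        # stop once every label seen so far has reached its quota
        if counts and min(counts.values()) >= quota:
            break
    return dict(content), dict(counts)

# usage sketch:
# content, counts = collect_by_label(paged_batches, triple_classifier_new, quota=20)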
def get_trace_follow_operate(xnr_user_no, uid_string, nick_name_string):
    mark = False
    fail_nick_name_list = []
    if uid_string:
        uid_list = uid_string.encode('utf-8').split(',')
    elif nick_name_string:
        nick_name_list = nick_name_string.encode('utf-8').split(',')
        uid_list = []
        for nick_name in nick_name_list:
            query_body = {
                'query': {'filtered': {'filter': {'term': {'nick_name': nick_name}}}},
                '_source': ['uid']
            }
            try:
                uid_results = es.search(index=facebook_user_index_name, doc_type=facebook_user_index_type,
                                        body=query_body)['hits']['hits']
                uid_result = uid_results[0]['_source']   # fix: was "uid_result[0]", an undefined name
                uid = uid_result['uid']
                uid_list.append(uid)
            except:
                fail_nick_name_list.append(nick_name)
    else:
        uid_list = []
    try:
        result = es.get(index=fb_xnr_fans_followers_index_name, doc_type=fb_xnr_fans_followers_index_type,
                        id=xnr_user_no)['_source']
        try:
            trace_follow_list = result['trace_follow_list']
        except:
            trace_follow_list = []
        try:
            followers_list = result['fans_list']
        except:
            followers_list = []
        trace_follow_list = list(set(trace_follow_list) | set(uid_list))
        followers_list = list(set(followers_list) | set(uid_list))
        es.update(index=fb_xnr_fans_followers_index_name, doc_type=fb_xnr_fans_followers_index_type,
                  id=xnr_user_no,
                  body={'doc': {'trace_follow_list': trace_follow_list, 'fans_list': followers_list}})
        mark = True
    except:
        item_exists = {}
        item_exists['xnr_user_no'] = xnr_user_no
        item_exists['trace_follow_list'] = uid_list
        item_exists['fans_list'] = uid_list
        es.index(index=fb_xnr_fans_followers_index_name, doc_type=fb_xnr_fans_followers_index_type,
                 id=xnr_user_no, body=item_exists)
        mark = True
    return [mark, fail_nick_name_list]
def get_hot_sensitive_recommend_at_user(sort_item): if S_TYPE == 'test': now_ts = datetime2ts(S_DATE_FB) else: now_ts = int(time.time()) datetime = ts2datetime(now_ts - 24 * 3600) #sort_item = 'sensitive' sort_item_2 = 'timestamp' index_name = facebook_flow_text_index_name_pre + datetime query_body = { 'query': { 'match_all': {} }, 'sort': { sort_item: { 'order': 'desc' } }, 'size': HOT_EVENT_TOP_USER, '_source': ['uid', 'user_fansnum', 'retweeted', 'timestamp'] } # if sort_item == 'retweeted': # sort_item_2 = 'timestamp' # else: # sort_item_2 = 'retweeted' es_results = es.search(index=index_name, doc_type=facebook_flow_text_index_type, body=query_body)['hits']['hits'] uid_fansnum_dict = dict() if es_results: for result in es_results: result = result['_source'] uid = result['uid'] uid_fansnum_dict[uid] = {} uid_fansnum_dict[uid][sort_item_2] = result[sort_item_2] uid_fansnum_dict_sort_top = sorted(uid_fansnum_dict.items(), key=lambda x: x[1][sort_item_2], reverse=True) uid_set = set() for item in uid_fansnum_dict_sort_top: uid_set.add(item[0]) uid_list = list(uid_set) ## 根据uid,从weibo_user中得到 nick_name uid_nick_name_dict = dict() # uid不会变,而nick_name可能会变 es_results_user = es.mget(index=facebook_user_index_name, doc_type=facebook_user_index_type, body={'ids': uid_list})['docs'] i = 0 for result in es_results_user: if result['found'] == True: result = result['_source'] uid = result['uid'] nick_name = result['name'] if nick_name: i += 1 uid_nick_name_dict[uid] = nick_name if i >= HOT_AT_RECOMMEND_USER_TOP: break return uid_nick_name_dict
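# get_hot_sensitive_recommend_at_user() resolves uid -> nick_name in one round trip with
# mget and keeps only documents whose 'found' flag is true. A minimal sketch of that
# lookup, mirroring the legacy client calls used in this module (index/type names and
# the 'name' field are placeholders):
from elasticsearch import Elasticsearch

def mget_nick_names(es, index_name, doc_type, uid_list, limit=10):
    docs = es.mget(index=index_name, doc_type=doc_type, body={'ids': uid_list})['docs']
    uid_nick_name = {}
    for doc in docs:
        if doc.get('found'):
            source = doc['_source']
            if source.get('name'):
                uid_nick_name[source['uid']] = source['name']
        if len(uid_nick_name) >= limit:
            break
    return uid_nick_name

# usage sketch:
# es = Elasticsearch(['http://localhost:9200'])
# mget_nick_names(es, 'facebook_user', 'user', ['uid1', 'uid2'], limit=10)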
            d = r.get(self.wxbot_id)
            if d:
                try:
                    d = eval(d)  # parse the cached record once instead of calling eval() per field
                    wx_id = d['wx_id']
                    wxbot_port = d['wxbot_port']
                    submitter = d['submitter']
                    mail = d['mail']
                    access_id = d['access_id']
                    remark = d['remark']
                    break
                except Exception, e:
                    print e
        # check if the xnr already exists
        query_body_wx_exist = {'query': {'term': {'wx_id': wx_id}}}
        search_result = es_xnr.search(index=wx_xnr_index_name, doc_type=wx_xnr_index_type,
                                      body=query_body_wx_exist)['hits']['hits']
        if search_result:
            # update the existing xnr info and save it back to es
            pass
        else:
            #print 'save_bot_info'
            wxxnr_data = {
                'wx_id': wx_id,
                'puid': self.self.puid,
                'user_no': wxbot_id2user_no(self.wxbot_id),
                'xnr_user_no': self.wxbot_id,
                'wxbot_port': wxbot_port,
                'create_ts': int(time.time()),
                'nickname': self.self.name,
                'remark': remark,
def create_date_warning(today_datetime):
    query_body = {
        'query': {'match_all': {}},
        'size': MAX_VALUE,
        'sort': {'date_time': {'order': 'asc'}}
    }
    try:
        result = es_xnr.search(index=weibo_date_remind_index_name, doc_type=weibo_date_remind_index_type,
                               body=query_body)['hits']['hits']
        date_result = []
        for item in result:
            # how far away is the remind date
            date_time = item['_source']['date_time']
            year = ts2yeartime(today_datetime)
            warming_date = year + '-' + date_time
            today_date = ts2datetime(today_datetime)
            countdown_num = (datetime2ts(warming_date) - datetime2ts(today_date)) / DAY
            if abs(countdown_num) < WARMING_DAY:
                # query warning tweets with the keywords configured for this date
                print 'date_time:', date_time
                keywords = item['_source']['keywords']
                date_warming = lookup_twitter_date_warming(keywords, today_datetime)
                item['_source']['twitter_date_warming_content'] = json.dumps(date_warming)
                item['_source']['validity'] = 0
                item['_source']['timestamp'] = today_datetime
                task_id = str(item['_source']['create_time']) + '_' + str(today_datetime)
                #print 'task_id', task_id
                #print 'date_warming', date_warming
                # write to the database
                twitter_timing_warning_index_name = twitter_timing_warning_index_name_pre + warming_date
                mark = False
                if date_warming:
                    print twitter_timing_warning_index_name
                    try:
                        es_xnr_2.index(index=twitter_timing_warning_index_name,
                                       doc_type=twitter_timing_warning_index_type,  # fix: doc_type was passed the index name
                                       body=item['_source'], id=task_id)
                        mark = True
                    except:
                        mark = False
                date_result.append(mark)
    except:
        date_result = []
    return date_result
def xnr_keywords_compute(xnr_user_no): #查询好友列表 followers_list = lookup_xnr_concernedusers(xnr_user_no) lookup_condition_list = [] print 'xnr_user_no, followers_list:', xnr_user_no, followers_list lookup_condition_list.append({ 'filtered': { 'filter': { 'bool': { 'must': { 'terms': { 'uid': followers_list } } } } } }) #根据日期确定查询表 if S_TYPE == 'test': date_time = test_date else: now_time = int(time.time()) date_time = ts2datetime(now_time) flow_text_index_name = twitter_flow_text_index_name_pre + date_time #按日期统计 # print lookup_condition_list for item_condition in lookup_condition_list: query_body = { 'query': item_condition, 'aggs': { 'keywords': { 'terms': { 'field': 'keywords_string', 'size': 1000 } } } } flow_text_exist=es_xnr.search(index=flow_text_index_name,doc_type=twitter_flow_text_index_type,\ body=query_body)['aggregations']['keywords']['buckets'] # print 'flow_text_exist:',flow_text_exist word_dict = dict() word_dict_new = dict() keywords_string = '' for item in flow_text_exist: word = item['key'] count = item['doc_count'] word_dict[word] = count keywords_string += '&' keywords_string += item['key'] k_dict = extract_keywords(keywords_string) for item_item in k_dict: keyword = item_item.word # print 'keyword::',type(keyword) word_dict_new[keyword] = word_dict[keyword] return word_dict_new
def query_mid_list(ts, social_sensors, time_segment, message_type=1): query_body = { "query": { "filtered": { "filter": { "bool": { "must": [{ "range": { "timestamp": { "gte": ts - time_segment, "lt": ts } } }, { "terms": { "uid": social_sensors } }] } } } }, "sort": { "sentiment": { "order": "desc" } }, "size": 10000 } mid_dict = dict() datetime_1 = ts2datetime(ts) datetime_2 = ts2datetime(ts - 24 * 3600) index_name_1 = flow_text_index_name_pre + datetime_1 index_name_2 = flow_text_index_name_pre + datetime_2 index_list = [] exist_es_1 = es_text.indices.exists(index_name_1) exist_es_2 = es_text.indices.exists(index_name_2) if exist_es_1: index_list.append(index_name_1) if exist_es_2: index_list.append(index_name_2) if index_list: search_results = es_text.search(index=index_list, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"] else: search_results = [] origin_mid_list = set() if search_results: for item in search_results: origin_mid_list.add(item["_id"]) # if message_type == 1: # origin_mid_list.add(item["_id"]) # else: # origin_mid_list.add(item['_source']['root_mid']) # mid_dict[item['_source']['root_mid']] = item["_id"] # 源头微博和当前转发微博的mid # if message_type != 1: # # 保证获取的源头微博能在最近两天内找到 # filter_list = [] # filter_mid_dict = dict() # for iter_index in index_list: # exist_es = es_text.mget(index=iter_index, doc_type="text", body={"ids":list(origin_mid_list)})["docs"] # print 'es_text...',es_text # print 'index_list..',index_list # for item in exist_es: # if item["found"]: # filter_list.append(item["_id"]) # filter_mid_dict[item["_id"]] = mid_dict[item["_id"]] # origin_mid_list = filter_list # mid_dict = filter_mid_dict return list(origin_mid_list), mid_dict
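# query_mid_list() (and most of the query functions below) only searches the flow-text
# indices that actually exist for "today" and "yesterday". A small helper expressing
# that per-day index composition (the prefix and client are placeholders, and it assumes
# the same 'YYYY-MM-DD' suffix that ts2datetime produces in this project):
import time
from datetime import datetime
from elasticsearch import Elasticsearch

def existing_daily_indices(es, prefix, ts, days=2):
    names = []
    for i in range(days):
        day = datetime.fromtimestamp(ts - i * 24 * 3600).strftime('%Y-%m-%d')
        index_name = prefix + day
        if es.indices.exists(index=index_name):
            names.append(index_name)
    return names

# usage sketch:
# es = Elasticsearch(['http://localhost:9200'])
# index_list = existing_daily_indices(es, 'flow_text_', int(time.time()), days=2)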
def social_sensing(): all_tid_list, end_ts = count_statis() if S_TYPE == 'test': all_tid_list = ALL_TID_LIST index_list = [] for i in range(7): timestamp = end_ts - i * DAY flow_text_index_name = flow_text_index_name_pre + ts2datetime( timestamp) index_list.append(flow_text_index_name) #index_list = [flow_text_index_name_pre+date_1,flow_text_index_name_pre+date_2] print 'index_list...', index_list # 感知到的事, all_tid_list sensitive_text_list = [] tmp_sensitive_warning = "" text_dict = dict() # 文本信息 tid_value = dict() # 文本赋值 duplicate_dict = dict() # 重合字典 portrait_dict = dict() # 背景信息 classify_text_dict = dict() # 分类文本 classify_uid_list = [] classify_tid_list = [] duplicate_text_list = [] sensitive_words_dict = dict() sensitive_weibo_detail = {} all_text_dict = dict() tid_ts_dict = dict() # 文本发布时间 # 有事件发生时开始 #if 1: if index_list and all_tid_list: query_body = { "query": { "filtered": { "filter": { "terms": { "tid": all_tid_list } } } }, "size": 5000 } search_results = es.search(index=index_list, doc_type="text", body=query_body)['hits']['hits'] print "search tid len: ", len(search_results) if search_results: for item in search_results: iter_uid = item['_source']['uid'] iter_tid = item['_source']['tid'] tid_ts_dict[iter_tid] = item["_source"]["timestamp"] iter_text = item['_source']['text'].encode('utf-8', 'ignore') iter_sensitive = item['_source'].get('sensitive', 0) tmp_text = get_weibo(item['_source']) all_text_dict[iter_tid] = tmp_text duplicate_text_list.append({ "_id": iter_tid, "title": "", "content": iter_text.decode("utf-8", 'ignore') }) if iter_sensitive: tmp_sensitive_warning = signal_sensitive_variation #涉及到敏感词的微博 sensitive_words_dict[iter_tid] = iter_sensitive keywords_dict = json.loads(item['_source']['keywords_dict']) personal_keywords_dict = dict() for k, v in keywords_dict.iteritems(): k = k.encode('utf-8', 'ignore') personal_keywords_dict[k] = v classify_text_dict[iter_tid] = personal_keywords_dict #classify_uid_list.append(iter_uid) classify_tid_list.append(iter_tid) # 去重 print "start duplicate" if duplicate_text_list: dup_results = duplicate(duplicate_text_list) for item in dup_results: if item['duplicate']: duplicate_dict[item['_id']] = item['same_from'] # 分类 print "start classify" tid_value = dict() if classify_text_dict: #classify_results = topic_classfiy(classify_uid_list, classify_text_dict) classify_results = topic_classfiy(classify_tid_list, classify_text_dict) #print "classify_results: ", classify_results for k, v in classify_results.iteritems(): # tid:value #tid_value[k] = topic_value_dict[v[0]] tid_value[k] = v[0] # organize data tid_list = all_text_dict.keys() print "final tid:", len(tid_list) print "intersection: ", len(set(tid_list) & set(all_tid_list)) bulk_action = [] count = 0 #social_sensing_index_name = "tw_social_sensing_text_" + ts2datetime(end_ts) social_sensing_index_name = "tw_social_sensing_text" mappings_social_sensing_text(social_sensing_index_name) for tid in tid_list: iter_dict = dict() if duplicate_dict.has_key(tid): iter_dict["duplicate"] = duplicate_dict[tid] else: iter_dict["duplicate"] = "" iter_dict["compute_status"] = 0 # 尚未计算 iter_dict["topic_field"] = tid_value[tid] iter_dict["detect_ts"] = end_ts #iter_dict["xnr_user_no"] = xnr_user_no iter_dict.update(all_text_dict[tid]) count += 1 print 'iter_dict:::', iter_dict # _id = xnr_user_no + '_' + tid bulk_action.extend([{"index": {"_id": tid}}, iter_dict]) if count % 500 == 0: es.bulk(bulk_action, index=social_sensing_index_name, doc_type="text", timeout=600) bulk_action = [] if bulk_action: 
es.bulk(bulk_action, index=social_sensing_index_name, doc_type="text", timeout=600) return "1"
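# social_sensing() flushes documents to ES in batches of 500 with the low-level bulk API,
# where every document is preceded by an {"index": {"_id": ...}} action line. A
# self-contained sketch of that batching pattern (index/type names and the client are
# placeholders):
from elasticsearch import Elasticsearch

def bulk_index_in_batches(es, index_name, doc_type, docs_with_ids, batch_size=500):
    """docs_with_ids: iterable of (doc_id, doc) pairs."""
    actions = []
    count = 0
    for doc_id, doc in docs_with_ids:
        actions.extend([{'index': {'_id': doc_id}}, doc])
        count += 1
        if count % batch_size == 0:
            es.bulk(actions, index=index_name, doc_type=doc_type)
            actions = []
    if actions:
        es.bulk(actions, index=index_name, doc_type=doc_type)

# usage sketch:
# es = Elasticsearch(['http://localhost:9200'])
# bulk_index_in_batches(es, 'tw_social_sensing_text', 'text',
#                       [('tid1', {'text': '...'}), ('tid2', {'text': '...'})])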
def count_statis():
    end_ts = int(time.time())
    if S_TYPE == 'test':
        end_ts = datetime2ts(S_DATE_FB)
    start_ts = end_ts - 12 * 3600
    query_body = {
        'query': {
            'bool': {
                'must': [{'range': {'update_time': {'gt': start_ts, 'lte': end_ts}}}]
            }
        },
        'aggs': {
            'all_tids': {
                'terms': {
                    'field': 'tid',
                    'order': {'stats_share.max': 'desc'},
                    'size': MAX_SIZE
                },
                'aggs': {
                    'stats_share': {'stats': {'field': 'share'}}
                }
            }
        }
    }
    twitter_count_index_name_1 = twitter_count_index_name_pre + ts2datetime(end_ts)
    twitter_count_index_name_2 = twitter_count_index_name_pre + ts2datetime(end_ts - DAY)
    twitter_count_index_name_list = [twitter_count_index_name_1, twitter_count_index_name_2]
    print 'twitter_count_index_name_list...', twitter_count_index_name_list
    results = es.search(index=twitter_count_index_name_list, doc_type='text',
                        body=query_body)['aggregations']['all_tids']['buckets']
    results_origin = copy.deepcopy(results)
    print 'start count aggs sort...'
    # rank tids by how much the share count grew inside the window
    results.sort(key=lambda x: (x['stats_share']['max'] - x['stats_share']['min']), reverse=True)
    tid_list = [item['key'] for item in results
                if (item['stats_share']['max'] - item['stats_share']['min']) >= HOT_LOWWER]
    if len(tid_list) < TOP_HOT_FB:
        tid_list_2 = [item['key'] for item in results_origin[:TOP_HOT_FB - len(tid_list)]]
        tid_list.extend(tid_list_2)
    print 'all tid_list over...'
    print 'len..tid_list...', tid_list
    return tid_list, end_ts
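# count_statis() ranks tweets by the growth of their 'share' count within the window:
# a terms aggregation on tid carries a stats sub-aggregation on share, then the buckets
# are re-sorted client-side on (max - min). A tiny sketch of that post-processing on
# already-fetched buckets (threshold and padding behaviour are simplified):
def rank_buckets_by_growth(buckets, lower_bound=0, top_n=10):
    """buckets: ES terms-agg buckets, each carrying a 'stats_share' stats sub-agg."""
    def growth(bucket):
        stats = bucket['stats_share']
        return stats['max'] - stats['min']
    ranked = sorted(buckets, key=growth, reverse=True)
    hot = [b['key'] for b in ranked if growth(b) >= lower_bound]
    # pad with the original ordering if not enough "hot" items were found
    if len(hot) < top_n:
        hot.extend(b['key'] for b in buckets[:top_n - len(hot)])
    return hot

# usage sketch, reusing this module's constants:
# tid_list = rank_buckets_by_growth(buckets, lower_bound=HOT_LOWWER, top_n=TOP_HOT_FB)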
def create_event_warning(xnr_user_no, today_datetime, write_mark): #获取事件名称 hashtag_list = get_hashtag(today_datetime) #print 'hashtag_list/:',hashtag_list facebook_flow_text_index_name = get_timets_set_indexset_list( facebook_flow_text_index_name_pre, today_datetime, today_datetime) #虚拟人的好友列表 friends_list = lookup_xnr_friends(xnr_user_no) event_warming_list = [] for event_item in hashtag_list: event_warming_content = dict() #事件名称、主要参与用户、典型微博、事件影响力、事件平均时间 event_warming_content['event_name'] = event_item['event_name'] event_influence_sum = 0 event_time_sum = 0 query_body = { 'query': { 'filtered': { 'filter': { 'bool': { 'must': [{ 'term': { 'hashtag': event_item['event_name'] } }, { 'range': { 'sensitive': { 'gte': 1 } } }] } } } }, 'size': MAX_WARMING_SIZE, 'sort': { 'sensitive': { 'order': 'desc' } } } event_results = es_xnr.search(index=facebook_flow_text_index_name, doc_type=facebook_flow_text_index_type, body=query_body)['hits']['hits'] if event_results: facebook_result = [] friends_num_dict = dict() alluser_num_dict = dict() #print 'sencond_time:::',int(time.time()) for item in event_results: #查询三个指标字段 fid_result = lookup_fid_attend_index(item['_source']['fid'], today_datetime) if fid_result: item['_source']['comment'] = fid_result['comment'] item['_source']['share'] = fid_result['share'] item['_source']['favorite'] = fid_result['favorite'] else: item['_source']['comment'] = 0 item['_source']['share'] = 0 item['_source']['favorite'] = 0 #print 'event_content:',item['_source']['text'] #统计用户信息 if alluser_num_dict.has_key(str(item['_source']['uid'])): friends_mark = set_intersection(item['_source']['uid'], friends_list) if friends_mark > 0: alluser_num_dict[str( item['_source']['uid'])] = alluser_num_dict[str( item['_source']['uid'])] + 1 * 2 else: alluser_num_dict[str( item['_source']['uid'])] = alluser_num_dict[str( item['_source']['uid'])] + 1 else: alluser_num_dict[str(item['_source']['uid'])] = 1 #计算影响力 origin_influence_value = (1 + item['_source']['comment'] + item['_source']['share'] + item['_source']['favorite']) * ( 1 + item['_source']['sensitive']) friends_value = judge_user_type(item['_source']['uid'], friends_list) item['_source'][ 'facebook_influence_value'] = origin_influence_value * friends_value #查询用户昵称 item['_source']['nick_name'] = get_user_nickname( item['_source']['uid']) facebook_result.append(item['_source']) #统计影响力、时间 event_influence_sum = event_influence_sum + item['_source'][ 'facebook_influence_value'] event_time_sum = event_time_sum + item['_source']['timestamp'] # print 'third_time:::',int(time.time()) #典型信息 facebook_result.sort(key=lambda k: (k.get('facebook_influence_value', 0)), reverse=True) event_warming_content['main_facebook_info'] = json.dumps( facebook_result) #事件影响力和事件时间 number = len(event_results) event_warming_content[ 'event_influence'] = event_influence_sum / number event_warming_content['event_time'] = event_time_sum / number #对用户进行排序 alluser_num_dict = sorted(alluser_num_dict.items(), key=lambda d: d[1], reverse=True) main_userid_list = [] for i in xrange(0, len(alluser_num_dict)): main_userid_list.append(alluser_num_dict[i][0]) #主要参与用户信息 main_user_info = [] user_es_result = es_xnr.mget(index=facebook_user_index_name, doc_type=facebook_user_index_type, body={'ids': main_userid_list})['docs'] # print 'user_es_result:',user_es_result for item in user_es_result: user_dict = dict() if item['found']: user_dict['uid'] = item['_id'] user_dict['username'] = item['_source']['username'] if item['_source'].has_key('talking_about_count'): 
user_dict['talking_about_count'] = item['_source'][ 'talking_about_count'] else: user_dict['talking_about_count'] = 0 if item['_source'].has_key('likes'): user_dict['likes'] = item['_source']['likes'] else: user_dict['likes'] = 0 if item['_source'].has_key('category'): user_dict['category'] = item['_source']['category'] else: user_dict['category'] = '' else: # user_dict['icon']='' user_dict['uid'] = item['_id'] user_dict['username'] = '' user_dict['talking_about_count'] = 0 user_dict['likes'] = 0 user_dict['category'] = '' main_user_info.append(user_dict) event_warming_content['main_user_info'] = json.dumps( main_user_info) # print 'fourth_time:::',int(time.time()) event_warming_content['xnr_user_no'] = xnr_user_no event_warming_content['validity'] = 0 event_warming_content['timestamp'] = today_datetime now_time = int(time.time()) # task_id=xnr_user_no+'_'+str(now_time) task_id = xnr_user_no + '_' + event_warming_content['event_name'] #写入数据库 if write_mark: # print 'today_datetime:::',ts2datetime(today_datetime) print 'task_id_event:', task_id mark = write_envent_warming(today_datetime, event_warming_content, task_id) event_warming_list.append(mark) else: event_warming_list.append(event_warming_content) else: pass # print 'fifth_time:::',int(time.time()) return event_warming_list
def create_personal_warning(xnr_user_no, today_datetime): #查询好友列表 friends_list = lookup_xnr_friends(xnr_user_no) #查询虚拟人uid xnr_uid = lookup_xnr_uid(xnr_user_no) #计算敏感度排名靠前的用户 query_body = { # 'query':{ # 'filtered':{ # 'filter':{ # 'terms':{'uid':friends_list} # } # } # }, 'aggs': { 'friends_sensitive_num': { 'terms': { 'field': 'uid' }, 'aggs': { 'sensitive_num': { 'sum': { 'field': 'sensitive' } } } } }, 'size': MAX_SEARCH_SIZE } facebook_flow_text_index_name = get_timets_set_indexset_list( facebook_flow_text_index_name_pre, today_datetime, today_datetime) try: first_sum_result=es_xnr.search(index=facebook_flow_text_index_name,doc_type=facebook_flow_text_index_type,\ body=query_body)['aggregations']['friends_sensitive_num']['buckets'] except: first_sum_result = [] #print 'first_sum_result',first_sum_result top_userlist = [] for i in xrange(0, len(first_sum_result)): user_sensitive = first_sum_result[i]['sensitive_num']['value'] if user_sensitive > 0: user_dict = dict() user_dict['uid'] = first_sum_result[i]['key'] friends_mark = judge_user_type(user_dict['uid'], friends_list) user_dict['sensitive'] = user_sensitive * friends_mark top_userlist.append(user_dict) else: pass ##################### #如果是好友,则用户敏感度计算值增加1.5倍 ##################### #查询敏感用户的敏感内容 results = [] for user in top_userlist: #print user user_detail = dict() user_detail['uid'] = user['uid'] user_detail['user_sensitive'] = user['sensitive'] user_lookup_id = user['uid'] print user_lookup_id # try: # #user_result=es_xnr.get(index=facebook_feedback_friends_index_name,doc_type=facebook_feedback_friends_index_type,id=user_lookup_id)['_source'] # user_result=es_xnr.get(index=facebook_user_index_name,doc_type=facebook_user_index_type,id=user['uid'])['_source'] # user_detail['user_name']=user_result['nick_name'] # except: # user_detail['user_name']='' user_detail['user_name'] = get_user_nickname(user['uid']) query_body = { 'query': { 'filtered': { 'filter': { 'bool': { 'must': [{ 'term': { 'uid': user['uid'] } }, { 'range': { 'sensitive': { 'gte': 1 } } }] } } } }, 'size': MAX_WARMING_SIZE, 'sort': { 'sensitive': { 'order': 'desc' } } } try: second_result = es_xnr.search( index=facebook_flow_text_index_name, doc_type=facebook_flow_text_index_type, body=query_body)['hits']['hits'] except: second_result = [] s_result = [] for item in second_result: #查询三个指标字段 fid_result = lookup_fid_attend_index(item['_source']['fid'], today_datetime) if fid_result: item['_source']['comment'] = fid_result['comment'] item['_source']['share'] = fid_result['share'] item['_source']['favorite'] = fid_result['favorite'] else: item['_source']['comment'] = 0 item['_source']['share'] = 0 item['_source']['favorite'] = 0 #查询用户昵称 item['_source']['nick_name'] = get_user_nickname( item['_source']['uid']) s_result.append(item['_source']) s_result.sort(key=lambda k: (k.get('sensitive', 0)), reverse=True) user_detail['content'] = json.dumps(s_result) user_detail['xnr_user_no'] = xnr_user_no user_detail['validity'] = 0 user_detail['timestamp'] = today_datetime #写入数据库 today_date = ts2datetime(today_datetime) facebook_user_warning_index_name = facebook_user_warning_index_name_pre + today_date task_id = xnr_user_no + '_' + user_detail['uid'] if s_result: try: es_xnr.index(index=facebook_user_warning_index_name, doc_type=facebook_user_warning_index_type, body=user_detail, id=task_id) mark = True except: mark = False else: pass results.append(mark) return results
def create_speech_warning(xnr_user_no, today_datetime): #查询好友列表 friends_list = lookup_xnr_friends(xnr_user_no) query_body = { 'query': { 'filtered': { 'filter': { 'bool': { 'must': { 'range': { 'sensitive': { 'gte': 1 } } } } } } }, 'size': MAX_SEARCH_SIZE, 'sort': { 'sensitive': { 'order': 'desc' } } } facebook_flow_text_index_name = get_timets_set_indexset_list( facebook_flow_text_index_name_pre, today_datetime, today_datetime) #print facebook_flow_text_index_name results = es_xnr.search(index=facebook_flow_text_index_name, doc_type=facebook_flow_text_index_type, body=query_body)['hits']['hits'] #print results result = [] for item in results: if item['_source']['uid'] in friends_list: item['_source']['content_type'] = 'friends' else: item['_source']['content_type'] = 'unfriends' item['_source']['validity'] = 0 item['_source']['xnr_user_no'] = xnr_user_no #查询三个指标字段 fid_result = lookup_fid_attend_index(item['_source']['fid'], today_datetime) if fid_result: item['_source']['comment'] = fid_result['comment'] item['_source']['share'] = fid_result['share'] item['_source']['favorite'] = fid_result['favorite'] else: item['_source']['comment'] = 0 item['_source']['share'] = 0 item['_source']['favorite'] = 0 #查询用户昵称 item['_source']['nick_name'] = get_user_nickname( item['_source']['uid']) task_id = xnr_user_no + '_' + item['_source']['fid'] #写入数据库 today_date = ts2datetime(today_datetime) facebook_speech_warning_index_name = facebook_speech_warning_index_name_pre + today_date #facebook_speech_warning_index_name=facebook_speech_warning_index_name_pre+FACEBOOK_FLOW_START_DATE # try: es_xnr.index(index=facebook_speech_warning_index_name, doc_type=facebook_speech_warning_index_type, body=item['_source'], id=task_id) mark = True # except: # mark=False result.append(mark) return result
def cron_compute_mark_qq(current_time): current_date = ts2datetime(current_time) current_time_new = datetime2ts(current_date) xnr_results = es.search(index=qq_xnr_index_name,doc_type=qq_xnr_index_type,\ body={'query':{'match_all':{}},'size':MAX_SEARCH_SIZE})['hits']['hits'] if S_TYPE == 'test': xnr_results = [{ '_source': { 'xnr_user_no': 'QXNR0007', 'qq_number': '1039598173' } }] for result in xnr_results: print 'result....', result xnr_user_no = result['_source']['xnr_user_no'] qq_number = result['_source']['qq_number'] #xnr_user_no = 'WXNR0004' influence_dict = get_influence_at_num(xnr_user_no, qq_number, current_time) penetration_dict = get_penetration_num(xnr_user_no, qq_number, current_time) safe_dict = qq_history_count(xnr_user_no, qq_number, current_time) #_id = xnr_user_no + '_' + current_date _id = xnr_user_no xnr_user_detail = {} xnr_user_detail['influence'] = influence_dict['mark'] xnr_user_detail['penetration'] = penetration_dict['mark'] xnr_user_detail['safe'] = safe_dict['mark'] xnr_user_detail['daily_be_at_num'] = influence_dict['daily_be_at_num'] xnr_user_detail['total_be_at_num'] = influence_dict['total_be_at_num'] xnr_user_detail['daily_sensitive_num'] = penetration_dict[ 'sensitive_info'] #xnr_user_detail['daily_sensitive_num'] = penetration_dict['daily_sensitive_num'] xnr_user_detail['total_post_num'] = safe_dict['total_post_num'] xnr_user_detail['daily_post_num'] = safe_dict['daily_post_num'] xnr_user_detail['date_time'] = current_date xnr_user_detail['timestamp'] = current_time_new xnr_user_detail['xnr_user_no'] = xnr_user_no xnr_user_detail['qq_number'] = qq_number qq_xnr_history_count_index_name = qq_xnr_history_count_index_name_pre + current_date try: #print 'xnr_user_detail...',xnr_user_detail print 'qq_xnr_history_count_index_name...', qq_xnr_history_count_index_name qq_xnr_history_count_mappings(qq_xnr_history_count_index_name) es.index(index=qq_xnr_history_count_index_name,doc_type=qq_xnr_history_count_index_type,\ id=_id,body=xnr_user_detail) mark = True except: mark = False return mark
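# cron_compute_mark_qq() writes one "history count" snapshot per xnr per day: the index
# name carries the date (qq_xnr_history_count_<date>) and the document id is just the
# xnr_user_no, so re-running the cron on the same day overwrites that day's snapshot.
# A minimal sketch of that convention (the helper below is hypothetical and assumes the
# project's 'YYYY-MM-DD' date suffix):
import time
from datetime import datetime

def daily_snapshot(index_prefix, xnr_user_no, metrics, ts=None):
    ts = ts or int(time.time())
    date = datetime.fromtimestamp(ts).strftime('%Y-%m-%d')
    index_name = index_prefix + date
    doc = dict(metrics)
    doc.update({'xnr_user_no': xnr_user_no, 'date_time': date, 'timestamp': ts})
    return index_name, xnr_user_no, doc   # (index, _id, body) ready for es.index()

# usage sketch:
# index_name, _id, body = daily_snapshot('qq_xnr_history_count_', 'QXNR0007',
#                                        {'influence': 55, 'penetration': 40, 'safe': 80})
# es.index(index=index_name, doc_type='text', id=_id, body=body)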
def query_related_weibo(ts, origin_mid_list, time_segment):
    query_all_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"range": {"timestamp": {"gte": ts - time_segment, "lt": ts}}},
                            {"terms": {"root_mid": origin_mid_list}}
                        ]
                    }
                }
            }
        },
        "aggs": {
            "all_count": {"terms": {"field": "message_type"}}
        }
    }
    return_results = {"origin": 0, "retweeted": 0, "comment": 0}
    datetime_1 = ts2datetime(ts)
    datetime_2 = ts2datetime(ts - 24 * 3600)
    index_name_1 = flow_text_index_name_pre + datetime_1
    index_name_2 = flow_text_index_name_pre + datetime_2
    index_list = []
    exist_es_1 = es_text.indices.exists(index_name_1)
    exist_es_2 = es_text.indices.exists(index_name_2)
    if exist_es_1:
        index_list.append(index_name_1)
    if exist_es_2:
        index_list.append(index_name_2)
    if index_list:
        results = es_text.search(index=index_list, doc_type=flow_text_index_type,
                                 body=query_all_body)['aggregations']['all_count']['buckets']
    else:
        results = []
    if results:
        for item in results:
            if int(item['key']) == 1:
                return_results['origin'] = item['doc_count']
            elif int(item['key']) == 3:
                return_results['retweeted'] = item['doc_count']
            elif int(item['key']) == 2:
                return_results['comment'] = item['doc_count']
            else:
                pass
    return_results['total_count'] = sum(return_results.values())
    return return_results
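# query_related_weibo() folds the message_type aggregation into a fixed-shape dict.
# A usage sketch showing the result it returns (the counts here are made up):
#
# stats = query_related_weibo(ts=1480176000, origin_mid_list=['mid1', 'mid2'], time_segment=3600)
# # stats == {'origin': 12, 'retweeted': 87, 'comment': 34, 'total_count': 133}
# # message_type 1 -> origin, 3 -> retweeted, 2 -> comment; total_count is their sum.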
def compute_recommend_subopnion(task_detail): print '开始分析计算......' task_id = task_detail['task_id'].strip('"') keywords_string = task_detail['keywords_string'] keywords_list = keywords_string.split('&') ## 以 & 切分关键词,得到list xnr_user_no = task_detail['xnr_user_no'] mid = task_detail['mid'] query_item = 'keywords_string' nest_query_list = [] for keyword in keywords_list: nest_query_list.append({'wildcard':{query_item:'*'+keyword+'*'}}) ''' ## 重点关注当前虚拟人的关注用户 if S_TYPE == 'test': # followers_list = get_result['followers_list'] # nest_query_list.append({'terms':followers_list}) print '全部用户' else: get_result = es.get(index=fb_xnr_fans_followers_index_name,doc_type=fb_xnr_fans_followers_index_type,\ id=xnr_user_no)['_source'] followers_list = get_result['followers_list'] nest_query_list.append({'terms':followers_list}) ''' if S_TYPE == 'test': create_time = datetime2ts(S_DATE_FB) else: create_time = datehour2ts(ts2datehour(time.time()-3600)) #fb_get_flow_text_index_list(create_time) #index_name_list_list = fb_get_flow_text_index_list(now_timestamp) index_name_list = fb_get_flow_text_index_list(create_time) print 'index_name_list::',index_name_list es_results = es.search(index=index_name_list,doc_type='text',\ body={'query':{'bool':{'must':nest_query_list}},'size':MAX_SEARCH_SIZE})['hits']['hits'] fb_list = [] ## 内容推荐和子观点分析的输入 if es_results: for item in es_results: item = item['_source'] fb = item['text'] fb_list.append(fb) ## 内容推荐 ## 得到推荐句子列表 #print 'fb_list::::::',fb_list # print '开始内容推荐计算......' # if fb_list: # content_results = summary_main(fb_list) # else: # content_results = [] # print '开始保存内容推荐计算结果......' # mark = save_content_recommendation_results(xnr_user_no,mid,task_id.encode('utf-8'),content_results) # print 'mark_content:::',mark # if mark == False: # print '内容推荐结果保存过程中遇到错误,把计算任务重新push到队列中' # add_task_2_queue(keyword_task_queue_name,task_detail) # else: # print '内容推荐计算结果保存完毕......' ## 子观点分析 ''' 输入: fb_data:微博列表,[fb1,fb2,...] k_cluster:子话题个数 (默认为5) 输出: opinion_name:子话题名称字典,{topic1:name1,topic2:name2,...} word_result:子话题关键词对,{topic1:[w1,w2,...],topic2:[w1,w2,...],...} text_list:子话题对应的文本,{topic1:[text1,text2,...],topic2:[text1,text2,..],..} ''' print '开始子观点计算......' if fb_list: opinion_name,word_result,text_list = opinion_main(fb_list,k_cluster=5) sub_opinion_results = dict() for topic, text in text_list.iteritems(): topic_name = opinion_name[topic] sub_opinion_results[topic_name] = text[:SUB_OPINION_WEIBO_LIMIT] else: sub_opinion_results = {} print '开始保存子观点计算结果......' mark = save_subopnion_results(xnr_user_no,mid,task_id,sub_opinion_results) print 'mark_opinion:::',mark if mark == False: print '子观点计算结果保存过程中遇到错误,把计算任务重新push到队列中' add_task_2_queue(keyword_task_queue_name,task_detail) else: print '子观点计算结果保存完毕......'
def query_hot_weibo(ts, origin_mid_list, time_segment): query_all_body = { "query": { "filtered": { "filter": { "bool": { "must": [{ "range": { "timestamp": { "gte": ts - time_segment, "lt": ts } } }, { "terms": { "root_mid": origin_mid_list } }] } } } }, "aggs": { "all_mid": { "terms": { "field": "root_mid", "size": 400 }, "aggs": { "message_type": { "terms": { "field": "message_type" } } } } } } return_results = dict() datetime_1 = ts2datetime(ts) datetime_2 = ts2datetime(ts - 24 * 3600) index_name_1 = flow_text_index_name_pre + datetime_1 index_name_2 = flow_text_index_name_pre + datetime_2 index_list = [] exist_es_1 = es_text.indices.exists(index_name_1) exist_es_2 = es_text.indices.exists(index_name_2) if exist_es_1: index_list.append(index_name_1) if exist_es_2: index_list.append(index_name_2) index_list.append(flow_text_index_name_pre + ts2datetime(ts - 2 * 24 * 3600)) if index_list: results = es_text.search( index=index_list, doc_type=flow_text_index_type, body=query_all_body)['aggregations']['all_mid']['buckets'] if results: for item in results: temp_dict = dict() temp_dict[item['key']] = item['doc_count'] detail = item['message_type']['buckets'] detail_dict = dict() for iter_item in detail: detail_dict[iter_item['key']] = iter_item['doc_count'] temp_dict['retweeted'] = detail_dict.get(3, 0) temp_dict['comment'] = detail_dict.get(2, 0) return_results[item['key']] = temp_dict else: for item in origin_mid_list: temp_dict = dict() temp_dict[item] = 0 temp_dict['retweeted'] = 0 temp_dict['comment'] = 0 return_results[item] = temp_dict return return_results
def get_xnr_trace_community_detail(xnr_user_no, date_time): query_body = { 'query': { 'filtered': { 'filter': { 'bool': { 'must': [{ 'term': { 'xnr_user_no': xnr_user_no } }, { 'terms': { 'community_status': [0, 1, -2] } }, { 'range': { 'trace_time': { 'lt': date_time } } }] } } } }, 'size': 7, 'sort': { 'trace_time': { 'order': 'desc' } } } trace_index_name = weibo_trace_community_index_name_pre + xnr_user_no.lower( ) trace_community_detail = dict() if es_xnr.indices.exists(index=trace_index_name): trace_result = es_xnr.search(index=trace_index_name, doc_type=weibo_trace_community_index_type, body=query_body)['hits']['hits'] len_num = len(trace_result) total_num = 0 cluster_sum = 0 density_sum = 0 mean_influence_sum = 0 mean_sensitive_sum = 0 if len_num > 0: for item in trace_result: total_num = total_num + item['_source']['num'] cluster_sum = cluster_sum + item['_source']['cluster'] density_sum = density_sum + item['_source']['density'] mean_influence_sum = mean_influence_sum + item['_source'][ 'mean_influence'] mean_sensitive_sum = mean_sensitive_sum + item['_source'][ 'mean_sensitive'] trace_community_detail['min_num'] = (total_num / len_num) * 0.5 trace_community_detail['max_num'] = (total_num / len_num) * 1.5 trace_community_detail['cluster'] = (cluster_sum / len_num) * 0.75 trace_community_detail['density'] = (density_sum / len_num) * 0.75 trace_community_detail['mean_influence'] = (mean_influence_sum / len_num) * 0.5 trace_community_detail['mean_sensitive'] = (mean_sensitive_sum / len_num) * 0.5 else: trace_community_detail['min_num'] = MIN_COMMUNITY_NUM trace_community_detail['max_num'] = MAX_COMMUNITY_NUM trace_community_detail['cluster'] = COMMUNITY_DENSITY_CLUSTER trace_community_detail['density'] = COMMUNITY_DENSITY_CLUSTER trace_community_detail[ 'mean_influence'] = MIN_MEAN_COMMUNITY_INFLUENCE trace_community_detail[ 'mean_sensitive'] = MIN_MEAN_COMMUNITY_SENSITIVE else: trace_community_detail['min_num'] = MIN_COMMUNITY_NUM trace_community_detail['max_num'] = MAX_COMMUNITY_NUM trace_community_detail['cluster'] = COMMUNITY_DENSITY_CLUSTER trace_community_detail['density'] = COMMUNITY_DENSITY_CLUSTER trace_community_detail['mean_influence'] = MIN_MEAN_COMMUNITY_INFLUENCE trace_community_detail['mean_sensitive'] = MIN_MEAN_COMMUNITY_SENSITIVE return trace_community_detail
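# get_xnr_trace_community_detail() derives next-round community thresholds from the
# averages of (up to) the last 7 traced snapshots: +/-50% around the average size, 75%
# of the average cluster/density, and 50% of the average influence/sensitivity, falling
# back to configured defaults when no history exists. A small sketch of that derivation
# on plain dicts (float averaging is used here, unlike the integer division above):
def derive_trace_thresholds(snapshots, defaults):
    """snapshots: dicts with num/cluster/density/mean_influence/mean_sensitive fields."""
    if not snapshots:
        return dict(defaults)
    n = float(len(snapshots))
    avg = lambda field: sum(s[field] for s in snapshots) / n
    return {
        'min_num': avg('num') * 0.5,
        'max_num': avg('num') * 1.5,
        'cluster': avg('cluster') * 0.75,
        'density': avg('density') * 0.75,
        'mean_influence': avg('mean_influence') * 0.5,
        'mean_sensitive': avg('mean_sensitive') * 0.5,
    }

# usage sketch, reusing this module's default constants:
# derive_trace_thresholds(history,
#                         defaults={'min_num': MIN_COMMUNITY_NUM, 'max_num': MAX_COMMUNITY_NUM,
#                                   'cluster': COMMUNITY_DENSITY_CLUSTER, 'density': COMMUNITY_DENSITY_CLUSTER,
#                                   'mean_influence': MIN_MEAN_COMMUNITY_INFLUENCE,
#                                   'mean_sensitive': MIN_MEAN_COMMUNITY_SENSITIVE})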
def aggregation_sentiment_related_weibo(ts, origin_mid_list, time_segment, message_type=1, uid_list=[]): if message_type == 1: query_all_body = { "query": { "filtered": { "filter": { "bool": { "must": [{ "range": { "timestamp": { "gte": ts - time_segment, "lt": ts } } }, { "terms": { "root_mid": origin_mid_list } }] } } } }, "aggs": { "all_sentiments": { "terms": { "field": "sentiment" } } } } else: query_all_body = { "query": { "filtered": { "filter": { "bool": { "must": [{ "range": { "timestamp": { "gte": ts - time_segment, "lt": ts } } }, { "terms": { "root_mid": origin_mid_list } }, { "terms": { "directed_uid": uid_list } }] } } } }, "aggs": { "all_sentiments": { "terms": { "field": "sentiment" } } } } results = {"0": 0, "1": 0, "2": 0, "3": 0, "4": 0, "5": 0, "6": 0} datetime_1 = ts2datetime(ts) datetime_2 = ts2datetime(ts - 24 * 3600) index_name_1 = flow_text_index_name_pre + datetime_1 index_name_2 = flow_text_index_name_pre + datetime_2 index_list = [] exist_es_1 = es_text.indices.exists(index_name_1) exist_es_2 = es_text.indices.exists(index_name_2) if exist_es_1: index_list.append(index_name_1) if exist_es_2: index_list.append(index_name_2) if index_list: search_results = es_text.search( index=index_list, doc_type=flow_text_index_type, body=query_all_body)['aggregations']['all_sentiments']['buckets'] if search_results: for item in search_results: key = item['key'] count = item['doc_count'] results[key] = count #print "results: ", results, sum(results.values()) return results
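# aggregation_sentiment_related_weibo() returns a fixed dict of sentiment code -> count,
# keyed by the string form of the code. A usage sketch (counts are made up):
#
# sentiment_counts = aggregation_sentiment_related_weibo(ts, origin_mid_list, 3600)
# # sentiment_counts == {'0': 120, '1': 45, '2': 10, '3': 3, '4': 0, '5': 1, '6': 7}
# # note: this assumes the terms-agg bucket keys come back as strings; if the sentiment
# # field is mapped as a number, the keys would need str(item['key']) before assignment.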
def newest_time_func(uid):
    query_body = {
        'query': {'term': {'root_uid': uid}},
        'sort': {'timestamp': {'order': 'desc'}}
    }
    try:
        weibo_feedback_retweet_index_name = weibo_feedback_retweet_index_name_pre + '*'
        timestamp_retweet = es.search(index=weibo_feedback_retweet_index_name, doc_type=weibo_feedback_retweet_index_type,
                                      body=query_body)['hits']['hits'][0]['_source']['timestamp']
    except:
        timestamp_retweet = 0
    try:
        weibo_feedback_like_index_name = weibo_feedback_like_index_name_pre + '*'
        timestamp_like = es.search(index=weibo_feedback_like_index_name, doc_type=weibo_feedback_like_index_type,
                                   body=query_body)['hits']['hits'][0]['_source']['timestamp']
    except:
        timestamp_like = 0
    #timestamp_follow = es.search(index=weibo_feedback_follow_index_name, doc_type=weibo_feedback_follow_index_type,
    #                             body=query_body)['hits']['hits'][0]['_source']['timestamp']
    #timestamp_fans = es.search(index=weibo_feedback_fans_index_name, doc_type=weibo_feedback_fans_index_type,
    #                           body=query_body)['hits']['hits'][0]['_source']['timestamp']
    try:
        weibo_feedback_at_index_name = weibo_feedback_at_index_name_pre + '*'
        timestamp_at = es.search(index=weibo_feedback_at_index_name, doc_type=weibo_feedback_at_index_type,
                                 body=query_body)['hits']['hits'][0]['_source']['timestamp']
    except:
        timestamp_at = 0
    query_body_private = {
        'query': {'bool': {'must': [{'term': {'root_uid': uid}}]}},
        'sort': {'timestamp': {'order': 'desc'}}
    }
    try:
        weibo_feedback_private_index_name = weibo_feedback_private_index_name_pre + '*'
        # fix: use the private-message query body instead of the generic one
        timestamp_private = es.search(index=weibo_feedback_private_index_name, doc_type=weibo_feedback_private_index_type,
                                      body=query_body_private)['hits']['hits'][0]['_source']['timestamp']
    except:
        timestamp_private = 0
    '''
    query_body_private_make = {
        'query': {'bool': {'must': [{'term': {'root_uid': uid}}, {'term': {'private_type': 'make'}}]}},
        'sort': {'timestamp': {'order': 'desc'}}
    }
    timestamp_private_make = es.search(index=weibo_feedback_private_index_name, doc_type=weibo_feedback_private_index_type,
                                       body=query_body_private_make)['hits']['hits'][0]['_source']['timestamp']
    '''
    query_body_comment_receive = {
        'query': {'bool': {'must': [{'term': {'root_uid': uid}}, {'term': {'comment_type': 'receive'}}]}},
        'sort': {'timestamp': {'order': 'desc'}}
    }
    try:
        weibo_feedback_comment_index_name = weibo_feedback_comment_index_name_pre + '*'
        # fix: filter on comment_type='receive' as the dedicated query body intends
        timestamp_comment_receive = es.search(index=weibo_feedback_comment_index_name, doc_type=weibo_feedback_comment_index_type,
                                              body=query_body_comment_receive)['hits']['hits'][0]['_source']['timestamp']
    except:
        timestamp_comment_receive = 0
    query_body_comment_make = {
        'query': {'bool': {'must': [{'term': {'root_uid': uid}}, {'term': {'comment_type': 'make'}}]}},
        'sort': {'timestamp': {'order': 'desc'}}
    }
    try:
        # fix: filter on comment_type='make' as the dedicated query body intends
        timestamp_comment_make = es.search(index=weibo_feedback_comment_index_name, doc_type=weibo_feedback_comment_index_type,
                                           body=query_body_comment_make)['hits']['hits'][0]['_source']['timestamp']
    except:
        timestamp_comment_make = 0
    return timestamp_retweet, timestamp_like, timestamp_at, \
        timestamp_private, timestamp_comment_receive, timestamp_comment_make
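# newest_time_func() repeats the same pattern six times: search a wildcard index with a
# descending sort on timestamp, take hit [0], and fall back to 0 when nothing matches.
# A generic helper expressing that pattern once (index/type names are placeholders);
# asking ES for size=1 keeps the response minimal:
from elasticsearch import Elasticsearch

def latest_timestamp(es, index_pattern, doc_type, must_terms):
    body = {
        'query': {'bool': {'must': [{'term': t} for t in must_terms]}},
        'sort': {'timestamp': {'order': 'desc'}},
        'size': 1,
    }
    try:
        hits = es.search(index=index_pattern, doc_type=doc_type, body=body)['hits']['hits']
        return hits[0]['_source']['timestamp']
    except Exception:
        return 0

# usage sketch:
# es = Elasticsearch(['http://localhost:9200'])
# latest_timestamp(es, 'weibo_feedback_comment_*', 'text',
#                  [{'root_uid': 'some_uid'}, {'comment_type': 'receive'}])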
def social_sensing(task_detail): ''' with open("prediction_uid.pkl", "r") as f: uid_model = pickle.load(f) with open("prediction_weibo.pkl", "r") as f: weibo_model = pickle.load(f) ''' # 任务名 传感器 终止时间 之前状态 创建者 时间 task_name = task_detail[0] social_sensors = task_detail[1] #ts = int(task_detail[2]) ts = float(task_detail[2]) xnr_user_no = task_detail[3] print ts2date(ts) index_list = [] important_words = [] datetime_1 = ts2datetime(ts) index_name_1 = flow_text_index_name_pre + datetime_1 exist_es = es_text.indices.exists(index=index_name_1) if exist_es: index_list.append(index_name_1) datetime_2 = ts2datetime(ts - DAY) index_name_2 = flow_text_index_name_pre + datetime_2 exist_es = es_text.indices.exists(index=index_name_2) if exist_es: index_list.append(index_name_2) if es_text.indices.exists(index=flow_text_index_name_pre + ts2datetime(ts - 2 * DAY)): index_list.append(flow_text_index_name_pre + ts2datetime(ts - 2 * DAY)) # PART 1 #forward_result = get_forward_numerical_info(task_name, ts, create_by) # 之前时间阶段内的原创微博list/retweeted forward_origin_weibo_list, forward_1 = query_mid_list( ts - time_interval, social_sensors, forward_time_range) forward_retweeted_weibo_list, forward_3 = query_mid_list( ts - time_interval, social_sensors, forward_time_range, 3) # 当前阶段内原创微博list current_mid_list, current_1 = query_mid_list(ts, social_sensors, time_interval) current_retweeted_mid_list, current_3 = query_mid_list( ts, social_sensors, time_interval, 3) all_mid_list = [] all_mid_list.extend(current_mid_list) all_mid_list.extend(current_retweeted_mid_list) all_mid_list.extend(forward_origin_weibo_list) all_mid_list.extend(forward_retweeted_weibo_list) all_origin_list = [] all_origin_list.extend(current_mid_list) all_origin_list.extend(forward_origin_weibo_list) all_origin_list = list(set(all_origin_list)) all_retweeted_list = [] all_retweeted_list.extend(current_retweeted_mid_list) all_retweeted_list.extend( forward_retweeted_weibo_list) #被转发微博的mid/root-mid all_retweeted_list = list(set(all_retweeted_list)) all_mid_list = filter_mid(all_mid_list) all_origin_list = filter_mid(all_origin_list) all_retweeted_list = filter_mid(all_retweeted_list) print "all mid list: ", len(all_mid_list) print "all_origin_list", len(all_origin_list) print "all_retweeted_list", len(all_retweeted_list) # 查询微博在当前时间内的转发和评论数, 聚合按照message_type #statistics_count = query_related_weibo(ts, all_mid_list, time_interval) if all_origin_list: #origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval) # 原创微博详情 origin_weibo_detail = dict() for mid in all_origin_list: retweet_count = es_text.count( index=index_list, doc_type="text", body={"query": { "bool": { "must": [{ "term": { "fid": mid } }] } }})["count"] comment_count = es_text.count( index=index_list, doc_type="text", body={"query": { "bool": { "must": [{ "term": { "fid": mid } }] } }})["count"] tmp = dict() tmp["retweeted"] = retweet_count tmp["comment"] = comment_count origin_weibo_detail[mid] = tmp else: origin_weibo_detail = {} print "len(origin_weibo_detail): ", len(origin_weibo_detail) if all_retweeted_list: retweeted_weibo_detail = dict() for mid in all_retweeted_list: retweet_count = es_text.count(index=index_list, doc_type="text", body={ "query": { "bool": { "must": [{ "term": { "root_mid": mid } }, { "term": { "message_type": 3 } }] } } })["count"] comment_count = es_text.count(index=index_list, doc_type="text", body={ "query": { "bool": { "must": [{ "term": { "root_mid": mid } }, { "term": { "message_type": 2 } }] } } })["count"] tmp = dict() tmp["retweeted"] 
= retweet_count tmp["comment"] = comment_count retweeted_weibo_detail[mid] = tmp #retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval) # 转发微博详情 else: retweeted_weibo_detail = {} print "len(retweeted_weibo_detail): ", len(retweeted_weibo_detail) #current_total_count = statistics_count['total_count'] # 当前阶段内所有微博总数 #current_retweeted_count = statistics_count['retweeted'] #current_comment_count = statistics_count['comment'] #all_mid_list = list(set(all_origin_list[:100]) | set(all_retweeted_list[:100])) # 感知到的事, all_mid_list sensitive_text_list = [] tmp_sensitive_warning = "" text_dict = dict() # 文本信息 mid_value = dict() # 文本赋值 duplicate_dict = dict() # 重合字典 portrait_dict = dict() # 背景信息 classify_text_dict = dict() # 分类文本 classify_uid_list = [] duplicate_text_list = [] sensitive_words_dict = dict() sensitive_weibo_detail = {} trendline_dict = dict() all_text_dict = dict() # 有事件发生时开始 if 1: print "index_list:", index_list if index_list and all_mid_list: query_body = { "query": { "filtered": { "filter": { "terms": { "mid": all_mid_list } } } }, "size": 5000 } search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits'] print "search mid len: ", len(search_results) tmp_sensitive_warning = "" text_dict = dict() # 文本信息 mid_value = dict() # 文本赋值 duplicate_dict = dict() # 重合字典 portrait_dict = dict() # 背景信息 classify_text_dict = dict() # 分类文本 #classify_uid_list = [] classify_mid_list = [] duplicate_text_list = [] sensitive_words_dict = dict() mid_ts_dict = dict() # 文本发布时间 uid_prediction_dict = dict() weibo_prediction_dict = dict() trendline_dict = dict() feature_prediction_list = [] # feature mid_prediction_list = [] # dui ying mid if search_results: for item in search_results: iter_uid = item['_source']['uid'] iter_mid = item['_source']['mid'] mid_ts_dict[iter_mid] = item["_source"]["timestamp"] iter_text = item['_source']['text'].encode( 'utf-8', 'ignore') iter_sensitive = item['_source'].get('sensitive', 0) tmp_text = get_weibo(item['_source']) all_text_dict[iter_mid] = tmp_text duplicate_text_list.append({ "_id": iter_mid, "title": "", "content": iter_text.decode("utf-8", 'ignore') }) if iter_sensitive: tmp_sensitive_warning = signal_sensitive_variation #涉及到敏感词的微博 sensitive_words_dict[iter_mid] = iter_sensitive keywords_dict = json.loads( item['_source']['keywords_dict']) personal_keywords_dict = dict() for k, v in keywords_dict.iteritems(): k = k.encode('utf-8', 'ignore') personal_keywords_dict[k] = v classify_text_dict[iter_mid] = personal_keywords_dict #classify_uid_list.append(iter_uid) classify_mid_list.append(iter_mid) # 去重 print "start duplicate" if duplicate_text_list: dup_results = duplicate(duplicate_text_list) for item in dup_results: if item['duplicate']: duplicate_dict[item['_id']] = item['same_from'] # 分类 print "start classify" mid_value = dict() if classify_text_dict: #classify_results = topic_classfiy(classify_uid_list, classify_text_dict) classify_results = topic_classfiy(classify_mid_list, classify_text_dict) #print "classify_results: ", classify_results for k, v in classify_results.iteritems(): # mid:value #mid_value[k] = topic_value_dict[v[0]] mid_value[k] = v[0] #feature_list = organize_feature(k, mid_ts_dict[k]) #feature_prediction_list.append(feature_list) # feature list #mid_prediction_list.append(k) # corresponding # prediction """ print "start prediction" weibo_prediction_result = weibo_model.predict(feature_prediction_list) uid_prediction_result = uid_model.predict(feature_prediction_list) for i in 
range(len(mid_prediction_list)): if i % 100 == 0: print i uid_prediction_dict[mid_prediction_list[i]] = uid_prediction_result[i] weibo_prediction_dict[mid_prediction_list[i]] = weibo_prediction_result[i] tmp_trendline = trendline_list(mid_prediction_list[i], weibo_prediction_result[i], mid_ts_dict[mid_prediction_list[i]]) trendline_dict[mid_prediction_list[i]] = tmp_trendline """ # organize data mid_list = all_text_dict.keys() print "final mid:", len(mid_list) print "intersection: ", len(set(mid_list) & set(all_mid_list)) bulk_action = [] count = 0 for mid in mid_list: iter_dict = dict() if origin_weibo_detail.has_key(mid): iter_dict.update(origin_weibo_detail[mid]) iter_dict["type"] = 1 elif retweeted_weibo_detail.has_key(mid): iter_dict.update(retweeted_weibo_detail[mid]) iter_dict["type"] = 3 else: iter_dict["retweeted"] = 0 iter_dict["comment"] = 0 print "mid in all_mid_list: ", mid in set(all_mid_list) #iter_dict["trendline"] = json.dumps(trendline_dict[mid]) if duplicate_dict.has_key(mid): iter_dict["duplicate"] = duplicate_dict[mid] else: iter_dict["duplicate"] = "" #iter_dict["uid_prediction"] = uid_prediction_dict[mid] #iter_dict["weibo_prediction"] = weibo_prediction_dict[mid] iter_dict["compute_status"] = 0 # 尚未计算 iter_dict["topic_field"] = mid_value[mid] iter_dict["detect_ts"] = ts iter_dict["xnr_user_no"] = xnr_user_no iter_dict.update(all_text_dict[mid]) count += 1 print 'iter_dict:::', iter_dict _id = xnr_user_no + '_' + mid bulk_action.extend([{"index": {"_id": _id}}, iter_dict]) if count % 500 == 0: es_xnr.bulk(bulk_action, index="social_sensing_text", doc_type="text", timeout=600) bulk_action = [] if bulk_action: es_xnr.bulk(bulk_action, index="social_sensing_text", doc_type="text", timeout=600) return "1"
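# Hedged sketch (not part of the original module): the per-mid counting pattern
# used in social_sensing(), factored into a helper. It assumes the flow_text
# schema the retweeted branch above relies on: retweets carry root_mid with
# message_type 3 and comments carry root_mid with message_type 2.
def count_interactions(index_list, mid):
    def _count(message_type):
        query_body = {"query": {"bool": {"must": [
            {"term": {"root_mid": mid}},
            {"term": {"message_type": message_type}}
        ]}}}
        return es_text.count(index=index_list, doc_type="text", body=query_body)["count"]
    return {"retweeted": _count(3), "comment": _count(2)}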
def get_un_trace_follow_operate(xnr_user_no, uid_string, nick_name_string):
    mark = False
    fail_nick_name_list = []
    fail_uids = []
    uid_list = []
    if uid_string:
        uid_list = uid_string.encode('utf-8').split(',')
    elif nick_name_string:
        nick_name_list = nick_name_string.encode('utf-8').split(',')
        for nick_name in nick_name_list:
            query_body = {
                'query': {'filtered': {'filter': {'term': {'nick_name': nick_name}}}},
                '_source': ['uid']
            }
            try:
                uid_results = es.search(index=facebook_user_index_name, doc_type=facebook_user_index_type,
                                        body=query_body)['hits']['hits']
                uid_result = uid_results[0]['_source']
                uid_list.append(uid_result['uid'])
            except:
                fail_nick_name_list.append(nick_name)
    try:
        result = es.get(index=fb_xnr_fans_followers_index_name, doc_type=fb_xnr_fans_followers_index_type,
                        id=xnr_user_no)['_source']
        trace_follow_list = result['trace_follow_list']
        # uids that are currently being traced
        common_uids = list(set(trace_follow_list).intersection(set(uid_list)))
        # uids that cannot be un-traced because they are not in the trace list
        fail_uids = list(set(uid_list).difference(set(common_uids)))
        # drop the requested uids from the trace list
        trace_follow_list = list(set(trace_follow_list).difference(set(uid_list)))
        es.update(index=fb_xnr_fans_followers_index_name, doc_type=fb_xnr_fans_followers_index_type,
                  id=xnr_user_no, body={'doc': {'trace_follow_list': trace_follow_list}})
        mark = True
    except:
        mark = False
    return [mark, fail_uids, fail_nick_name_list]
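# Hedged usage sketch: un-trace two Facebook accounts by uid. The xnr id and
# uids below are placeholders. mark is True when the trace_follow_list update
# succeeded; fail_uids are the requested uids that were not being traced.
def _example_un_trace():
    mark, fail_uids, fail_nick_names = get_un_trace_follow_operate(
        'FXNR0001', 'fb_uid_1,fb_uid_2', '')
    print 'update ok:', mark, 'not traced:', fail_uids, 'unknown nicknames:', fail_nick_names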
def getgroup_v2(qq_xnr):
    group_dict = {}
    # step0: resolve the qqbot port; accept either a QXNR id or a raw qq number
    if qq_xnr[:4] != 'QXNR':
        search_result = es.search(index=qq_xnr_index_name, doc_type=qq_xnr_index_type,
                                  body={'query': {'term': {'qq_number': qq_xnr}}})['hits']['hits']
        qq_xnr = search_result[0]['_id']
    qq_xnr_es_result = es.get(index=qq_xnr_index_name, doc_type=qq_xnr_index_type,
                              id=qq_xnr, _source=True)['_source']
    group_info = json.loads(qq_xnr_es_result['group_info'])
    qqbot_port = qq_xnr_es_result['qqbot_port']
    print 'qqbot_port..', qqbot_port
    # list the bot's buddies through the qqbot CLI
    p_str = "qq " + str(qqbot_port) + " list buddy"
    print 'qqbot command:', p_str
    p = subprocess.Popen(p_str, shell=True,
                         stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    line_count = 0
    for line in p.stdout.readlines():
        line_count += 1
        # the buddy table starts at line 5; every second line is a data row
        if line_count >= 5 and line_count % 2 == 1:
            item_line_list = line.split('|')
            try:
                qq_uin_number = str(int(item_line_list[7]))
                qq_group_name = item_line_list[4]
                qq_mark_name = item_line_list[5]
                group_dict[qq_uin_number] = qq_group_name
                # if the uin is empty, add it; otherwise refresh the group name
                # (the group name may have been changed)
                for key, value_dict in group_info.iteritems():
                    mark_name = value_dict['mark_name']
                    if not qq_mark_name:
                        if qq_mark_name == mark_name:
                            if qq_group_name not in value_dict['group_name']:
                                group_info[key]['group_name'].append(qq_group_name)
            except:
                continue
    group_info = json.dumps(group_info)
    es.update(index=qq_xnr_index_name, doc_type=qq_xnr_index_type, id=qq_xnr,
              body={'doc': {'group_info': group_info}})
    print 'group_dict::len..', len(group_dict)
    return group_dict
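# Hedged sketch: how one data row of the qqbot "list buddy" table is read.
# The column layout (index 4 = group name, 5 = mark name, 7 = uin) is taken
# from getgroup_v2() above; the sample line format is an assumption.
def _parse_buddy_line(line):
    fields = line.split('|')
    return {
        'uin': str(int(fields[7])),
        'group_name': fields[4],
        'mark_name': fields[5],
    }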
def match_flow_text():
    current_time = int(time.time())
    current_date = ts2datetime(current_time)
    new_xnr_flow_text_index_name = new_xnr_flow_text_index_name_pre + current_date
    new_weibo_xnr_flow_text_mappings(new_xnr_flow_text_index_name)
    flow_text_index_name = flow_text_index_name_pre + current_date
    query_body = {'query': {'term': {'create_status': 2}}, 'size': MAX_VALUE}
    try:
        search_results = es_xnr.search(index=weibo_xnr_index_name, doc_type=weibo_xnr_index_type,
                                       body=query_body)['hits']['hits']
        bulk_action = []
        count = 0
        for result in search_results:
            result = result['_source']
            uid = result['uid']
            xnr_user_no = result['xnr_user_no']
            match_query_body = {
                'query': {'bool': {'must': [{'term': {'uid': uid}}]}},
                'size': MAX_VALUE
            }
            match_results = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                                                body=match_query_body)['hits']['hits']
            print 'match_results..', match_results
            for match_item in match_results:
                match_item = match_item['_source']
                keyword_dict = match_item['keywords_dict']
                mid = match_item['mid']
                keywords_dict = json.loads(keyword_dict)
                personal_keywords_dict = dict()
                classify_text_dict = dict()  # text to classify
                mid_value = dict()
                for k, v in keywords_dict.iteritems():
                    k = k.encode('utf-8', 'ignore')
                    personal_keywords_dict[k] = v
                classify_text_dict[mid] = personal_keywords_dict
                if classify_text_dict:
                    classify_results = topic_classfiy([mid], classify_text_dict)
                    for k, v in classify_results.iteritems():  # mid: topic list
                        mid_value[k] = v
                    match_item['topic_field_first'] = topic_en2ch_dict[mid_value[mid][0]]
                    match_item['topic_field'] = '&'.join(mid_value[mid])
                match_item['xnr_user_no'] = xnr_user_no
                bulk_action.extend([{'index': {'_id': mid}}, match_item])
                count += 1
                if count % 1000 == 0:
                    es_xnr.bulk(bulk_action, index=new_xnr_flow_text_index_name,
                                doc_type=new_xnr_flow_text_index_type, timeout=600)
                    bulk_action = []  # reset the buffer so documents are not indexed twice
        if bulk_action:
            es_xnr.bulk(bulk_action, index=new_xnr_flow_text_index_name,
                        doc_type=new_xnr_flow_text_index_type, timeout=600)
    except:
        return 'no tweets to update today'
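# Hedged sketch: the keywords_dict -> topic_field mapping used in
# match_flow_text(), reduced to a single document. topic_classfiy and
# topic_en2ch_dict are the same helpers the function above uses; the keyword
# weights passed in are illustrative only.
def _example_topic_fields(mid, keywords_dict):
    personal_keywords_dict = dict()
    for k, v in keywords_dict.iteritems():
        personal_keywords_dict[k.encode('utf-8', 'ignore')] = v
    topics = topic_classfiy([mid], {mid: personal_keywords_dict})[mid]
    return topic_en2ch_dict[topics[0]], '&'.join(topics)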
def qq_history_count(xnr_user_no, qq_number, current_time):
    if S_TYPE == 'test':
        current_time = datetime2ts(QQ_S_DATE_ASSESSMENT)
    current_date = ts2datetime(current_time)
    last_date = ts2datetime(current_time - DAY)
    group_message_index_name = group_message_index_name_pre + current_date
    qq_xnr_history_count_index_name = qq_xnr_history_count_index_name_pre + last_date
    # number of messages the xnr itself posted today
    query_body = {
        'query': {'bool': {'must': [
            {'term': {'speaker_qq_number': qq_number}},
            {'term': {'xnr_qq_number': qq_number}}
        ]}}
    }
    count_result = es.count(index=group_message_index_name,
                            doc_type=group_message_index_type,
                            body=query_body)
    if count_result['_shards']['successful'] != 0:
        today_count = count_result['count']
    else:
        print 'es index rank error'
        today_count = 0
    # total number of historical posts
    try:
        get_result = es.get(index=qq_xnr_history_count_index_name,
                            doc_type=qq_xnr_history_count_index_type,
                            id=xnr_user_no)['_source']
        total_count_history = get_result['total_post_num']
    except:
        total_count_history = 0
    total_count_today = total_count_history + today_count
    item_dict = dict()
    item_dict['total_post_num'] = total_count_today
    item_dict['daily_post_num'] = today_count
    # the most active speaker today in the groups this xnr belongs to
    query_body_total_day = {
        'query': {'filtered': {'filter': {'term': {'xnr_qq_number': qq_number}}}},
        'aggs': {'all_speakers': {'terms': {
            'field': 'speaker_qq_number',
            'order': {'_count': 'desc'}
        }}}
    }
    try:
        results_total_day = es_xnr.search(index=group_message_index_name, doc_type=group_message_index_type,
                                          body=query_body_total_day)['aggregations']['all_speakers']['buckets']
        speaker_max = results_total_day[0]['doc_count']
    except:
        speaker_max = today_count
    # log-scaled ratio of own posts to the busiest speaker, in [0, 100]
    safe = (float(math.log(today_count + 1)) / (math.log(speaker_max + 1) + 1)) * 100
    safe = round(safe, 2)  # keep two decimal places
    item_dict['mark'] = safe
    return item_dict
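# Worked example of the activity mark computed above: with 10 posts today and
# the busiest speaker at 50 posts, mark = ln(11) / (ln(51) + 1) * 100 ~= 48.6.
# The counts are illustrative.
def _example_activity_mark(today_count=10, speaker_max=50):
    return round(float(math.log(today_count + 1)) / (math.log(speaker_max + 1) + 1) * 100, 2)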
def publish_operate_timing(): query_body = { 'query': { 'filtered': { 'filter': { 'term': { 'task_status': 0 } } } }, 'size': MAX_SEARCH_SIZE } results = es_xnr.search(index=weibo_xnr_timing_list_index_name,doc_type=\ weibo_xnr_timing_list_index_type,body=query_body)['hits']['hits'] print 'results::', results if results: for result in results: _id = result['_id'] result = result['_source'] timestamp_set = result['post_time'] print timestamp_set if timestamp_set <= int(time.time()): print '!!' text = result['text'].encode('utf-8') tweet_type = task_source_ch2en[result['task_source']] xnr_user_no = result['xnr_user_no'] try: p_url = result['p_url'] except: p_url = '' try: rank = result['rank'] except: rank = u'0' try: rankid = result['rankid'] except: rankid = '' #r_mid = result['mid'] es_get_result = es_xnr.get(index=weibo_xnr_index_name, doc_type=weibo_xnr_index_type, id=xnr_user_no)['_source'] weibo_mail_account = es_get_result['weibo_mail_account'] weibo_phone_account = es_get_result['weibo_phone_account'] password = es_get_result['password'] if weibo_mail_account: account_name = weibo_mail_account elif weibo_phone_account: account_name = weibo_phone_account else: return False mark = publish_tweet_func(account_name, password, text, p_url, rank, rankid, tweet_type, xnr_user_no) if mark[0]: #task_id = xnr_user_no + '_' + r_mid task_id = _id # item_exist = es_xnr.get(index=weibo_xnr_retweet_timing_list_index_name,doc_type=\ # weibo_xnr_retweet_timing_list_index_type,id=task_id)['_source'] item_exist = {} item_exist['task_status'] = 1 #item_exist['timstamp_post'] = int(time.time()) es_xnr.update(index=weibo_xnr_timing_list_index_name,doc_type=\ weibo_xnr_timing_list_index_type,id=task_id,body={'doc':item_exist}) # # 保存微博 # try: # save_mark = save_to_xnr_flow_text(tweet_type,xnr_user_no,text) # except: # print '保存微博过程遇到错误!' # save_mark = False else: continue
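# Hedged sketch of a timing-task document in the shape publish_operate_timing()
# reads (field names come from the code above; the task_source value is an
# assumption about the keys of task_source_ch2en).
def _example_timing_task(xnr_user_no, text, post_ts):
    return {
        'task_status': 0,        # 0 = pending; set to 1 once the tweet is published
        'post_time': post_ts,    # published as soon as now >= post_time
        'text': text,
        'task_source': u'微博',
        'xnr_user_no': xnr_user_no,
        'p_url': '',
        'rank': u'0',
        'rankid': '',
    }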
def get_penetration_num(xnr_user_no, qq_number, current_time):
    follow_group_sensitive = {}
    follow_group_sensitive['sensitive_info'] = {}
    get_result = es_xnr.get(index=qq_xnr_index_name, doc_type=qq_xnr_index_type,
                            id=xnr_user_no)['_source']
    group_list = []
    group_info = json.loads(get_result['group_info'])
    for key, value_dict in group_info.iteritems():
        group_name = value_dict['group_name']
        group_list.extend(group_name)
    if S_TYPE == 'test':
        current_time = datetime2ts(QQ_S_DATE_ASSESSMENT)
    current_date = ts2datetime(current_time)
    group_message_index_name = group_message_index_name_pre + current_date
    # average sensitivity over the groups this xnr follows
    query_body_info = {
        'query': {'filtered': {'filter': {'terms': {'qq_group_nickname': group_list}}}},
        'aggs': {'avg_sensitive': {'avg': {'field': 'sensitive_value'}}}
    }
    sensitive_value = 0.0
    try:
        es_sensitive_result = es_xnr.search(index=group_message_index_name, doc_type=group_message_index_type,
                                            body=query_body_info)['aggregations']
        sensitive_value = es_sensitive_result['avg_sensitive']['value']
        if sensitive_value is None:
            sensitive_value = 0.0
        follow_group_sensitive['sensitive_info'] = round(sensitive_value, 2)
    except:
        follow_group_sensitive['sensitive_info'] = 0
    # the single most sensitive message, used to normalise the score
    query_body_max = {
        'query': {'filtered': {'filter': {'terms': {'qq_group_nickname': group_list}}}},
        'sort': {'sensitive_value': {'order': 'desc'}}
    }
    try:
        max_results = es_xnr.search(index=group_message_index_name, doc_type=group_message_index_type,
                                    body=query_body_max)['hits']['hits']
        max_sensitive = max_results[0]['_source']['sensitive_value']
    except:
        max_sensitive = 0
    # log-scaled penetration mark in [0, 100]
    penetration = (math.log(sensitive_value + 1) / (math.log(max_sensitive + 1) + 1)) * 100
    penetration = round(penetration, 2)
    follow_group_sensitive['mark'] = penetration
    return follow_group_sensitive
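# Hedged sketch: the shape of the aggregation response get_penetration_num()
# reads. For an avg aggregation Elasticsearch returns
# {'avg_sensitive': {'value': <float or None>}}; None (no matching documents)
# is mapped to 0.0 here, as in the function above.
def _read_avg_sensitive(aggregations):
    value = aggregations['avg_sensitive']['value']
    return round(value, 2) if value is not None else 0.0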
def retweet_operate_timing(): query_body = { 'query': { 'filtered': { 'filter': { 'term': { 'compute_status': 0 } } } } } results = es_xnr.search(index=weibo_xnr_retweet_timing_list_index_name,doc_type=\ weibo_xnr_retweet_timing_list_index_type,body=query_body)['hits']['hits'] if results: for result in results: result = result['_source'] timestamp_set = result['timestamp_set'] if timestamp_set <= int(time.time()): text = result['text'].encode('utf-8') tweet_type = 'trace_follow_tweet' xnr_user_no = result['xnr_user_no'] r_mid = result['mid'] es_get_result = es_xnr.get(index=weibo_xnr_index_name, doc_type=weibo_xnr_index_type, id=xnr_user_no)['_source'] weibo_mail_account = es_get_result['weibo_mail_account'] weibo_phone_account = es_get_result['weibo_phone_account'] password = es_get_result['password'] if weibo_mail_account: account_name = weibo_mail_account elif weibo_phone_account: account_name = weibo_phone_account else: return False print 'text::', text print 'r_mid:::', r_mid text = '' # 空转发 mark = retweet_tweet_func(account_name, password, text, r_mid, tweet_type, xnr_user_no) print 'mark::', mark[0] if mark[0]: task_id = xnr_user_no + '_' + r_mid # item_exist = es_xnr.get(index=weibo_xnr_retweet_timing_list_index_name,doc_type=\ # weibo_xnr_retweet_timing_list_index_type,id=task_id)['_source'] item_exist = {} item_exist['compute_status'] = 1 #item_exist['timstamp_post'] = int(time.time()) es_xnr.update(index=weibo_xnr_retweet_timing_list_index_name,doc_type=\ weibo_xnr_retweet_timing_list_index_type,id=task_id,body={'doc':item_exist}) # # 保存微博 # try: # save_mark = save_to_xnr_flow_text(tweet_type,xnr_user_no,text) # except: # print '保存微博过程遇到错误!' # save_mark = False else: continue
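# Hedged sketch: one plausible way to drive the two timing jobs above. The
# original scheduling mechanism (cron, celery, ...) is not shown in this
# module, so the one-minute polling loop is only an assumption.
def _run_timing_jobs_forever(interval=60):
    while True:
        publish_operate_timing()
        retweet_operate_timing()
        time.sleep(interval)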
def detect_by_keywords(keywords, datetime_list): keywords_list = [] model = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True) for word in keywords: simi_list = model.most_similar(word, topn=20) for simi_word in simi_list: keywords_list.append(simi_word[0]) group_uid_list = set() if datetime_list == []: return [] query_item = 'text' flow_text_index_name_list = [] for datetime in datetime_list: flow_text_index_name = facebook_flow_text_index_name_pre + datetime flow_text_index_name_list.append(flow_text_index_name) nest_query_list = [] #文本中可能存在英文或者繁体字,所以都匹配一下 en_keywords_list = trans(keywords_list, target_language='en') for i in range(len(keywords_list)): keyword = keywords_list[i] traditional_keyword = simplified2traditional(keyword) if len(en_keywords_list) == len(keywords_list): #确保翻译没出错 en_keyword = en_keywords_list[i] nest_query_list.append( {'wildcard': { query_item: '*' + en_keyword + '*' }}) nest_query_list.append({'wildcard': {query_item: '*' + keyword + '*'}}) nest_query_list.append( {'wildcard': { query_item: '*' + traditional_keyword + '*' }}) count = MAX_DETECT_COUNT if len(nest_query_list) == 1: SHOULD_PERCENT = 1 # 绝对数量。 保证至少匹配一个词 else: SHOULD_PERCENT = '3' # 相对数量。 2个词时,保证匹配2个词,3个词时,保证匹配2个词 query_body = { 'query': { 'bool': { 'should': nest_query_list, 'minimum_should_match': SHOULD_PERCENT, # 'must_not':{'terms':{'uid':white_uid_list}} } }, 'aggs': { 'all_uids': { 'terms': { 'field': 'uid', 'order': { '_count': 'desc' }, 'size': count } } } } es_results = es_xnr.search(index=flow_text_index_name_list,doc_type=facebook_flow_text_index_type,\ body=query_body,request_timeout=999999)['aggregations']['all_uids']['buckets'] for i in range(len(es_results)): uid = es_results[i]['key'] group_uid_list.add(uid) group_uid_list = list(group_uid_list) return group_uid_list
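# Hedged usage sketch: expand seed keywords over the most recent days of
# Facebook flow-text indices. The seed words and the 3-day window are
# illustrative only.
def _example_detect(keywords=(u'keyword1', u'keyword2'), days=3):
    now = int(time.time())
    datetime_list = [ts2datetime(now - i * DAY) for i in range(days)]
    return detect_by_keywords(list(keywords), datetime_list)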