def lookup_active_user(classify_id,xnr_id,start_time,end_time): time_gap = end_time - start_time now_time = time.time() test_time_gap = datetime2ts(ts2datetime(now_time)) - datetime2ts(S_DATE_TW) if S_TYPE == 'test': today_date_time = datetime2ts(S_DATE_TW) start_time = start_time - test_time_gap end_time = end_time - test_time_gap from_date_ts=datetime2ts(ts2datetime(start_time)) to_date_ts=datetime2ts(ts2datetime(end_time)) bci_index_name = tw_bci_index_name_pre + ''.join(ts2datetime(end_time - DAY)) userlist = lookup_xnr_concernedusers(xnr_id) if classify_id == 1: condition_list=[{'bool':{'must':{'terms':{'uid':userlist}}}}] elif classify_id == 2: condition_list=[{'bool':{'must_not':[{'terms':{'uid':userlist}}]}}] elif classify_id == 0: condition_list=[{'match_all':{}}] print userlist,classify_id,condition_list results = [] for item in condition_list: query_body={ 'query':item, 'size':HOT_WEIBO_NUM, #查询影响力排名前50的用户即可 'sort':{'influence':{'order':'desc'}} } try: flow_text_exist=es_xnr.search(index=bci_index_name,\ doc_type=tw_bci_index_type,body=query_body)['hits']['hits'] search_uid_list = [item['_source']['uid'] for item in flow_text_exist] user_exist = es_xnr.search(index=twitter_user_index_name,\ doc_type=twitter_user_index_type,body={'query':{'terms':{'uid':search_uid_list}}})['hits']['hits'] user_dict = dict() for item in user_exist: uid = item['_source']['uid'] user_dict[uid] = item['_source'] for item in flow_text_exist: influence = item['_source']['influence'] active = item['_source']['active'] uid = item['_source']['uid'] try: user_info = user_dict[uid] uname = user_info['name'] location = user_info['locale'] link = user_info['link'] except: uname = '' location = '' link = '' results.append({'uid':uid, 'influence':influence, 'active':active, \ 'uname': uname, 'location':location, 'link': link}) except Exception,e: print e results = []
def lookup_todaytwitter_date_warming(keywords, today_datetime):
    """Keyword-match today's twitter flow text and enrich each hit with its
    attention counters (comment/share/favorite) and the author's nickname.

    Returns a list of enriched _source dicts, or [] on any failure.

    BUGFIX: lookup_tid_attend_index returns a LIST of _source dicts, but the
    counters were read as tid_result['comment'] — a TypeError on every
    non-empty result that the blanket except turned into a constant [];
    the newest record tid_result[0] is now used.
    """
    keyword_query_list = []
    for keyword in keywords:
        keyword_query_list.append({'wildcard': {'text': '*' + keyword.encode('utf-8') + '*'}})
    twitter_flow_text_index_name = get_timets_set_indexset_list(
        twitter_flow_text_index_name_pre, today_datetime, today_datetime)
    query_body = {
        'query': {
            'bool': {
                'should': keyword_query_list
            }
        },
        'size': MAX_WARMING_SIZE
    }
    try:
        temp_result = es_xnr_2.search(index=twitter_flow_text_index_name,
                                      doc_type=twitter_flow_text_index_type,
                                      body=query_body)['hits']['hits']
        date_result = []
        for item in temp_result:
            # attention counters for this tweet (newest record first)
            tid_result = lookup_tid_attend_index(item['_source']['tid'], today_datetime)
            if tid_result:
                item['_source']['comment'] = tid_result[0]['comment']
                item['_source']['share'] = tid_result[0]['share']
                item['_source']['favorite'] = tid_result[0]['favorite']
            else:
                item['_source']['comment'] = 0
                item['_source']['share'] = 0
                item['_source']['favorite'] = 0
            # author nickname
            item['_source']['nick_name'] = get_user_nickname(item['_source']['uid'])
            date_result.append(item['_source'])
    except:
        date_result = []
    return date_result
def lookup_today_keywords(from_ts, to_ts, xnr_user_no):
    """Aggregate the top-50 keyword counts of the xnr's concerned users
    within [from_ts, to_ts]; returns {keyword: count}."""
    concerned_uids = lookup_xnr_concernedusers(xnr_user_no)
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [
                            {'terms': {'uid': concerned_uids}},
                            {'range': {'timestamp': {'gte': from_ts, 'lte': to_ts}}},
                        ]
                    }
                }
            }
        },
        'aggs': {
            'keywords': {
                'terms': {'field': 'keywords_string', 'size': 50}
            }
        },
    }
    index_name = twitter_flow_text_index_name_pre + ts2datetime(to_ts)
    buckets = es_xnr.search(index=index_name,
                            doc_type=twitter_flow_text_index_type,
                            body=query_body)['aggregations']['keywords']['buckets']
    return dict((bucket['key'], bucket['doc_count']) for bucket in buckets)
def lookup_history_keywords(from_ts, to_ts, xnr_user_no):
    """Merge the persisted keyword-count dicts of one xnr over [from_ts, to_ts].

    Returns an empty dict when no records exist, otherwise the union of all
    stored keyword_value_string dicts (via union_dict)."""
    owner_filter = {'term': {'xnr_user_no': xnr_user_no}}
    time_filter = {'range': {'timestamp': {'gte': from_ts, 'lte': to_ts}}}
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'bool': {'must': [owner_filter, time_filter]}
                }
            }
        }
    }
    hits = es_xnr.search(index=facebook_keyword_count_index_name,
                         doc_type=facebook_keyword_count_index_type,
                         body=query_body)['hits']['hits']
    if not hits:
        return dict()
    merged = dict()
    for hit in hits:
        stored = json.loads(hit['_source']['keyword_value_string'])
        merged = union_dict(merged, stored)
    return merged
def get_show_fb_xnr(submitter):
    """Map xnr_user_no -> nick_name for this submitter's fully created
    facebook xnrs (create_status == 2)."""
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'submitter': submitter}},
                    {'term': {'create_status': 2}},
                ]
            }
        },
        'size': MAX_SEARCH_SIZE,
    }
    hits = es.search(index=fb_xnr_index_name, doc_type=fb_xnr_index_type,
                     body=query_body)['hits']['hits']
    return dict((hit['_source']['xnr_user_no'], hit['_source']['nick_name'])
                for hit in hits)
def lookup_history_fullkeywords(from_ts, to_ts):
    """Merge up to 100 persisted full-keyword dicts over [from_ts, to_ts].

    Returns an empty dict when no records exist, otherwise the union of all
    stored keyword_value_string dicts (via union_dict)."""
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [
                            {'range': {'timestamp': {'gte': from_ts, 'lte': to_ts}}},
                        ]
                    }
                }
            }
        },
        'size': 100,
    }
    hits = es_xnr.search(index=facebook_full_keyword_index_name,
                         doc_type=facebook_full_keyword_index_type,
                         body=query_body)['hits']['hits']
    if not hits:
        return dict()
    merged = dict()
    for hit in hits:
        stored = json.loads(hit['_source']['keyword_value_string'])
        merged = union_dict(merged, stored)
    return merged
def get_tw_xnr_list(user_account, status_start, status_end):
    """List xnr_user_no values of the account's twitter xnrs whose
    create_status lies in [status_start, status_end); [] on failure."""
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [
                            {'term': {'submitter': user_account}},
                            {'range': {'create_status': {'gte': status_start,
                                                         'lt': status_end}}},
                        ]
                    }
                }
            }
        },
        'size': USER_XNR_NUM,
    }
    try:
        hits = es_xnr_2.search(index=tw_xnr_index_name,
                               doc_type=tw_xnr_index_type,
                               body=query_body)['hits']['hits']
        return [hit['_source']['xnr_user_no'] for hit in hits]
    except:
        return []
def get_save_step_one(task_detail): es_results = es.search(index=fb_xnr_index_name,doc_type=fb_xnr_index_type,body={'query':{'match_all':{}},\ 'sort':{'user_no':{'order':'desc'}}})['hits']['hits'] if es_results: user_no_max = es_results[0]['_source']['user_no'] user_no_current = user_no_max + 1 else: user_no_current = 1 task_detail['user_no'] = user_no_current task_id = user_no2fb_id(user_no_current) #五位数 WXNR0001 print 'task_id' print task_id try: item_exist = dict() item_exist['user_no'] = task_detail['user_no'] item_exist['domain_name'] = task_detail['domain_name'] item_exist['role_name'] = task_detail['role_name'] item_exist['psy_feature'] = '&'.join( task_detail['psy_feature'].encode('utf-8').split(',')) item_exist['political_side'] = task_detail['political_side'] item_exist['business_goal'] = '&'.join( task_detail['business_goal'].encode('utf-8').split(',')) # item_exist['daily_interests'] = '&'.join(task_detail['daily_interests'].encode('utf-8').split(',')) item_exist['monitor_keywords'] = '&'.join( task_detail['monitor_keywords'].encode('utf-8').split(',')) item_exist['create_status'] = 0 # 第一步完成 print es.index(index=fb_xnr_index_name, doc_type=fb_xnr_index_type, id=task_id, body=item_exist) mark = True except: mark = False return mark
def lookup_history_user_warming(xnr_user_no, start_time, end_time):
    """Fetch an xnr's user-warming records in [start_time, end_time],
    returned sorted by user_sensitive descending; [] on failure."""
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [
                            {'term': {'xnr_user_no': xnr_user_no}},
                            {'range': {'timestamp': {'gte': start_time,
                                                     'lte': end_time}}},
                        ]
                    }
                }
            }
        },
        'sort': {'user_sensitive': {'order': 'asc'}},
        'size': MAX_WARMING_SIZE,
    }
    index_names = get_xnr_warming_index_listname(
        twitter_user_warning_index_name_pre, start_time, end_time)
    try:
        hits = es_xnr_2.search(index=index_names,
                               doc_type=twitter_user_warning_index_type,
                               body=query_body)['hits']['hits']
        records = [hit['_source'] for hit in hits]
        # final ordering is most-sensitive first
        records.sort(key=lambda rec: rec.get('user_sensitive', 0), reverse=True)
    except:
        records = []
    return records
def lookup_today_fullkeywords(from_ts, to_ts):
    """Aggregate today's top facebook keywords, then re-weight them through
    extract_keywords; returns {keyword: count} ({} on failure)."""
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [
                            {'range': {'timestamp': {'gte': from_ts, 'lte': to_ts}}},
                        ]
                    }
                }
            }
        },
        'aggs': {
            'keywords': {
                'terms': {'field': 'keywords_string', 'size': 80}
            }
        },
    }
    index_name = facebook_flow_text_index_name_pre + ts2datetime(to_ts)
    try:
        buckets = es_xnr.search(index=index_name,
                                doc_type=facebook_flow_text_index_type,
                                body=query_body)['aggregations']['keywords']['buckets']
        counts = dict()
        joined = ''
        for bucket in buckets:
            counts[bucket['key']] = bucket['doc_count']
            # '&'-joined string fed to the keyword extractor
            joined += '&'
            joined += bucket['key']
        refined = dict()
        for token in extract_keywords(joined):
            word = token.word
            if word in counts:
                refined[word] = counts[word]
            else:
                refined[word] = 1  # extractor-only words get a default count
    except:
        refined = dict()
    return refined
def lookup_history_speech_warming(xnr_user_no, show_type, start_time, end_time):
    """Fetch an xnr's speech-warming records in [start_time, end_time].

    show_type selects 0 = all users, 1 = friends only, 2 = non-friends only.
    Returns records sorted by sensitive descending; [] on query failure."""
    base_must = [
        {'term': {'xnr_user_no': xnr_user_no}},
        {'range': {'timestamp': {'gte': start_time, 'lte': end_time}}},
    ]
    conditions = []
    if show_type == 0:    # all users
        conditions.append({'must': base_must})
    elif show_type == 1:  # friends
        conditions.append({'must': [{'term': {'content_type': 'friends'}}] + base_must})
    elif show_type == 2:  # non-friends
        conditions.append({'must': [{'term': {'content_type': 'unfriends'}}] + base_must})
    query_body = {
        'query': {
            'filtered': {
                'filter': {'bool': conditions[0]}
            }
        },
        'size': SPEECH_WARMING_NUM,
        'sort': {'sensitive': {'order': 'desc'}},
    }
    index_names = get_xnr_warming_index_listname(
        twitter_speech_warning_index_name_pre, start_time, end_time)
    try:
        hits = es_xnr_2.search(index=index_names,
                               doc_type=twitter_speech_warning_index_type,
                               body=query_body)['hits']['hits']
        records = [hit['_source'] for hit in hits]
        records.sort(key=lambda rec: rec.get('sensitive', 0), reverse=True)
    except:
        records = []
    return records
def get_modify_userinfo(task_detail):
    """Build a user-info dict from task_detail and push it to the facebook
    account via change_userinfo.

    Looks the xnr up by nick_name, prefers the mail account over the phone
    account as login name, and returns change_userinfo's result (False when
    the remote call fails).

    BUGFIX: the random birth month/day were drawn with randint(0, 13) and
    randint(0, 29), which can produce invalid values such as month '00'/'13'
    or day '00'; they are now constrained to 1-12 and 1-28 (28 keeps every
    month valid).
    """
    item_dict = {}
    nick_name = task_detail['nick_name']
    location_list = task_detail['location'].encode('utf-8').split(',')
    try:
        item_dict['location_province'] = location_list[0]
        item_dict['location_city'] = location_list[1]
    except:
        # only a province was supplied; reuse it as the city
        item_dict['location_province'] = location_list[0]
        item_dict['location_city'] = location_list[0]
    item_dict['description'] = task_detail['description']
    gender = task_detail['gender']
    if gender == u'男':
        item_dict['gender'] = 'man'
    else:
        item_dict['gender'] = 'woman'
    age = task_detail['age']
    birth_year = time.localtime().tm_year - int(age)
    # BUGFIX: was randint(0, 13) / randint(0, 29) -> invalid month/day possible
    month = '%02d' % random.randint(1, 12)
    day = '%02d' % random.randint(1, 28)
    item_dict['birth'] = [str(birth_year), month, day]
    query_body = {
        'query': {
            'filtered': {
                'filter': {'term': {'nick_name': nick_name}}
            }
        }
    }
    es_results = es.search(index=fb_xnr_index_name, doc_type=fb_xnr_index_type,
                           body=query_body)['hits']['hits']
    xnr_result = es_results[0]['_source']
    try:
        fb_mail_account = xnr_result['fb_mail_account']
    except:
        fb_mail_account = ''
    try:
        fb_phone_account = xnr_result['fb_phone_account']
    except:
        fb_phone_account = ''
    # mail account is preferred as the login name
    if fb_mail_account:
        account_name = fb_mail_account
    else:
        account_name = fb_phone_account
    password = xnr_result['password']
    uid = xnr_result['uid']
    try:
        result = change_userinfo(account_name, password, uid, item_dict)
    except:
        result = False
    return result
def get_show_example_model():
    """Return the _source of every document in the tw example-model index."""
    hits = es.search(index=tw_example_model_index_name,
                     doc_type=tw_example_model_index_type,
                     body={'query': {'match_all': {}}})['hits']['hits']
    return [hit['_source'] for hit in hits]
def get_show_domain():
    """Map domain_pinyin -> domain_name for every facebook domain document."""
    body = {'query': {'match_all': {}}, 'size': MAX_SEARCH_SIZE}
    hits = es.search(index=fb_domain_index_name, doc_type=fb_domain_index_type,
                     body=body)['hits']['hits']
    mapping = {}
    for hit in hits:
        src = hit['_source']
        mapping[src['domain_pinyin']] = src['domain_name']
    return mapping
def get_fb_xnr_no():
    """Return the current maximum facebook xnr user_no.

    Prefers the redis counter; when redis has no record, falls back to the
    highest user_no stored in es (0 when the index is empty)."""
    if r.exists(fb_xnr_max_no):
        # redis already tracks the counter
        return int(r.get(fb_xnr_max_no))
    hits = es.search(index=fb_xnr_index_name, doc_type=fb_xnr_index_type,
                     body={'query': {'match_all': {}},
                           'sort': {'user_no': {'order': 'desc'}}})['hits']['hits']
    if hits:
        return hits[0]['_source']['user_no']
    return 0
def get_xnr_info(task_detail):
    """Return the raw es hits for the facebook xnr whose nick_name matches
    task_detail['nick_name']."""
    body = {
        'query': {
            'filtered': {
                'filter': {'term': {'nick_name': task_detail['nick_name']}}
            }
        }
    }
    return es.search(index=fb_xnr_index_name, doc_type=fb_xnr_index_type,
                     body=body)['hits']['hits']
def get_nick_name_unique(nick_name):
    """Return True when nick_name is free, i.e. present in NEITHER the user
    profile index NOR the facebook xnr index.

    BUGFIX: the check previously required the name to exist in BOTH indexes
    ('and') before rejecting it, so a name already taken in just one index
    was wrongly reported as unique; any hit in either index now rejects it.
    """
    query_body = {'query': {'term': {'nick_name': nick_name}}}
    es_profile_results = es_user_profile.search(
        index=profile_index_name, doc_type=profile_index_type,
        body=query_body)['hits']['hits']
    es_xnr_results = es.search(index=fb_xnr_index_name,
                               doc_type=fb_xnr_index_type,
                               body=query_body)['hits']['hits']
    if es_profile_results or es_xnr_results:
        mark = False
    else:
        mark = True
    return mark
def lookup_twitter_date_warming_content(start_year, end_year, date_time, date_name,
                                        start_time, end_time, keywords):
    """Collect timed-warning content across the year range [start_year,
    end_year] for one calendar date, plus today's live warming when
    [start_time, end_time] covers the present moment.

    BUGFIX: the multi-year loop always built the index name from
    start_year_int, probing the same start-year index once per year instead
    of each year in the range; it now uses the loop year (iter_year).
    """
    twitter_timing_warning_index_name_list = []
    if start_year != end_year:
        start_year_int = int(start_year)
        iter_year = int(end_year)
        while iter_year >= start_year_int:
            # BUGFIX: was str(start_year_int) for every iteration
            index_name = twitter_timing_warning_index_name_pre + str(iter_year) + '-' + date_time
            if es_xnr_2.indices.exists(index=index_name):
                twitter_timing_warning_index_name_list.append(index_name)
            iter_year = iter_year - 1
    else:
        index_name = twitter_timing_warning_index_name_pre + start_year + '-' + date_time
        if es_xnr_2.indices.exists(index=index_name):
            twitter_timing_warning_index_name_list.append(index_name)
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [
                            {'term': {'date_name': date_name}}
                        ]
                    }
                }
            }
        },
        'sort': {'timestamp': {'order': 'asc'}},
        'size': MAX_WARMING_SIZE
    }
    result = es_xnr_2.search(index=twitter_timing_warning_index_name_list,
                             doc_type=twitter_timing_warning_index_type,
                             body=query_body)['hits']['hits']
    warming_content = []
    for item in result:
        warming_content.extend(json.loads(item['_source']['twitter_date_warming_content']))
    # include live warnings when the requested window covers the present
    now_time = int(time.time())
    if now_time >= start_time and now_time <= end_time:
        today_warming = lookup_todaytwitter_date_warming(keywords, now_time)
        warming_content.append(today_warming)
    return warming_content
def get_all_access_level_info(account_name): account_name = account_name query_body = {"query": {"match_all": {}}, "size": 9999} try: result = es_xnr_2.search(index=access_control_index_name, doc_type=access_control_index_type, body=query_body)['hits']['hits'] print result except Exception as e: print e return [] results = [] if result != []: for item in result: results.append(item['_source']) print(results) return results
def get_hashtag(today_datetime):
    """Top sensitive hashtags of the day.

    Aggregates document count and summed sensitivity per hashtag over the
    day's flow-text indexes, skipping empty keys, and returns dicts with
    event_name/event_count/event_sensitive sorted by (sensitivity, count)
    descending."""
    index_names = get_timets_set_indexset_list(
        twitter_flow_text_index_name_pre, today_datetime, today_datetime)
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [
                            {'range': {'sensitive': {'gte': 1}}}
                        ]
                    }
                }
            }
        },
        'aggs': {
            'all_hashtag': {
                'terms': {'field': 'hashtag'},
                'aggs': {
                    'sum_sensitive': {'sum': {'field': 'sensitive'}}
                }
            }
        },
        'size': 5,
    }
    buckets = es_xnr_2.search(index=index_names,
                              doc_type=twitter_flow_text_index_type,
                              body=query_body)['aggregations']['all_hashtag']['buckets']
    hashtag_list = []
    for bucket in buckets:
        if not bucket['key']:
            continue
        hashtag_list.append({
            'event_name': bucket['key'],
            'event_count': bucket['doc_count'],
            'event_sensitive': bucket['sum_sensitive']['value'],
        })
    hashtag_list.sort(key=lambda k: (k.get('event_sensitive', 0),
                                     k.get('event_count', 0)), reverse=True)
    return hashtag_list
def get_show_domain_group_summary(submitter):
    """Summarize each penetration-domain group created by submitter.

    Returns a list of summary dicts, or a Chinese notice string when the
    account has created none."""
    hits = es.search(index=tw_domain_index_name, doc_type=tw_domain_index_type,
                     body={'query': {'term': {'submitter': submitter}}})['hits']['hits']
    if not hits:
        return '当前账户尚未创建渗透领域'
    result_all = []
    for hit in hits:
        src = hit['_source']
        summary = {
            'group_size': src['group_size'],
            'domain_name': src['domain_name'],
            'create_time': src['create_time'],
            'compute_status': src['compute_status'],
            'create_type': src['create_type'],
            'remark': src['remark'],
            'description': src['description'],
        }
        # parsed-but-unused in the original; kept so malformed JSON still raises
        json.loads(src['create_type'].encode('utf-8'))
        result_all.append(summary)
    return result_all
def show_corpus_facebook(corpus_type):
    """List facebook corpus documents of the given corpus_type, each _source
    augmented with its es document id under 'id'."""
    query_body = {
        'query': {
            'filtered': {
                'filter': {'term': {'corpus_type': corpus_type}}
            }
        },
        'size': MAX_VALUE,
    }
    hits = es.search(index=facebook_xnr_corpus_index_name,
                     doc_type=facebook_xnr_corpus_index_type,
                     body=query_body)['hits']['hits']
    docs = []
    for hit in hits:
        source = hit['_source']
        source['id'] = hit['_id']
        docs.append(source)
    return docs
def show_condition_corpus_tw(corpus_condition):
    """List twitter corpus documents matching the supplied bool-must
    condition, each _source augmented with its es document id under 'id'."""
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'bool': {'must': corpus_condition}
                }
            }
        },
        'size': MAX_VALUE,
    }
    hits = es.search(index=twitter_xnr_corpus_index_name,
                     doc_type=twitter_xnr_corpus_index_type,
                     body=query_body)['hits']['hits']
    docs = []
    for hit in hits:
        source = hit['_source']
        source['id'] = hit['_id']
        docs.append(source)
    return docs
def lookup_tid_attend_index(tid,from_ts,to_ts):
    # Look up the most recent count record (comment/share/favorite counters)
    # for one tweet id within [from_ts, to_ts].
    # Returns a LIST of matching _source dicts (at most one, newest first),
    # or [] when the query fails.
    # NOTE(review): this 3-argument definition is shadowed by a later
    # 2-argument definition of the same name in this file, so it is
    # unreachable through the module namespace — confirm which is intended.
    twitter_count_index_name=get_timets_set_indexset_list(twitter_count_index_name_pre,from_ts,to_ts)
    query_body={
        'query':{
            'filtered':{
                'filter':{
                    'bool':{'must':{'term':{'tid':tid}}}
                }
            }
        },
        'size':1,
        'sort':{'update_time':{'order':'desc'}}
    }
    try:
        result=es_xnr.search(index=twitter_count_index_name,doc_type=twitter_count_index_type,body=query_body)['hits']['hits']
        # print 'result:',result,twitter_count_index_name
        tid_result=[]
        for item in result:
            tid_result.append(item['_source'])
    except:
        # query failure (e.g. missing index) yields an empty result
        tid_result=[]
    return tid_result
def lookup_tid_attend_index(tid,today_datetime):
    # Same-day variant: fetch count records (comment/share/favorite counters)
    # for one tweet id, newest first, up to MAX_WARMING_SIZE entries.
    # Returns a LIST of _source dicts ([] on query failure).
    # NOTE(review): this definition shadows the earlier 3-argument version of
    # the same name. Several callers in this file treat the return value as a
    # single dict (tid_result['comment']); with a list return that raises
    # TypeError — verify the intended contract.
    twitter_count_index_name=get_timets_set_indexset_list(twitter_count_index_name_pre,today_datetime,today_datetime)
    query_body={
        'query':{
            'filtered':{
                'filter':{
                    'bool':{'must':{'term':{'tid':tid}}}
                }
            }
        },
        'size':MAX_WARMING_SIZE,
        'sort':{'update_time':{'order':'desc'}}
    }
    try:
        result=es_xnr_2.search(index=twitter_count_index_name,doc_type=twitter_count_index_type,body=query_body)['hits']['hits']
        print result
        tid_result=[]
        for item in result:
            tid_result.append(item['_source'])
    except:
        # query failure (e.g. missing index) yields an empty result
        tid_result=[]
    return tid_result
def lookup_today_personal_warming(xnr_user_no,start_time,end_time): #查询关注列表 lookup_type='followers_list' followers_list=lookup_xnr_fans_followers(xnr_user_no,lookup_type) #查询虚拟人uid xnr_uid=lookup_xnr_uid(xnr_user_no) #计算敏感度排名靠前的用户 query_body={ # 'query':{ # 'filtered':{ # 'filter':{ # 'terms':{'uid':friends_list} # } # } # }, 'aggs':{ 'friends_sensitive_num':{ 'terms':{'field':'uid'}, 'aggs':{ 'sensitive_num':{ 'sum':{'field':'sensitive'} } } } }, 'size':MAX_SEARCH_SIZE } twitter_flow_text_index_name=get_timets_set_indexset_list(twitter_flow_text_index_name_pre,start_time,end_time) try: first_sum_result=es_xnr_2.search(index=twitter_flow_text_index_name,doc_type=twitter_flow_text_index_type,\ body=query_body)['aggregations']['friends_sensitive_num']['buckets'] except: first_sum_result=[] #print first_sum_result top_userlist=[] for i in xrange(0,len(first_sum_result)): user_sensitive=first_sum_result[i]['sensitive_num']['value'] if user_sensitive > 0: user_dict=dict() user_dict['uid']=first_sum_result[i]['key'] followers_mark=judge_user_type(user_dict['uid'],followers_list) user_dict['sensitive']=user_sensitive*followers_mark top_userlist.append(user_dict) else: pass #查询敏感用户的敏感内容 results=[] for user in top_userlist: #print user user_detail=dict() user_detail['uid']=user['uid'] user_detail['user_sensitive']=user['sensitive'] user_lookup_id=user['uid'] print user_lookup_id #查询用户昵称 user_detail['user_name']=get_user_nickname(user['uid']) query_body={ 'query':{ 'filtered':{ 'filter':{ 'bool':{ 'must':[ {'term':{'uid':user['uid']}}, {'range':{'sensitive':{'gte':1,'lte':100}}} ] } } } }, 'size':MAX_WARMING_SIZE, 'sort':{'sensitive':{'order':'desc'}} } try: second_result=es_xnr_2.search(index=twitter_flow_text_index_name,doc_type=twitter_flow_text_index_type,body=query_body)['hits']['hits'] except: second_result=[] s_result=[] for item in second_result: #查询三个指标字段 tid_result=lookup_tid_attend_index(item['_source']['tid'],start_time) if tid_result: 
item['_source']['comment']=tid_result['comment'] item['_source']['share']=tid_result['share'] item['_source']['favorite']=tid_result['favorite'] else: item['_source']['comment']=0 item['_source']['share']=0 item['_source']['favorite']=0 #查询用户昵称 item['_source']['nick_name']=get_user_nickname(item['_source']['uid']) s_result.append(item['_source']) s_result.sort(key=lambda k:(k.get('sensitive',0)),reverse=True) user_detail['content']=json.dumps(s_result) user_detail['xnr_user_no']=xnr_user_no user_detail['validity']=0 user_detail['timestamp']=end_time results.append(user_detail) results.sort(key=lambda k:(k.get('user_sensitive',0)),reverse=True) return results
def create_event_warning(xnr_user_no,today_datetime,write_mark): #获取事件名称 hashtag_list = get_hashtag(today_datetime) print 'hashtag_list/:',hashtag_list twitter_flow_text_index_name=get_timets_set_indexset_list(twitter_flow_text_index_name_pre,today_datetime,today_datetime) #查询关注列表 lookup_type='followers_list' followers_list=lookup_xnr_fans_followers(xnr_user_no,lookup_type) event_warming_list=[] for event_item in hashtag_list: event_warming_content=dict() #事件名称、主要参与用户、典型微博、事件影响力、事件平均时间 event_warming_content['event_name']=event_item['event_name'] event_influence_sum=0 event_time_sum=0 query_body={ 'query':{ 'filtered':{ 'filter':{ 'bool':{ 'must':[ {'term':{'hashtag':event_item['event_name']}}, {'range':{'sensitive':{'gte':1}}} ] } } } }, 'size':MAX_WARMING_SIZE, 'sort':{'sensitive':{'order':'desc'}} } event_results=es_xnr_2.search(index=twitter_flow_text_index_name,doc_type=twitter_flow_text_index_type,body=query_body)['hits']['hits'] if event_results: twitter_result=[] alluser_num_dict=dict() #print 'sencond_time:::',int(time.time()) for item in event_results: #查询三个指标字段 tid_result=lookup_tid_attend_index(item['_source']['tid'],today_datetime) if tid_result: item['_source']['comment']=tid_result['comment'] item['_source']['share']=tid_result['share'] item['_source']['favorite']=tid_result['favorite'] else: item['_source']['comment']=0 item['_source']['share']=0 item['_source']['favorite']=0 #print 'event_content:',item['_source']['text'] #统计用户信息 if alluser_num_dict.has_key(str(item['_source']['uid'])): followers_mark=set_intersection(item['_source']['uid'],followers_list) if followers_mark > 0: alluser_num_dict[str(item['_source']['uid'])]=alluser_num_dict[str(item['_source']['uid'])]+1*2 else: alluser_num_dict[str(item['_source']['uid'])]=alluser_num_dict[str(item['_source']['uid'])]+1 else: alluser_num_dict[str(item['_source']['uid'])]=1 #计算影响力 
origin_influence_value=(1+item['_source']['comment']+item['_source']['share']+item['_source']['favorite'])*(1+item['_source']['sensitive']) followers_value=judge_user_type(item['_source']['uid'],followers_list) item['_source']['twitter_influence_value']=origin_influence_value*followers_value #查询用户昵称 item['_source']['nick_name']=get_user_nickname(item['_source']['uid']) twitter_result.append(item['_source']) #统计影响力、时间 event_influence_sum=event_influence_sum+item['_source']['twitter_influence_value'] event_time_sum=event_time_sum+item['_source']['timestamp'] # print 'third_time:::',int(time.time()) #典型信息 twitter_result.sort(key=lambda k:(k.get('twitter_influence_value',0)),reverse=True) event_warming_content['main_twitter_info']=json.dumps(twitter_result) #事件影响力和事件时间 number=len(event_results) event_warming_content['event_influence']=event_influence_sum/number event_warming_content['event_time']=event_time_sum/number #对用户进行排序 alluser_num_dict=sorted(alluser_num_dict.items(),key=lambda d:d[1],reverse=True) main_userid_list=[] for i in xrange(0,len(alluser_num_dict)): main_userid_list.append(alluser_num_dict[i][0]) #主要参与用户信息 main_user_info=[] user_es_result=es_xnr_2.mget(index=twitter_user_index_name,doc_type=twitter_user_index_type,body={'ids':main_userid_list})['docs'] # print 'user_es_result:',user_es_result for item in user_es_result: user_dict=dict() if item['found']: user_dict['uid']=item['_id'] user_dict['username']=item['_source']['username'] if item['_source'].has_key('profileimageurl'): user_dict['profileimageurl']=item['_source']['profileimageurl'] else: user_dict['profileimageurl']='' if item['_source'].has_key('statuscount'): user_dict['statuscount']=item['_source']['statuscount'] else: user_dict['statuscount']=0 if item['_source'].has_key('followerscount'): user_dict['followerscount']=item['_source']['followerscount'] else: user_dict['followerscount']=0 if item['_source'].has_key('friendscount'): user_dict['friendscount']=item['_source']['friendscount'] 
else: user_dict['friendscount']=0 else: # user_dict['icon']='' user_dict['uid']=item['_id'] user_dict['username']='' user_dict['profileimageurl']='' user_dict['statuscount']=0 user_dict['followerscount']=0 user_dict['friendscount']=0 main_user_info.append(user_dict) event_warming_content['main_user_info']=json.dumps(main_user_info) # print 'fourth_time:::',int(time.time()) event_warming_content['xnr_user_no']=xnr_user_no event_warming_content['validity']=0 event_warming_content['timestamp']=today_datetime now_time=int(time.time()) task_id=xnr_user_no+'_'+str(now_time) #写入数据库 if write_mark: # print 'today_datetime:::',ts2datetime(today_datetime) mark=write_envent_warming(today_datetime,event_warming_content,task_id) event_warming_list.append(mark) else: event_warming_list.append(event_warming_content) else: pass # print 'fifth_time:::',int(time.time()) return event_warming_list
def lookup_hot_posts(from_ts, to_ts, xnr_id, classify_id, order_id): time_gap = to_ts - from_ts now_time = time.time() test_time_gap = datetime2ts(ts2datetime(now_time)) - datetime2ts(S_DATE_FB) if S_TYPE == 'test': today_date_time = datetime2ts(S_DATE_FB) from_ts = from_ts - test_time_gap to_ts = to_ts - test_time_gap from_date_ts = datetime2ts(ts2datetime(from_ts)) to_date_ts = datetime2ts(ts2datetime(to_ts)) print 'from_date_ts, to_date_ts:', ts2date(from_date_ts), ts2date( to_date_ts) print from_date_ts, to_date_ts flow_text_index_name_list = get_timets_set_indexset_list( facebook_flow_text_index_name_pre, from_ts, to_ts) userslist = lookup_xnr_friends(xnr_id) #全部用户 0,好友 1,非好友-1 range_time_list = { 'range': { 'timestamp': { 'gte': int(from_ts), 'lt': int(to_ts) } } } # print range_time_list user_condition_list = [] if classify_id == 1: user_condition_list = [{ 'bool': { 'must': [{ 'terms': { 'uid': userslist } }, range_time_list] } }] elif classify_id == 2: user_condition_list = [{ 'bool': { 'must': [range_time_list], 'must_not': [{ 'terms': { 'uid': userslist } }] } }] elif classify_id == 0: user_condition_list = [{'bool': {'must': [range_time_list]}}] query_body = { 'query': { 'filtered': { 'filter': user_condition_list } }, 'size': MAX_HOT_POST_SIZE, 'sort': { 'timestamp': { 'order': 'desc' } } } # try: es_result=es_xnr.search(index=flow_text_index_name_list,doc_type=facebook_flow_text_index_type,\ body=query_body)['hits']['hits'] hot_result = [] for item in es_result: #查询三个指标字段 fid_result = lookup_fid_attend_index(item['_source']['fid'], from_ts, to_ts) if fid_result: item['_source']['comment'] = fid_result['comment'] item['_source']['share'] = fid_result['share'] item['_source']['favorite'] = fid_result['favorite'] else: item['_source']['comment'] = 0 item['_source']['share'] = 0 item['_source']['favorite'] = 0 #查询用户昵称 item['_source']['nick_name'] = get_user_nickname( item['_source']['uid']) hot_result.append(item['_source']) # except: # hot_result=[] if 
order_id == 1: #按时间排序 sort_condition = 'timestamp' elif order_id == 2: #按热度排序 sort_condition = 'retweeted' elif order_id == 3: #按敏感度排序 sort_condition = 'sensitive' else: #默认设为按时间排序 sort_conditiont = 'timestamp' if hot_result: hot_result.sort(key=lambda k: (k.get(sort_condition, 0)), reverse=True) hot_result = hot_result[:50] return hot_result
def show_corpus_class(create_type, corpus_type): query_condition = [] if create_type and corpus_type: query_condition.append({ 'filtered': { 'filter': { 'bool': { 'must': [{ 'term': { 'create_type': create_type } }, { 'term': { 'corpus_type': corpus_type } }] } } } }) else: if create_type: query_condition.append({ 'filtered': { 'filter': { 'bool': { 'must': { 'term': { 'create_type': create_type } } } } } }) elif corpus_type: query_condition.append({ 'filtered': { 'filter': { 'bool': { 'must': { 'term': { 'corpus_type': corpus_type } } } } } }) else: query_condition.append({'match_all': {}}) print 'query_condition', query_condition query_body = { 'query': { 'filtered': { 'filter': { 'bool': { 'must': query_condition } } } }, 'size': MAX_SEARCH_SIZE } result = es.search(index=facebook_xnr_corpus_index_name, doc_type=facebook_xnr_corpus_index_type, body=query_body)['hits']['hits'] results = [] for item in result: item['_source']['id'] = item['_id'] results.append(item['_source']) return results
def export_group_info(domain_name, mail):
    # Export a domain group's full description to a JSON file and mail it.
    # Gathers domain statistics (get_show_domain_description), the domain
    # document itself, and each member's profile fields, writes everything
    # to EXAMPLE_MODEL_PATH/<pinyin>_<timestamp>.json and sends the file via
    # sendfile2mail. Returns True unless writing the file fails (a mail
    # failure is only printed).
    mark = True
    res = {
        'domain_name': domain_name,
        'members_num': 0,
        'create_info': {
            'submitter': '',
            'remark': '',
            'create_type': '',
            'create_time': '',
        },
        'members_uid': [],
        'members_info': {
            # 'uid1': {
            #     'nickname': '',
            #     'gender': '',
            #     'location': '',
            #     'link': '',
            # }
        },
        'count_info': {
            'location_count': {
                # 'zh_TW': 10,
                # 'us': 5
            },
            # 'gender_count': {
            #     # 'f': 0,
            #     # 'm': 40
            # },
            'role_count': {
                # 'role1': 12,
                # 'role2': 7
            },
            'words_preference': {
                # 'w1': 20,
                # 'w2': 10
            },
            'topic_preference': {
                # 't1': 20,
                # 't2': 10
            },
            'political_side': {},
        }
    }
    domain_pinyin = pinyin.get(domain_name, format='strip', delimiter='_')
    domain_details = get_show_domain_description(domain_name)
    res['count_info']['political_side'] = domain_details['political_side']
    res['count_info']['role_count'] = domain_details['role_distribute']
    res['count_info']['topic_preference'] = domain_details['topic_preference']
    res['count_info']['words_preference'] = domain_details['word_preference']
    res['members_num'] = domain_details['group_size']
    domain_info = es.get(index=tw_domain_index_name,
                         doc_type=tw_domain_index_type,
                         id=domain_pinyin)['_source']
    res['create_info']['remark'] = domain_info['remark']
    res['create_info']['submitter'] = domain_info['submitter']
    res['create_info']['create_type'] = domain_info['create_type']
    res['create_info']['create_time'] = ts2datetime_full(
        domain_info['create_time'])
    res['members_uid'] = domain_info['member_uids']
    query_body = {
        "query": {
            "bool": {
                "must": [
                    {
                        "terms": {
                            "uid": res['members_uid'],
                        }
                    },
                ]
            }
        },
        "size": 9999,
        "fields": ["locale", "link", "uid", "gender", "username"]
    }
    user_info = es.search(profile_index_name, profile_index_type,
                          query_body)['hits']['hits']
    members_info = {}
    gender_count = {}
    location_count = {}
    for user in user_info:
        item = user['fields']
        uid = item.get('uid', [''])[0]
        # gender = item.get('gender', [''])[0]
        # NOTE(review): the query requests field 'locale' but 'location' is
        # read here, so location is always '' — confirm whether this should
        # be item.get('locale', [''])[0]
        location = item.get('location', [''])[0]
        members_info[uid] = {
            'nickname': item.get('username', [''])[0],
            # 'gender': gender,
            'location': location,
            # NOTE(review): 'userscreenname' is not in the requested fields
            # list above, so this always yields the bare base URL — verify
            'link': 'https://twitter.com/' + item.get('userscreenname', [''])[0]
        }
        # if gender:
        #     if gender in gender_count:
        #         gender_count[gender] += 1
        #     else:
        #         gender_count[gender] = 1
        if location:
            if location in location_count:
                location_count[location] += 1
            else:
                location_count[location] = 1
    res['members_info'] = members_info
    res['count_info']['location_count'] = location_count
    res['count_info']['gender_count'] = gender_count
    export_filename = EXAMPLE_MODEL_PATH + domain_pinyin + '_' + ts2datetime_full(
        time.time()) + '.json'
    try:
        with open(export_filename, "w") as f:
            json.dump(res, f)
        try:
            sendfile2mail(mail, export_filename)
        except Exception, e:
            # mail failure is reported but does not flip the return value
            print e
    except:
        mark = False
    return mark