def get_attr_geo_track(uid_list):
    # date_results = [['2013-09-01', [(geo1, count1), (geo2, count2)]], ...] for the last 7 days
    date_results = []
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    # test
    now_date = '2013-09-08'
    ts = datetime2ts(now_date)
    for i in range(7, 0, -1):
        timestamp = ts - i * 24 * 3600
        # aggregate the per-user ip counters of this day
        ip_dict = dict()
        results = r_cluster.hmget('ip_' + str(timestamp), uid_list)
        for item in results:
            if item:
                item_dict = json.loads(item)
                for ip_item in item_dict:
                    try:
                        ip_dict[ip_item] += item_dict[ip_item]
                    except:
                        ip_dict[ip_item] = item_dict[ip_item]
        # map ip counters to geo locations and keep the top 2 for the day
        geo_dict = ip2geo(ip_dict)
        sort_geo_dict = sorted(geo_dict.items(), key=lambda x: x[1], reverse=True)
        date_key = ts2datetime(timestamp)
        date_results.append([date_key, sort_geo_dict[:2]])
    return {'geo_track': json.dumps(date_results)}
def get_activity_time(uid_list):
    results = {}
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    # run_type
    if RUN_TYPE == 1:
        timestamp = datetime2ts(now_date)
    else:
        timestamp = datetime2ts("2013-09-08")
    activity_list_dict = {}  # {uid: [activity_list], uid: []}
    for i in range(1, WEEK + 1):
        ts = timestamp - DAY * i
        if WORK_TYPE != 0:
            r_result = redis_activity.hmget('activity_' + str(ts), uid_list)
        else:
            r_result = []
            index_name = "activity_" + str(ts2datetime(ts))
            exist_bool = es_cluster.indices.exists(index=index_name)
            if exist_bool:
                es_results = es_cluster.mget(index=index_name, doc_type="activity",
                                             body={"ids": uid_list})["docs"]
                for item in es_results:
                    if item['found']:
                        r_result.append(item['_source']['activity_dict'])
                    else:
                        r_result.append(json.dumps({}))
            else:
                r_result = [json.dumps(dict())] * len(uid_list)
        if r_result:
            for j in range(0, len(uid_list)):
                uid = uid_list[j]
                if uid not in activity_list_dict:
                    activity_list_dict[uid] = [0 for i in range(0, 96)]
                user_r_result = r_result[j]
                if user_r_result:
                    user_activity_dict = json.loads(user_r_result)
                    for i in range(0, 96):
                        try:
                            count = user_activity_dict[str(i)]
                        except:
                            count = 0
                        activity_list_dict[uid].append(count)
    for uid in uid_list:
        activity_list = activity_list_dict[uid]
        statusnum = sum(activity_list)
        # locate the dominant positive frequency of the 15-minute activity series
        signal = np.array(activity_list)
        fftResult = np.abs(np.fft.fft(signal)) ** 2
        n = signal.size
        freq = np.fft.fftfreq(n, d=1)
        i = 0
        max_val = 0
        max_freq = 0
        for val in fftResult:
            if val > max_val and freq[i] > 0:
                max_val = val
                max_freq = freq[i]
            i += 1
        results[uid] = {'statusnum': statusnum, 'activity_time': math.log(max_freq + 1)}
    return results
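# A minimal, self-contained sketch (not part of the original module) of the peak-frequency idea
# used in get_activity_time() above: given a 96-slot daily activity histogram, take the FFT power
# spectrum and report the strongest strictly positive frequency. The names dominant_activity_freq
# and demo_slots are illustrative only.
import numpy as np

def dominant_activity_freq(activity_counts):
    """Return the positive frequency with the largest spectral power, or 0 if the series is flat."""
    signal = np.asarray(activity_counts, dtype=float)
    power = np.abs(np.fft.fft(signal)) ** 2
    freqs = np.fft.fftfreq(signal.size, d=1)
    mask = freqs > 0
    if not mask.any() or power[mask].max() == 0:
        return 0.0
    return float(freqs[mask][np.argmax(power[mask])])

# usage: a user active every 4th 15-minute slot shows a dominant frequency near 1/4
demo_slots = [1 if i % 4 == 0 else 0 for i in range(96)]
# dominant_activity_freq(demo_slots) -> 0.25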
def filter_activity(user_set):
    results = []
    # run_type
    if RUN_TYPE == 1:
        now_date = ts2datetime(time.time())
    else:
        now_date = RUN_TEST_TIME
    ts = datetime2ts(now_date) - DAY
    date = ts2datetime(ts)
    timestamp = datetime2ts(date)
    for user in user_set:
        over_count = 0
        for i in range(0, 7):
            ts = timestamp - DAY * i
            result = redis_activity.hget('activity_' + str(ts), str(user))
            if result:
                items_dict = json.loads(result)
                for item in items_dict:
                    weibo_count = items_dict[item]
                    if weibo_count > activity_threshold:
                        over_count += 1
        if over_count == 0:
            results.append(user)
        else:
            writer.writerow([user, 'activity'])
    return results
def filter_activity(user_set):
    results = []
    now_date = ts2datetime(time.time())
    # test
    now_date = '2013-09-08'
    ts = datetime2ts(now_date) - 24 * 3600
    date = ts2datetime(ts)
    timestamp = datetime2ts(date)
    for user in user_set:
        over_count = 0
        for i in range(0, 7):
            ts = timestamp - 3600 * 24 * i
            result = r_cluster.hget('activity_' + str(ts), str(user))
            if result:
                item_dict = json.loads(result)
                # only the busiest time slot of the day is checked against the threshold
                sorted_dict = sorted(item_dict.iteritems(), key=lambda asd: asd[1], reverse=True)
                if sorted_dict[0][1] > activity_threshold:
                    over_count = 1
        if over_count == 0:
            results.append(user)
        else:
            writer.writerow([user, 'activity'])
    print 'after filter activity: ', len(results)
    return results
def key_words_search(search_type, pre, during, start_time, keyword, search_key='',
                     sort_norm='', sort_scope='', time=1, isall=False):
    keywords = keyword.split(",")
    should = []
    for key in keywords:
        if search_type == "hashtag":
            should.append({"prefix": {"text.text": "#" + key + "#"}})
        else:
            should.append({"prefix": {"text.text": key}})
    date = start_time
    index_name = pre + start_time
    # skip forward until an existing daily index is found
    while not es_9206.indices.exists(index=index_name):
        new_time = datetime2ts(date) + DAY
        date = ts2datetime(new_time)
        index_name = pre + date
        during -= 1
    uid_set = set()
    for i in range(during):
        print index_name
        query = {"query": {"bool": {"must": [], "must_not": [], "should": should}},
                 "size": MAX_ITEMS, "sort": [], "facets": {}, "fields": ['uid']}
        try:
            temp = es_9206.search(index=index_name, doc_type='text', body=query)
            result = temp['hits']['hits']
            print "Fetch " + str(len(result))
            for item in result:
                uid_set.add(item['fields']['uid'][0].encode("utf-8"))
        except Exception, e:
            print e
            raise Exception('user_list failed!')
        # move on to the next day's index
        new_time = datetime2ts(date) + DAY
        date = ts2datetime(new_time)
        index_name = pre + date
def get_interval_count(topic, start_ts, end_ts):
    results = [0]
    ts_list = []
    #unit = 900
    #during = Day
    during = interval_count_during
    start_ts = datetime2ts(ts2datetime(start_ts))
    ts_list.append(start_ts)
    #end_ts = datetime2ts(ts2datetime(end_ts))
    # round end_ts up to a whole day when it is not day-aligned
    print 'before deal end_ts:', ts2date(end_ts)
    if end_ts - datetime2ts(ts2datetime(end_ts)) != 0:
        end_ts = datetime2ts(ts2datetime(end_ts)) + 3600 * 24
    print 'get_interval_count start_ts:', ts2date(start_ts)
    print 'get_interval_count end_ts:', ts2date(end_ts)
    windowsize = (end_ts - start_ts) / Day
    interval = (end_ts - start_ts) / During
    for i in range(interval, 0, -1):
        begin_ts = end_ts - during * i
        over_ts = begin_ts + during
        ts_list.append(over_ts)
        items = db.session.query(PropagateCountNews).filter(
            PropagateCountNews.topic == topic,
            PropagateCountNews.end <= over_ts,
            PropagateCountNews.end > begin_ts,
            PropagateCountNews.range == unit).all()
        if items:
            result = Merge_propagate(items)
        else:
            result = 0
        results.append(float(result))
    return ts_list, results
def search_weibo_task(user_name):
    c_result = {}
    query = {"query": {"bool": {"must": [{"term": {"user_rank_task.submit_user": user_name}}]}},
             "size": MAX_ITEMS,
             "sort": [{"create_time": {"order": "desc"}}],
             "fields": ["status", "search_type", "keyword", "submit_user", "sort_scope", "sort_norm",
                        "start_time", "user_ts", "end_time", "create_time", "number"]}
    return_list = []
    result = es.search(index=WEIBO_RANK_KEYWORD_TASK_INDEX,
                       doc_type=WEIBO_RANK_KEYWORD_TASK_TYPE,
                       body=query)['hits']
    c_result['flag'] = True
    for item in result['hits']:
        result_temp = {}
        result_temp['submit_user'] = item['fields']['submit_user'][0]
        result_temp['search_type'] = item['fields']['search_type'][0]
        result_temp['keyword'] = json.loads(item['fields']['keyword'][0])
        result_temp['sort_scope'] = item['fields']['sort_scope'][0]
        result_temp['sort_norm'] = item['fields']['sort_norm'][0]
        result_temp['start_time'] = ts2datetime(item['fields']['start_time'][0])
        result_temp['end_time'] = ts2datetime(item['fields']['end_time'][0])
        result_temp['status'] = item['fields']['status'][0]
        result_temp['create_time'] = ts2date(item['fields']['create_time'][0])
        result_temp['search_id'] = item['fields']['user_ts'][0]
        tmp = item['fields'].get('number', 0)
        if tmp:
            result_temp['number'] = int(tmp[0])
        else:
            result_temp['number'] = 100
        return_list.append(result_temp)
    c_result['data'] = return_list
    return c_result
def main():
    RUN_TYPE = 0
    if RUN_TYPE == 1:
        now_ts = time.time()
    else:
        now_ts = datetime2ts(RUN_TEST_TIME)
        now_ts = datetime2ts('2013-09-02')
    date = ts2datetime(now_ts - DAY)
    # auto recommendation: step 1-5
    # step1: read from top es_daily_rank
    top_user_set, user_dict = search_from_es(date)
    # step2: filter black_uid
    black_user_set = read_black_user()
    subtract_user_set = top_user_set - black_user_set
    # step3: filter users that have already been taken in
    subtract_user_set = list(subtract_user_set)
    candidate_results = filter_in(subtract_user_set)
    # step4: filter rules about ip count & reposts/bereposts count & activity count
    results = filter_rules(candidate_results)
    # step5: get sensitive users; keep influence users only
    sensitive_user = list(get_sensitive_user(date))
    results = results - set(sensitive_user)  # influence user - sensitive user
    new_date = ts2datetime(now_ts)
    hashname_influence = "recomment_" + new_date + "_influence"
    if results:
        for uid in results:
            r.hset(hashname_influence, uid, "0")
    hashname_sensitive = "recomment_" + new_date + "_sensitive"
    if sensitive_user:
        for uid in sensitive_user:
            r.hset(hashname_sensitive, uid, "0")
def query_hot_weibo(ts, origin_mid_list, time_segment): query_all_body = { "query": { "filtered": { "filter": { "bool": { "must": [ {"range": { "timestamp":{ "gte": ts - time_segment, "lt": ts } }}, {"terms":{"root_mid":origin_mid_list}} ] } } } }, "aggs":{ "all_mid":{ "terms":{"field": "root_mid", "size":400}, "aggs":{ "message_type":{ "terms":{ "field":"message_type" } } } } } } return_results = dict() datetime_1 = ts2datetime(ts) datetime_2 = ts2datetime(ts-24*3600) index_name_1 = flow_text_index_name_pre + datetime_1 index_name_2 = flow_text_index_name_pre + datetime_2 index_list = [] exist_es_1 = es_text.indices.exists(index_name_1) exist_es_2 = es_text.indices.exists(index_name_2) if exist_es_1: index_list.append(index_name_1) if exist_es_2: index_list.append(index_name_2) if index_list: results = es_text.search(index=index_list, doc_type=flow_text_index_type,body=query_all_body)['aggregations']['all_mid']['buckets'] if results: for item in results: temp_dict = dict() temp_dict[item['key']] = item['doc_count'] detail = item['message_type']['buckets'] detail_dict = dict() for iter_item in detail: detail_dict[iter_item['key']] = iter_item['doc_count'] temp_dict['retweeted'] = detail_dict.get(3, 0) temp_dict['comment'] = detail_dict.get(2, 0) return_results[item['key']] = temp_dict return return_results
def filter_activity(user_set):
    results = []
    now_date = ts2datetime(time.time())
    # test
    now_date = '2013-09-08'
    ts = datetime2ts(now_date) - 24 * 3600
    date = ts2datetime(ts)
    timestamp = datetime2ts(date)
    for user in user_set:
        over_count = 0
        for i in range(0, 7):
            ts = timestamp - 3600 * 24 * i
            result = r_cluster.hget('activity_' + str(ts), str(user))
            if result:
                items_dict = json.loads(result)
                for item in items_dict:
                    weibo_count = items_dict[item]
                    if weibo_count > activity_threshold:
                        over_count += 1
        if over_count == 0:
            results.append(user)
        else:
            writer.writerow([user, 'activity'])
    print 'after filter activity:', len(results)
    return results
def key_words_search(pre, time, start_time, keyword, type='in'):
    date = start_time
    index_name = pre + start_time
    # skip forward until an existing daily index is found
    while not es.indices.exists(index=index_name):
        time = datetime2ts(date) + DAY
        date = ts2datetime(time)
        index_name = pre + date
        time -= 1
    uid_set = set()
    for i in range(time):
        print index_name
        query = {"query": {"bool": {"must": [{"prefix": {"text.text": keyword}}],
                                    "must_not": [], "should": []}},
                 "size": MAX_ITEMS, "sort": [], "facets": {}, "fields": ['uid']}
        try:
            temp = es.search(index=index_name, doc_type='text', body=query)
            result = temp['hits']['hits']
            print "Fetch " + str(len(result))
            for item in result:
                uid_set.add(item['fields']['uid'][0].encode("utf-8"))
        except Exception, e:
            print e
            raise Exception('user_list failed!')
        # move on to the next day's index
        time = datetime2ts(date) + DAY
        date = ts2datetime(time)
        index_name = pre + date
def read_flow_text(uid_list):
    '''
    Read user weibo texts (results carry no sentiment label):
    input:  uid_list (list of uid strings)
    output: word_dict (per-user word-frequency dict), weibo_list (list of user weibo records)
    word_dict example:  {uid1: {'w1': f1, 'w2': f2, ...}, ...}
    weibo_list example: [[uid1, text1, ts1], [uid2, text2, ts2], ...]  (each record: uid, text, timestamp)
    '''
    word_dict = dict()        # word-frequency dict
    weibo_list = []           # weibo list
    online_pattern_dict = {}  # {uid:[online_pattern1, ..],...}
    now_ts = time.time()
    # run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts(RUN_TEST_TIME)
    start_date_ts = now_date_ts - DAY * WEEK
    for i in range(0, WEEK):
        iter_date_ts = start_date_ts + DAY * i
        flow_text_index_date = ts2datetime(iter_date_ts)
        flow_text_index_name = flow_text_index_name_pre + flow_text_index_date
        print flow_text_index_name
        try:
            flow_text_exist = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\
                    body={'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}}, 'size': MAX_VALUE},
                    _source=False, fields=['text', 'uid', 'keywords_dict', 'timestamp'])['hits']['hits']
        except:
            flow_text_exist = []
        for flow_text_item in flow_text_exist:
            uid = flow_text_item['fields']['uid'][0].encode('utf-8')
            text = flow_text_item['fields']['text'][0].encode('utf-8')
            ts = flow_text_item['fields']['timestamp'][0]
            keywords_dict = json.loads(flow_text_item['fields']['keywords_dict'][0])
            keywords_dict = json.dumps(keywords_dict, encoding="UTF-8", ensure_ascii=False)
            keywords_dict = eval(keywords_dict)
            # merge this weibo's keyword counts into the user's running word-frequency dict
            if word_dict.has_key(uid):
                item_dict = Counter(word_dict[uid])
                keywords_dict = Counter(keywords_dict)
                item_dict = dict(item_dict + keywords_dict)
                word_dict[uid] = item_dict
            else:
                word_dict[uid] = keywords_dict
            weibo_list.append([uid, text, ts])
            # test online pattern
            online_pattern = u'weibo.com'
            try:
                user_online_pattern_dict = online_pattern_dict[uid]
            except:
                online_pattern_dict[uid] = {}
            try:
                online_pattern_dict[uid][online_pattern] += 1
            except:
                online_pattern_dict[uid][online_pattern] = 1
    return word_dict, weibo_list, online_pattern_dict, start_date_ts
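# Illustrative usage sketch (not in the original module) for read_flow_text() above; the uids are
# hypothetical and the ES flow_text indices for the configured test window are assumed to exist.
def _demo_read_flow_text():
    demo_uids = ['1111111111', '2222222222']  # made-up uid strings
    word_dict, weibo_list, online_pattern_dict, start_date_ts = read_flow_text(demo_uids)
    # word_dict  -> {'1111111111': {u'some_keyword': 3, ...}, ...}
    # weibo_list -> [['1111111111', 'weibo text ...', 1378111111], ...]
    for uid, text, ts in weibo_list[:5]:
        print uid, ts2datetime(ts), text[:30]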
def main(): scan_cursor = 0 count = 0 bulk_action = [] number = r.scard('user_set') print number if RUN_TYPE: ts = time.time() - DAY date = ts2datetime(ts) start_time = str(ts2datetime(time.time())) print "/cron/push_mid2redis.py&start&%s" %start_time else: date = '2013-09-05' index_name = flow_text_index_name_pre+date print index_name ts = time.time() while 1: re_scan = r.sscan("user_set", scan_cursor, count=3000) scan_cursor = re_scan[0] uid_list = re_scan[1] #具体数据 if len(uid_list): for uid in uid_list: detail_dict = r.hgetall(uid) for k,v in detail_dict.iteritems(): update_dict = dict() if "_origin_weibo_retweeted" in k and v: mid = k.split('_')[0] update_dict["retweeted"] = int(v) elif "_origin_weibo_comment" in k and v: mid = k.split('_')[0] update_dict["comment"] = int(v) elif '_retweeted_weibo_comment' in k and v: mid = k.split('_')[0] update_dict["comment"] = int(v) elif '_retweeted_weibo_retweeted' in k and v: mid = k.split('_')[0] update_dict["retweeted"] = int(v) else: pass if update_dict: action = {"update": {"_id": mid}} xdata = {"doc": update_dict} bulk_action.extend([action, xdata]) count += 1 if count % 400 == 0: r_flow.lpush('update_mid_list', json.dumps(bulk_action)) bulk_action = [] tp = time.time() #print "%s cost %s" %(count, tp-ts) ts = tp if int(scan_cursor) == 0: break if bulk_action: r_flow.lpush('update_mid_list', json.dumps(bulk_action)) print count
def aggregation_sentiment_related_weibo(ts, origin_mid_list, time_segment, message_type=1, uid_list=[]): if message_type == 1: query_all_body = { "query": { "filtered": { "filter": { "bool": { "must": [ {"range": {"timestamp": {"gte": ts - time_segment, "lt": ts}}}, {"terms": {"root_mid": origin_mid_list}}, ] } } } }, "aggs": {"all_sentiments": {"terms": {"field": "sentiment"}}}, } else: query_all_body = { "query": { "filtered": { "filter": { "bool": { "must": [ {"range": {"timestamp": {"gte": ts - time_segment, "lt": ts}}}, {"terms": {"root_mid": origin_mid_list}}, {"terms": {"directed_uid": uid_list}}, ] } } } }, "aggs": {"all_sentiments": {"terms": {"field": "sentiment"}}}, } results = {"0": 0, "1": 0, "2": 0, "3": 0, "4": 0, "5": 0, "6": 0} datetime_1 = ts2datetime(ts) datetime_2 = ts2datetime(ts - 24 * 3600) index_name_1 = flow_text_index_name_pre + datetime_1 index_name_2 = flow_text_index_name_pre + datetime_2 index_list = [] exist_es_1 = es_text.indices.exists(index_name_1) exist_es_2 = es_text.indices.exists(index_name_2) if exist_es_1: index_list.append(index_name_1) if exist_es_2: index_list.append(index_name_2) if index_list: search_results = es_text.search(index=index_list, doc_type=flow_text_index_type, body=query_all_body)[ "aggregations" ]["all_sentiments"]["buckets"] if search_results: for item in search_results: key = item["key"] count = item["doc_count"] results[key] = count print "results: ", results, sum(results.values()) return results
def main():
    now_ts = time.time()
    delete_ts = datetime2ts(ts2datetime(now_ts - EXPIRE_TIME))  # timestamp of the day to delete
    delete_date = ts2datetime(now_ts - EXPIRE_TIME)
    del_day = ts2datetime(now_ts - MONTH_TIME)
    index_name = flow_text_index_name_pre + del_day
    exist_es = es_flow_text.indices.exists(index=index_name)
    if exist_es:
        es_flow_text.indices.delete(index=index_name)
    index_bci = "bci_" + del_day.replace('-', '')
    exist_bci = ES_CLUSTER_FLOW1.indices.exists(index=index_bci)
    if exist_bci:
        ES_CLUSTER_FLOW1.indices.delete(index=index_bci)
    # delete @
    redis_cluster.delete("at_" + str(delete_ts))
    redis_cluster.delete("sensitive_at_" + str(delete_ts))
    # delete ip
    redis_ip.delete('ip_' + str(delete_ts))
    if WORK_TYPE == 0:
        exist_ip = es_cluster.indices.exists(index="ip_" + delete_date)
        if exist_ip:
            es_cluster.indices.delete(index="ip_" + delete_date)
    redis_ip.delete('sensitive_ip_' + str(delete_ts))
    if WORK_TYPE == 0:
        exist_ip = es_cluster.indices.exists(index="sensitive_ip_" + delete_date)
        if exist_ip:
            es_cluster.indices.delete(index="sensitive_ip_" + delete_date)
    # delete activity
    redis_activity.delete('activity_' + str(delete_ts))
    if WORK_TYPE == 0:
        exist_activity = es_cluster.indices.exists(index="activity_" + delete_date)
        if exist_activity:
            es_cluster.indices.delete(index="activity_" + delete_date)
    redis_activity.delete('sensitive_activity_' + str(delete_ts))
    if WORK_TYPE == 0:
        exist_activity = es_cluster.indices.exists(index="sensitive_activity_" + delete_date)
        if exist_activity:
            es_cluster.indices.delete(index="sensitive_activity_" + delete_date)
    # delete hashtag
    redis_cluster.delete('hashtag_' + str(delete_ts))
    redis_cluster.delete('sensitive_hashtag_' + str(delete_ts))
    # delete sensitive words
    redis_cluster.delete('sensitive_' + str(delete_ts))
    # delete recommendation
    r.delete('recomment_' + str(delete_date) + "_influence")
    r.delete('recomment_' + str(delete_date) + "_sensitive")
    r.delete("identify_in_sensitive_" + str(delete_date))
    r.delete("identify_in_influence_" + str(delete_date))
def query_hot_weibo(ts, origin_mid_list, time_segment, keywords_list, aggregation_field="root_mid", size=100):
    query_all_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"range": {
                                "timestamp": {
                                    "gte": ts - time_segment,
                                    "lt": ts
                                }
                            }}
                        ],
                        "should": [
                            {"terms": {
                                "keywords_string": keywords_list
                            }}
                        ]
                    }
                }
            }
        },
        "aggs": {
            "all_count": {
                "terms": {"field": aggregation_field, "size": size}
            }
        }
    }
    datetime = ts2datetime(ts)
    # test
    #datetime = "2013-09-07"
    hot_mid_dict = dict()
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if origin_mid_list and exist_es:
        query_all_body["query"]["filtered"]["filter"]["bool"]["should"].append({"terms": {"root_mid": origin_mid_list}})
        query_all_body["query"]["filtered"]["filter"]["bool"]["should"].append({"terms": {"mid": origin_mid_list}})
        results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_all_body)['aggregations']['all_count']['buckets']
        if results:
            for item in results:
                hot_mid_dict[item['key']] = item['doc_count']
    # the window may start on the previous day: query that day's index as well
    datetime_1 = ts2datetime(ts - time_segment)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)
    if datetime_1 != datetime and exist_es_1:
        query_all_body["query"]["filtered"]["filter"]["bool"]["should"].append({"terms": {"root_mid": origin_mid_list}})
        query_all_body["query"]["filtered"]["filter"]["bool"]["should"].append({"terms": {"mid": origin_mid_list}})
        results_1 = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=query_all_body)['aggregations']['all_count']['buckets']
        if results_1:
            for item in results_1:
                hot_mid_dict[item['key']] = item['doc_count']
    return hot_mid_dict
def update_his_item(history_item, today_bci, today_date):
    last_day = ts2datetime(today_date - DAY)
    # warehousing time is stored as yyyy-mm-dd
    warehousing_time = history_item['_source']['warehousing_time']
    # number of days since warehousing
    day = int((today_date - datetime2ts(warehousing_time)) / DAY + 1)
    item = history_item['_source']
    try:
        item['bci_day_change'] = today_bci - item['bci_' + ts2datetime(today_date - 2 * DAY)]
    except Exception, e:
        print history_item['_id'] + ":" + e.message
def sort_task(user, keyword, status, start_time, end_time, submit_time): query_body = { "query":{ "filtered":{ "filter":{ "bool":{ "must":[ {"term":{"submit_user": user}} ] } } } }, "size": 10000, "sort":{"submit_time":{"order":"desc"}} } query_list = [] if keyword: keyword_list = keyword.split(',') query_list.append({"terms":{"keyword_string":keyword_list}}) if status != 2: query_list.append({"term":{"status": status}}) if start_time and end_time: start_ts = datetime2ts(start_time) end_ts = datetime2ts(end_time) query_list.append({"range":{"start_time":{"gte":start_ts, "lte":end_ts}}}) query_list.append({"range":{"end_time":{"gte":start_ts, "lte":end_ts}}}) if submit_time: query_list.append({"term":{"submit_time": submit_time}}) if query_list: query_body["query"]["filtered"]["filter"]["bool"]["must"].extend(query_list) #print query_body search_results = es.search(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE, body=query_body)["hits"]["hits"] results = [] if search_results: for item in search_results: iter_item = item['_source'] tmp = [] tmp.append(iter_item['search_type']) tmp.append(json.loads(iter_item['keyword'])) tmp.append(ts2datetime(iter_item['start_time'])) tmp.append(ts2datetime(iter_item['end_time'])) tmp.append(iter_item['range']) tmp.append(ts2date(iter_item['create_time'])) tmp.append(iter_item['status']) tmp.append(iter_item['sort_norm']) tmp.append(iter_item['sort_scope']) tmp.append(item['_id']) # task_name results.append(tmp) return results
def query_related_weibo(ts, origin_mid_list, time_segment): query_all_body = { "query": { "filtered": { "filter": { "bool": { "must": [ {"range": { "timestamp":{ "gte": ts - time_segment, "lt": ts } }}, {"terms":{"root_mid":origin_mid_list}} ] } } } }, "aggs":{ "all_count":{ "terms":{"field": "message_type"} } } } return_results = {"origin": 0, "retweeted": 0, "comment": 0} datetime_1 = ts2datetime(ts) datetime_2 = ts2datetime(ts-24*3600) index_name_1 = flow_text_index_name_pre + datetime_1 index_name_2 = flow_text_index_name_pre + datetime_2 index_list = [] exist_es_1 = es_text.indices.exists(index_name_1) exist_es_2 = es_text.indices.exists(index_name_2) if exist_es_1: index_list.append(index_name_1) if exist_es_2: index_list.append(index_name_2) if index_list: results = es_text.search(index=index_list, doc_type=flow_text_index_type,body=query_all_body)['aggregations']['all_count']['buckets'] if results: for item in results: if int(item['key']) == 1: return_results['origin'] = item['doc_count'] elif int(item['key']) == 3: return_results['retweeted'] = item['doc_count'] elif int(item['key']) == 2: return_results['comment'] = item['doc_count'] else: pass return_results['total_count'] = sum(return_results.values()) print "return_results: ", return_results return return_results
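# A small illustrative helper (not in the original source) for the pattern repeated in
# query_related_weibo(), query_hot_weibo() and query_mid_list(): the query window may straddle
# midnight, so both today's and yesterday's flow_text indices are tried and only the ones that
# actually exist are searched. Assumes the same es_text client and flow_text_index_name_pre
# prefix used above.
def existing_flow_text_indices(ts):
    index_list = []
    for day_ts in (ts, ts - 24 * 3600):
        index_name = flow_text_index_name_pre + ts2datetime(day_ts)
        if es_text.indices.exists(index_name) and index_name not in index_list:
            index_list.append(index_name)
    return index_list

# usage: es_text.search(index=existing_flow_text_indices(ts), doc_type=flow_text_index_type, body=query_body)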
def main():
    filter_uid = all_delete_uid()
    #record_time = time.strftime("%Y%m%d", time.localtime(time.time()))
    record_time = ts2datetime(time.time())              # e.g. 2013-09-08
    former_time = ts2datetime(time.time() - 24 * 3600)  # e.g. 2013-09-07
    recommend_list = search_low_number(threshould)  # recommended users to delete from the portrait database
    print len(recommend_list)
    recommend_list = list(set(recommend_list).difference(filter_uid))
    # today's recommended removal list
    recommend_redis.hset("recommend_delete_list", record_time, json.dumps(recommend_list))
    # drop yesterday's list: only one recommended removal list is kept
    recommend_redis.hdel("recommend_delete_list", former_time)
    return 1
def get_influence(uid_list):
    result = {}
    now_ts = time.time()
    # run_type
    if RUN_TYPE == 1:
        now_date = ts2datetime(now_ts - DAY)
    else:
        now_date = ts2datetime(datetime2ts(RUN_TEST_TIME) - DAY)
    index_time = 'bci_' + ''.join(now_date.split('-'))
    index_type = 'bci'
    try:
        es_result = es.mget(index=index_time, doc_type=index_type, body={'ids': uid_list})['docs']
    except Exception, e:
        raise e
def getResult(search_id):
    item = es.get(index=WEIBO_RANK_KEYWORD_TASK_INDEX, doc_type=WEIBO_RANK_KEYWORD_TASK_TYPE, id=search_id)
    try:
        result_obj = {}
        result_obj['keyword'] = json.loads(item['_source']['keyword'])
        result_obj['sort_scope'] = item['_source']['sort_scope']
        result_obj['sort_norm'] = item['_source']['sort_norm']
        result_obj['start_time'] = ts2datetime(item['_source']['start_time'])
        result_obj['end_time'] = ts2datetime(item['_source']['end_time'])
        result_obj['result'] = json.loads(item['_source']['result'])
        result_obj['text_results'] = json.loads(item['_source']['text_results'])
        result_obj['number'] = item['_source']['number']
        return result_obj
    except:
        return []
def query_mid_list(ts, social_sensors, time_segment, message_type=1): query_body = { "query": { "filtered": { "filter": { "bool": { "must":[ {"range": { "timestamp": { "gte": ts - time_segment, "lt": ts } }}, {"terms":{"uid": social_sensors}}, {"term":{"message_type": message_type}} ] } } } }, "sort": {"sentiment": {"order": "desc"}}, "size": 10000 } datetime_1 = ts2datetime(ts) datetime_2 = ts2datetime(ts-24*3600) index_name_1 = flow_text_index_name_pre + datetime_1 index_name_2 = flow_text_index_name_pre + datetime_2 index_list = [] exist_es_1 = es_text.indices.exists(index_name_1) exist_es_2 = es_text.indices.exists(index_name_2) if exist_es_1: index_list.append(index_name_1) if exist_es_2: index_list.append(index_name_2) if index_list: search_results = es_text.search(index=index_list, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"] else: search_results = [] origin_mid_list = set() if search_results: for item in search_results: if message_type == 1: origin_mid_list.add(item["_id"]) else: origin_mid_list.add(item['_source']['root_mid']) return list(origin_mid_list)
def scan_mapper(pre, sen_pre, r):
    if RUN_TYPE:
        ts = datetime2ts(ts2datetime(time.time() - DAY))
    else:
        ts = datetime2ts('2013-09-01')
    ts = str(ts)
    hash_name = pre + ts
    sen_hash_name = sen_pre + ts
    cursor = 0
    count = 0
    tb = time.time()
    while 1:
        re_scan = r.hscan(hash_name, cursor, count=1000)
        cursor = re_scan[0]
        ip_dict = re_scan[1]
        uid_list = ip_dict.keys()
        if uid_list:
            r.lpush('act_uid_list', json.dumps(uid_list))
            count += len(uid_list)
            ts = time.time()
            print '%s : %s' % (count, ts - tb)
            tb = ts
        if cursor == 0:
            print count
            break
def get_influence(uid):
    result = 0
    now_ts = time.time()
    now_date = ts2datetime(now_ts - 3600 * 24)
    # test
    now_date = '2013-09-07'
    index_time = ''.join(now_date.split('-'))
    index_type = 'bci'
    try:
        result = es.get(index=index_time, id=uid, doc_type=index_type)['_source']['user_index']
        '''
        query_body = {
            'query':{
                'filtered':{
                    'query':{
                        'match_all':{}
                    },
                    'filter':{
                        'range':{
                            'user_index':{
                                'gte':result
                            }
                        }
                    }
                }
            }
        }
        rank = es.count(index=index_time, doc_type=index_type, body=query_body)['count']
        #print 'rank:', rank
        '''
    except:
        return 0
    return result
def update_day_sensitive(uid_list):
    results = {}
    for uid in uid_list:
        results[uid] = {"sensitive": 0, "sensitive_string": "", "sensitive_dict": json.dumps({})}
    all_results = {}
    now_ts = time.time()
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts("2013-09-03")
    today_sensitive_dict = {}
    # hmget returns one JSON string (or None) per uid, in uid_list order
    sensitive_results = redis_cluster.hmget("sensitive_" + str(now_date_ts), uid_list)
    count = 0
    for item in sensitive_results:
        uid = uid_list[count]
        count += 1
        if not item:
            continue
        words_dict = json.loads(item)
        sensitive_index = 0
        # weight each sensitive word by its configured stage score
        for word, word_count in words_dict.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", word)
            if tmp_stage:
                tmp = json.loads(tmp_stage)
                sensitive_index += sensitive_score_dict[tmp[0]] * word_count
        sensitive_words_string = "&".join(words_dict.keys())
        results[uid] = {
            "sensitive": sensitive_index,
            "sensitive_words_string": sensitive_words_string,
            "sensitive_words_dict": json.dumps(words_dict),
        }
    return results
def update_day_hashtag(uid_list):
    results = {}
    all_results = {}
    now_ts = time.time()
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts("2013-09-02")
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        count = 0
        hashtag_results = redis_cluster.hmget("hashtag_" + str(ts), uid_list)
        for uid in uid_list:
            if uid not in results:
                results[uid] = {}
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                hashtag_dict = json.loads(hashtag_item)
            else:
                hashtag_dict = {}
            for hashtag in hashtag_dict:
                try:
                    results[uid][hashtag] += 1
                except:
                    results[uid][hashtag] = 1
            count += 1
    for uid in uid_list:
        user_hashtag_dict = results[uid]
        hashtag_string = "&".join(user_hashtag_dict.keys())
        all_results[uid] = {"hashtag_string": hashtag_string, "hashtag_dict": json.dumps(user_hashtag_dict)}
    return all_results
def update_day_sensitive(uid_list):
    results = {}
    count = 0
    for uid in uid_list:
        results[uid] = {"sensitive": 0, 'sensitive_string': "", 'sensitive_dict': json.dumps({})}
    all_results = {}
    now_ts = time.time()
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts('2013-09-02')
    today_sensitive_dict = {}
    # hmget returns one JSON string (or None) per uid, in uid_list order
    sensitive_results = redis_cluster.hmget("sensitive_" + str(now_date_ts), uid_list)
    for item in sensitive_results:
        if not item:
            count += 1
            continue
        uid = uid_list[count]
        item = json.loads(item)
        sensitive_index = 0
        sensitive_words_dict = {}
        # weight each sensitive word by its configured stage score
        for word, word_count in item.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", word)
            if tmp_stage:
                tmp = json.loads(tmp_stage)
                sensitive_index += sensitive_score_dict[str(tmp[0])] * word_count
        sensitive_words_string = "&".join(item.keys())
        results[uid] = {'sensitive': sensitive_index,
                        "sensitive_words_string": sensitive_words_string,
                        "sensitive_words_dict": item}
        count += 1
    return results
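# A minimal, standalone sketch (not part of the original module) of the weighting scheme that
# update_day_sensitive() applies above: each sensitive word contributes stage_score * count to the
# user's sensitive index. The score table and word counts below are made up; in the real code the
# stage scores come from sensitive_score_dict and the per-word stage is looked up in the
# "sensitive_words" redis hash.
def compute_sensitive_index(word_counts, stage_of_word, stage_scores):
    """word_counts: {word: count}; stage_of_word: {word: stage}; stage_scores: {stage: score}."""
    index = 0
    for word, count in word_counts.iteritems():
        stage = stage_of_word.get(word)
        if stage is not None:
            index += stage_scores.get(stage, 0) * count
    return index

# usage with hypothetical data:
# compute_sensitive_index({u'word_a': 2, u'word_b': 1},
#                         {u'word_a': '1', u'word_b': '3'},
#                         {'1': 1, '2': 2, '3': 3})  -> 2*1 + 1*3 = 5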
def get_importance(uid, domain, topic):
    result = 0
    domain_result = 0
    domain_list = domain.split(' ')
    for domain in domain_list:
        try:
            domain_result += domain_weight_dict[domain]
        except:
            pass
    topic_result = 0
    topic_list = topic.split(' ')
    for topic in topic_list:
        try:
            topic_result += topic_weight_dict[topic]
        except:
            pass
    # get fansnum, origin_weibo_retweeted_total_number, retweeted_weibo_retweeted_total_number
    now_ts = time.time()
    date = ts2datetime(now_ts - 3600 * 24)
    # test
    date = '2013-09-07'
    index_time = ''.join(date.split('-'))
    index_type = 'bci'
    try:
        es_result = es.get(index=index_time, doc_type=index_type, id=uid)['_source']
        fansnum = es_result['user_fansnum']
        retweetednum = es_result['origin_weibo_retweeted_total_number'] + es_result['retweeted_weibo_retweeted_total_number']
        result = importance_weight_dict['fansnum'] * fansnum + \
                 importance_weight_dict['retweeted_num'] * retweetednum + \
                 importance_weight_dict['domain'] * domain_result + \
                 importance_weight_dict['topic'] * topic_result
        return result
    except:
        return 0
def get_important_user(ts, origin_mid_list, time_segment): query_all_body = { "query": { "filtered": { "filter": { "bool": { "must": [{"range": {"timestamp": {"gte": ts - time_segment, "lt": ts}}}], "should": [{"terms": {"root_mid": origin_mid_list}}, {"terms": {"mid": origin_mid_list}}], } } } }, "sort": {"user_fansnum": {"order": "desc"}}, "size": 1000, } datetime = ts2datetime(ts - time_segment) index_name = flow_text_index_name_pre + datetime exist_es = es_text.indices.exists(index_name) results = [] if origin_mid_list and exist_es: search_results = es_text.search( index=index_name, doc_type=flow_text_index_type, body=query_all_body, _source=False )["hits"]["hits"] if search_results: for item in search_results: results.append(item["_id"]) return results
        user_list.append(item['_id'])
    return user_list


def main():
    filter_uid = all_delete_uid()
    #record_time = time.strftime("%Y%m%d", time.localtime(time.time()))
    record_time = ts2datetime(time.time())              # e.g. 2013-09-08
    former_time = ts2datetime(time.time() - 24 * 3600)  # e.g. 2013-09-07
    recommend_list = search_low_number(threshould)  # recommended users to delete from the portrait database
    print len(recommend_list)
    recommend_list = list(set(recommend_list).difference(filter_uid))
    # today's recommended removal list
    recommend_redis.hset("recommend_delete_list", record_time, json.dumps(recommend_list))
    # drop yesterday's list: only one recommended removal list is kept
    recommend_redis.hdel("recommend_delete_list", former_time)
    return 1


if __name__ == "__main__":
    current_path = os.getcwd()
    file_path = os.path.join(current_path, 'recommend_to_delete.py')
    now_ts = ts2datetime(time.time())
    print_log = "&".join([file_path, "start", now_ts])
    print print_log  # log the start of the run
    main()
    now_ts = ts2datetime(time.time())
    print_log = "&".join([file_path, "end", now_ts])
    print print_log  # log the end of the run
                    },
                }
            }
        }
    }
    #current_time = time.time()
    #facebook_feedback_friends_index_name = facebook_feedback_friends_index_name_pre + ts2datetime(current_time)
    if not es.indices.exists(index=facebook_feedback_friends_index_name):
        es.indices.create(index=facebook_feedback_friends_index_name, body=index_info, ignore=400)


if __name__ == '__main__':
    current_time = time.time()
    date = ts2datetime(current_time + 24 * 3600)
    facebook_feedback_like_mappings(facebook_feedback_like_index_name_pre + date)
    facebook_feedback_retweet_mappings(facebook_feedback_retweet_index_name_pre + date)
    facebook_feedback_at_mappings(facebook_feedback_at_index_name_pre + date)
    facebook_feedback_comment_mappings(facebook_feedback_comment_index_name_pre + date)
    facebook_feedback_private_mappings(facebook_feedback_private_index_name_pre + date)
    facebook_feedback_friends_mappings()
def read_flow_text(uid_list): ''' 读取用户微博(返回结果没有微博情绪标签): 输入数据:uid_list(字符串型列表) 输出数据:word_dict(用户分词结果字典),weibo_list(用户微博列表) word_dict示例:{uid1:{'w1':f1,'w2':f2...}...} weibo_list示例:[[uid1,text1,ts1],[uid2,text2,ts2],...](每一条记录对应三个值:uid、text、timestamp) ''' word_dict = dict() #词频字典 weibo_list = [] #微博列表 online_pattern_dict = {} # {uid:[online_pattern1, ..],...} now_ts = time.time() #run_type if RUN_TYPE == 1: now_date_ts = datetime2ts(ts2datetime(now_ts)) else: now_date_ts = datetime2ts(RUN_TEST_TIME) start_date_ts = now_date_ts - DAY * WEEK for i in range(0, WEEK): iter_date_ts = start_date_ts + DAY * i flow_text_index_date = ts2datetime(iter_date_ts) flow_text_index_name = flow_text_index_name_pre + flow_text_index_date print flow_text_index_name try: flow_text_exist = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\ body={'query':{'filtered':{'filter':{'terms':{'uid': uid_list}}}}, 'size': MAX_VALUE}, _source=False, fields=['text','uid','keywords_dict','timestamp'])['hits']['hits'] except: flow_text_exist = [] for flow_text_item in flow_text_exist: uid = flow_text_item['fields']['uid'][0].encode('utf-8') text = flow_text_item['fields']['text'][0].encode('utf-8') ts = flow_text_item['fields']['timestamp'][0] keywords_dict = json.loads( flow_text_item['fields']['keywords_dict'][0]) keywords_dict = json.dumps(keywords_dict, encoding="UTF-8", ensure_ascii=False) keywords_dict = eval(keywords_dict) if word_dict.has_key(uid): item_dict = Counter(word_dict[uid]) keywords_dict = Counter(keywords_dict) item_dict = dict(item_dict + keywords_dict) word_dict[uid] = item_dict else: word_dict[uid] = keywords_dict weibo_list.append([uid, text, ts]) #test online pattern online_pattern = u'weibo.com' try: user_online_pattern_dict = online_pattern_dict[uid] except: online_pattern_dict[uid] = {} try: online_pattern_dict[uid][online_pattern] += 1 except: online_pattern_dict[uid][online_pattern] = 1 return word_dict, weibo_list, online_pattern_dict, start_date_ts
def create_event_warning(xnr_user_no,today_datetime,write_mark): #获取事件名称 hashtag_list = get_hashtag(today_datetime) #print 'hashtag_list::',hashtag_list flow_text_index_name = get_day_flow_text_index_list(today_datetime) #虚拟人的粉丝列表和关注列表 try: es_xnr_result=es_xnr.get(index=weibo_xnr_fans_followers_index_name,doc_type=weibo_xnr_fans_followers_index_type,id=xnr_user_no)['_source'] followers_list=es_xnr_result['followers_list'] fans_list=es_xnr_result['fans_list'] except: followers_list=[] fans_list=[] event_warming_list=[] event_num=0 for event_item in hashtag_list: event_sensitive_count=0 event_warming_content=dict() #事件名称、主要参与用户、典型微博、事件影响力、事件平均时间 event_warming_content['event_name']=event_item['event_name'] print 'event_name:',event_item event_num=event_num+1 print 'event_num:::',event_num print 'first_time:::',int(time.time()) event_influence_sum=0 event_time_sum=0 query_body={ 'query':{ # 'bool':{ # 'must':[{'wildcard':{'text':'*'+event_item[0]+'*'}}, # {'range':{'sensitive':{'gte':1}}}] # } 'filtered':{ 'filter':{ 'bool':{ 'must':[ {'term':{'hashtag':event_item['event_name']}}, {'range':{'sensitive':{'gte':1}}} ] } } } }, 'size':MAX_WARMING_SIZE, 'sort':{'sensitive':{'order':'desc'}} } #try: event_results=es_flow_text.search(index=flow_text_index_name,doc_type=flow_text_index_type,body=query_body)['hits']['hits'] if event_results: weibo_result=[] fans_num_dict=dict() followers_num_dict=dict() alluser_num_dict=dict() print 'sencond_time:::',int(time.time()) for item in event_results: #print 'event_content:',item['_source']['text'] #统计用户信息 if alluser_num_dict.has_key(str(item['_source']['uid'])): followers_mark=set_intersection(item['_source']['uid'],followers_list) if followers_mark > 0: alluser_num_dict[str(item['_source']['uid'])]=alluser_num_dict[str(item['_source']['uid'])]+1*2 else: alluser_num_dict[str(item['_source']['uid'])]=alluser_num_dict[str(item['_source']['uid'])]+1 else: alluser_num_dict[str(item['_source']['uid'])]=1 #计算影响力 origin_influence_value=(1+item['_source']['comment']+item['_source']['retweeted'])*(1+item['_source']['sensitive']) # fans_value=judge_user_type(item['_source']['uid'],fans_list) followers_value=judge_user_type(item['_source']['uid'],followers_list) item['_source']['weibo_influence_value']=origin_influence_value*(followers_value) item['_source']['nick_name']=get_user_nickname(item['_source']['uid']) weibo_result.append(item['_source']) #统计影响力、时间 event_influence_sum=event_influence_sum+item['_source']['weibo_influence_value'] event_time_sum=event_time_sum+item['_source']['timestamp'] print 'third_time:::',int(time.time()) #典型微博信息 weibo_result.sort(key=lambda k:(k.get('weibo_influence_value',0)),reverse=True) event_warming_content['main_weibo_info']=json.dumps(weibo_result) #事件影响力和事件时间 number=len(event_results) event_warming_content['event_influence']=event_influence_sum/number event_warming_content['event_time']=event_time_sum/number # except: # event_warming_content['main_weibo_info']=[] # event_warming_content['event_influence']=0 # event_warming_content['event_time']=0 # try: #对用户进行排序 alluser_num_dict=sorted(alluser_num_dict.items(),key=lambda d:d[1],reverse=True) main_userid_list=[] for i in xrange(0,len(alluser_num_dict)): main_userid_list.append(alluser_num_dict[i][0]) #主要参与用户信息 main_user_info=[] user_es_result=es_user_profile.mget(index=profile_index_name,doc_type=profile_index_type,body={'ids':main_userid_list})['docs'] for item in user_es_result: user_dict=dict() if item['found']: user_dict['photo_url']=item['_source']['photo_url'] 
user_dict['uid']=item['_id'] user_dict['nick_name']=item['_source']['nick_name'] user_dict['favoritesnum']=item['_source']['favoritesnum'] user_dict['fansnum']=item['_source']['fansnum'] else: user_dict['photo_url']='' user_dict['uid']=item['_id'] user_dict['nick_name']='' user_dict['favoritesnum']=0 user_dict['fansnum']=0 main_user_info.append(user_dict) event_warming_content['main_user_info']=json.dumps(main_user_info) # except: # event_warming_content['main_user_info']=[] print 'fourth_time:::',int(time.time()) event_warming_content['xnr_user_no']=xnr_user_no event_warming_content['validity']=0 event_warming_content['timestamp']=today_datetime now_time=int(time.time()) task_id=xnr_user_no+'_'+str(now_time) if write_mark: print 'today_datetime:::',ts2datetime(today_datetime) mark=write_envent_warming(today_datetime,event_warming_content,task_id) event_warming_list.append(mark) else: event_warming_list.append(event_warming_content) else: pass print 'fifth_time:::',int(time.time()) return event_warming_list
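# Worked sketch (illustrative, not part of the original file) of the per-weibo influence score used
# in create_event_warning() above: influence = (1 + comment + retweeted) * (1 + sensitive), then
# multiplied by the follower weight returned by judge_user_type(). Numbers below are made up.
def weibo_influence(comment, retweeted, sensitive, follower_weight=1):
    origin_influence = (1 + comment + retweeted) * (1 + sensitive)
    return origin_influence * follower_weight

# e.g. weibo_influence(comment=4, retweeted=5, sensitive=2) -> (1 + 4 + 5) * (1 + 2) = 30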
reload(sys)
sys.path.append('../../')
from time_utils import ts2datetime, datetime2ts, ts2yeartime
from parameter import MAX_VALUE, DAY, WARMING_DAY
from global_config import S_TYPE, FACEBOOK_FLOW_START_DATE
from elasticsearch import Elasticsearch
from global_utils import es_xnr_2 as es, es_xnr
from global_utils import facebook_user_warning_index_name_pre, facebook_user_warning_index_type,\
                         facebook_event_warning_index_name_pre, facebook_event_warning_index_type,\
                         facebook_speech_warning_index_name_pre, facebook_speech_warning_index_type,\
                         facebook_timing_warning_index_name_pre, facebook_timing_warning_index_type,\
                         weibo_date_remind_index_name, weibo_date_remind_index_type,\
                         facebook_warning_corpus_index_name, facebook_warning_corpus_index_type

NOW_DATE = ts2datetime(int(time.time()) - 8 * DAY)
print 'NOW_DATE:', NOW_DATE


def facebook_user_warning_mappings(TODAY_DATE):
    index_info = {
        'settings': {
            'number_of_replicas': 0,
            'number_of_shards': 5
        },
        'mappings': {
            facebook_user_warning_index_type: {
                'properties': {
                    'xnr_user_no': {  # virtual persona (xnr)
                        'type': 'string',
                        'index': 'not_analyzed'
def create_speech_warning(xnr_user_no, today_datetime): #查询关注列表 lookup_type = 'followers_list' followers_list = lookup_xnr_fans_followers(xnr_user_no, lookup_type) query_body = { 'query': { 'filtered': { 'filter': { 'bool': { 'must': { 'range': { 'sensitive': { 'gte': 1 } } } } } } }, 'size': MAX_SEARCH_SIZE, 'sort': { 'sensitive': { 'order': 'desc' } } } twitter_flow_text_index_name = get_timets_set_indexset_list( twitter_flow_text_index_name_pre, today_datetime, today_datetime) #print twitter_flow_text_index_name results = es_xnr_2.search(index=twitter_flow_text_index_name, doc_type=twitter_flow_text_index_type, body=query_body)['hits']['hits'] #print results result = [] for item in results: if item['_source']['uid'] in followers_list: item['_source']['content_type'] = 'follow' else: item['_source']['content_type'] = 'unfollow' item['_source']['validity'] = 0 item['_source']['xnr_user_no'] = xnr_user_no #查询三个指标字段 tid_result = lookup_tid_attend_index(item['_source']['tid'], today_datetime) if tid_result: item['_source']['comment'] = tid_result['comment'] item['_source']['share'] = tid_result['share'] item['_source']['favorite'] = tid_result['favorite'] else: item['_source']['comment'] = 0 item['_source']['share'] = 0 item['_source']['favorite'] = 0 #查询用户昵称 item['_source']['nick_name'] = get_user_nickname( item['_source']['uid']) task_id = xnr_user_no + '_' + item['_source']['tid'] #写入数据库 today_date = ts2datetime(today_datetime) twitter_speech_warning_index_name = twitter_speech_warning_index_name_pre + today_date # try: es_xnr_2.index(index=twitter_speech_warning_index_name, doc_type=twitter_speech_warning_index_type, body=item['_source'], id=task_id) mark = True # except: # mark=False result.append(mark) return result
def trace_xnr_community(community, trace_datetime): #传的是ts #step1:获取跟踪社区list # community_list = get_trace_community(trace_datetime) #针对每个社区进行处理 all_influence = get_evaluate_max(weibo_bci_history_index_name, weibo_bci_history_index_type, 'bci_week_ave') all_sensitive = get_evaluate_max(weibo_sensitive_history_index_name, weibo_sensitive_history_index_type, 'sensitive_week_ave') result_mark = [] # for community in community_list: community_detail = dict() community_detail['xnr_user_no'] = community['xnr_user_no'] community_detail['community_id'] = community['community_id'] community_detail['community_name'] = community['community_name'] community_detail['create_time'] = community['create_time'] community_detail['trace_time'] = trace_datetime community_detail['trace_date'] = ts2datetime(trace_datetime) community_detail['num'] = community['num'] community_detail['nodes'] = community['nodes'] #判断一下,对于刚生成社区的预警,指标值取生成的 create_date = ts2datetime(community['create_time']) trace_date = ts2datetime(trace_datetime) if create_date == trace_date: print '新社区!' community_detail['density'] = community['density'] community_detail['cluster'] = community['cluster'] community_detail['max_influence'] = community['max_influence'] community_detail['mean_influence'] = community['mean_influence'] community_detail['max_sensitive'] = community['max_sensitive'] community_detail['mean_sensitive'] = community['mean_sensitive'] community_detail['warning_type'] = community['warning_type'] community_detail['num_warning'] = 0 community_detail['num_warning_descrp'] = "" community_detail['num_warning_content'] = "" community_detail['sensitive_warning'] = 0 community_detail['sensitive_warning_descrp'] = "" community_detail['sensitive_warning_content'] = "" community_detail['influence_warning'] = 0 community_detail['influence_warning_descrp'] = "" community_detail['influence_warning_content'] = "" community_detail['density_warning'] = 0 community_detail['density_warning_descrp'] = "" community_detail['density_warning_content'] = "" for item in community['warning_type']: if item == '人物突增预警': community_detail['num_warning'] = 1 community_detail['num_warning_descrp'],\ community_detail['num_warning_content'] = get_person_warning(community['community_id'],community['nodes'],community['xnr_user_no'],trace_datetime) elif item == '影响力剧增预警': community_detail['influence_warning'] = 1 community_detail['influence_warning_descrp'],\ community_detail['influence_warning_content'] = get_influence_warning(community,trace_datetime) elif item == '敏感度剧增预警': community_detail['sensitive_warning'] = 1 community_detail['sensitive_warning_descrp'],\ community_detail['sensitive_warning_content'] = get_sensitive_warning(community,trace_datetime) elif item == '社区聚集预警': community_detail['density_warning'] = 1 community_detail['density_warning_descrp'],\ community_detail['density_warning_content'] = get_density_warning(community,trace_datetime) else: print '旧社区!' 
#trace_index_result = group_evaluate(community['xnr_user_no'],community['nodes'],all_influence,all_sensitive) trace_index_result = group_evaluate_trace(community['xnr_user_no'], community['nodes'], all_influence, all_sensitive, trace_datetime, G=None) community_detail['density'] = trace_index_result['density'] community_detail['cluster'] = trace_index_result['cluster'] community_detail['max_influence'] = trace_index_result['max_influence'] community_detail['mean_influence'] = trace_index_result[ 'mean_influence'] community_detail['max_sensitive'] = trace_index_result['max_sensitive'] community_detail['mean_sensitive'] = trace_index_result[ 'mean_sensitive'] #预警处理 warning_result = get_warning_reslut(community_detail, trace_datetime) community_detail['warning_type'] = warning_result['warning_type'] community_detail['num_warning'] = warning_result['num_warning'] community_detail['num_warning_descrp'] = warning_result[ 'num_warning_descrp'] community_detail['num_warning_content'] = warning_result[ 'num_warning_content'] community_detail['sensitive_warning'] = warning_result[ 'sensitive_warning'] community_detail['sensitive_warning_descrp'] = warning_result[ 'sensitive_warning_descrp'] community_detail['sensitive_warning_content'] = warning_result[ 'sensitive_warning_content'] community_detail['influence_warning'] = warning_result[ 'influence_warning'] community_detail['influence_warning_descrp'] = warning_result[ 'influence_warning_descrp'] community_detail['influence_warning_content'] = warning_result[ 'influence_warning_content'] community_detail['density_warning'] = warning_result['density_warning'] community_detail['density_warning_descrp'] = warning_result[ 'density_warning_descrp'] community_detail['density_warning_content'] = warning_result[ 'density_warning_content'] community_detail[ 'warning_rank'] = warning_result['num_warning'] + warning_result[ 'sensitive_warning'] + warning_result[ 'influence_warning'] + warning_result['density_warning'] #更新显示 update_warningrank_mark = update_warning_rank(community_detail, trace_datetime) #存储至数据库 save_community_mark = save_community_detail(community_detail, community['xnr_user_no']) result_mark.append(save_community_mark) return result_mark
    redis_cluster.delete('sensitive_hashtag_' + str(delete_ts))
    # delete sensitive words
    redis_cluster.delete('sensitive_' + str(delete_ts))
    # delete recommendation
    r.delete('recomment_' + str(delete_date) + "_influence")
    r.delete('recomment_' + str(delete_date) + "_sensitive")
    r.delete("identify_in_sensitive_" + str(delete_date))
    r.delete("identify_in_influence_" + str(delete_date))


if __name__ == "__main__":
    now_ts = time.time()
    current_path = os.getcwd()
    file_path_redis = os.path.join(current_path, 'delete_redis.py')
    print_log = "&".join([file_path_redis, "start", ts2datetime(now_ts)])
    print print_log
    now_datetime = datetime2ts(ts2datetime(now_ts))
    new_ip_number = r_cluster.hlen('new_ip_' + str(now_datetime))
    new_hashtag_number = r_cluster.hlen('hashtag_' + str(now_datetime))
    #if new_ip_number and new_hashtag_number:
    #    # flow2/flow4 have written new data, so data older than 8 days can be cleared
    #    main()
    now_ts = time.time()
    print_log = "&".join([file_path_redis, "end", ts2datetime(now_ts)])
    print print_log
def get_related_recommendation(task_detail): avg_sort_uid_dict = {} xnr_user_no = task_detail['xnr_user_no'] sort_item = task_detail['sort_item'] es_result = es.get(index=weibo_xnr_index_name, doc_type=weibo_xnr_index_type, id=xnr_user_no)['_source'] uid = es_result['uid'] monitor_keywords = es_result['monitor_keywords'] monitor_keywords_list = monitor_keywords.split(',') nest_query_list = [] #print 'monitor_keywords_list::',monitor_keywords_list for monitor_keyword in monitor_keywords_list: #print 'monitor_keyword::::',monitor_keyword nest_query_list.append( {'wildcard': { 'keywords_string': '*' + monitor_keyword + '*' }}) # else: try: recommend_list = es.get(index=weibo_xnr_fans_followers_index_name, doc_type=weibo_xnr_fans_followers_index_type, id=xnr_user_no)['_source']['followers_list'] except: recommend_list = [] recommend_set_list = list(set(recommend_list)) if S_TYPE == 'test': current_date = S_DATE else: current_date = ts2datetime(int(time.time() - 24 * 3600)) flow_text_index_name = flow_text_index_name_pre + current_date if sort_item != 'friend': uid_list = [] #uid_list = recommend_set_list if sort_item == 'influence': sort_item = 'user_fansnum' query_body_rec = { 'query': { 'bool': { 'should': nest_query_list } }, 'aggs': { 'uid_list': { 'terms': { 'field': 'uid', 'size': TOP_ACTIVE_SOCIAL, 'order': { 'avg_sort': 'desc' } }, 'aggs': { 'avg_sort': { 'avg': { 'field': sort_item } } } } } } es_rec_result = es_flow_text.search( index=flow_text_index_name, doc_type='text', body=query_body_rec)['aggregations']['uid_list']['buckets'] #print 'es_rec_result///',es_rec_result for item in es_rec_result: uid = item['key'] uid_list.append(uid) avg_sort_uid_dict[uid] = {} if sort_item == 'user_fansnum': avg_sort_uid_dict[uid]['sort_item_value'] = int( item['avg_sort']['value']) else: avg_sort_uid_dict[uid]['sort_item_value'] = round( item['avg_sort']['value'], 2) else: if S_TYPE == 'test': uid_list = FRIEND_LIST #sort_item = 'sensitive' else: uid_list = [] ''' friends_list_results = es_user_profile.mget(index=profile_index_name,doc_type=profile_index_type,body={'ids':recommend_set_list})['docs'] for result in friends_list_results: friends_list = friends_list + result['friend_list'] ''' friends_list = get_friends_list(recommend_set_list) friends_set_list = list(set(friends_list)) #uid_list = friends_set_list sort_item_new = 'fansnum' query_body_rec = { 'query': { 'bool': { 'must': [{ 'terms': { 'uid': friends_set_list } }, { 'bool': { 'should': nest_query_list } }] } }, 'aggs': { 'uid_list': { 'terms': { 'field': 'uid', 'size': TOP_ACTIVE_SOCIAL, 'order': { 'avg_sort': 'desc' } }, 'aggs': { 'avg_sort': { 'avg': { 'field': sort_item_new } } } } } } es_friend_result = es_flow_text.search( index=flow_text_index_name, doc_type='text', body=query_body_rec)['aggregations']['uid_list']['buckets'] for item in es_friend_result: uid = item['key'] uid_list.append(uid) avg_sort_uid_dict[uid] = {} if not item['avg_sort']['value']: avg_sort_uid_dict[uid]['sort_item_value'] = 0 else: avg_sort_uid_dict[uid]['sort_item_value'] = int( item['avg_sort']['value']) results_all = [] for uid in uid_list: #if sort_item == 'friend': query_body = { 'query': { 'filtered': { 'filter': { 'term': { 'uid': uid } } } } } es_results = es_user_portrait.search(index=portrait_index_name, doc_type=portrait_index_type, body=query_body)['hits']['hits'] if es_results: #print 'portrait--',es_results[0]['_source'].keys() for item in es_results: uid = item['_source']['uid'] #nick_name,photo_url = uid2nick_name_photo(uid) 
item['_source']['nick_name'] = uid #nick_name item['_source']['photo_url'] = '' #photo_url weibo_type = judge_follow_type(xnr_user_no, uid) sensor_mark = judge_sensing_sensor(xnr_user_no, uid) item['_source']['weibo_type'] = weibo_type item['_source']['sensor_mark'] = sensor_mark try: del item['_source']['group'] del item['_source']['activity_geo_dict'] except: pass if sort_item == 'friend': if S_TYPE == 'test': item['_source']['fansnum'] = item['_source']['fansnum'] else: item['_source']['fansnum'] = avg_sort_uid_dict[uid][ 'sort_item_value'] elif sort_item == 'sensitive': item['_source']['sensitive'] = avg_sort_uid_dict[uid][ 'sort_item_value'] item['_source']['fansnum'] = item['_source']['fansnum'] else: item['_source']['fansnum'] = avg_sort_uid_dict[uid][ 'sort_item_value'] if S_TYPE == 'test': current_time = datetime2ts(S_DATE) else: current_time = int(time.time()) index_name = get_flow_text_index_list(current_time) query_body = { 'query': { 'bool': { 'must': [{ 'term': { 'uid': uid } }, { 'terms': { 'message_type': [1, 3] } }] } }, 'sort': { 'retweeted': { 'order': 'desc' } }, 'size': 5 } es_weibo_results = es_flow_text.search( index=index_name, doc_type=flow_text_index_type, body=query_body)['hits']['hits'] weibo_list = [] for weibo in es_weibo_results: weibo = weibo['_source'] weibo_list.append(weibo) item['_source']['weibo_list'] = weibo_list item['_source']['portrait_status'] = True results_all.append(item['_source']) else: item_else = dict() item_else['uid'] = uid #nick_name,photo_url = uid2nick_name_photo(uid) item_else['nick_name'] = uid #nick_name item_else['photo_url'] = '' #photo_url weibo_type = judge_follow_type(xnr_user_no, uid) sensor_mark = judge_sensing_sensor(xnr_user_no, uid) item_else['weibo_type'] = weibo_type item_else['sensor_mark'] = sensor_mark item_else['portrait_status'] = False #if sort_item != 'friend': #item_else['sort_item_value'] = avg_sort_uid_dict[uid]['sort_item_value'] # else: # item_else['sort_item_value'] = '' if S_TYPE == 'test': current_time = datetime2ts(S_DATE) else: current_time = int(time.time()) index_name = get_flow_text_index_list(current_time) query_body = { 'query': { 'term': { 'uid': uid } }, 'sort': { 'retweeted': { 'order': 'desc' } } } es_weibo_results = es_flow_text.search( index=index_name, doc_type=flow_text_index_type, body=query_body)['hits']['hits'] weibo_list = [] for weibo in es_weibo_results: item_else['fansnum'] = weibo['_source']['user_fansnum'] weibo = weibo['_source'] weibo_list.append(weibo) item_else['weibo_list'] = weibo_list item_else['friendsnum'] = 0 item_else['statusnum'] = 0 if sort_item == 'sensitive': item_else['sensitive'] = avg_sort_uid_dict[uid][ 'sort_item_value'] else: item_else['fansnum'] = avg_sort_uid_dict[uid][ 'sort_item_value'] results_all.append(item_else) return results_all
            if count % 1000 == 0 and count != 0:
                R_TOPIC.hmset(r_topic_name, hmset_dict)
                end_ts = time.time()
                #print '%s sec count 1000' % (end_ts - start_ts)
        except StopIteration:
            if hmset_dict:
                R_TOPIC.hmset(r_topic_name, hmset_dict)
                hmset_dict = {}
            break
        except Exception as e:
            raise e
            break
    if hmset_dict:
        R_TOPIC.hmset(r_topic_name, hmset_dict)
    #print 'all count:', count


if __name__ == '__main__':
    log_time_ts = time.time()
    log_time_date = ts2datetime(log_time_ts)
    print 'cron/flow4/scan_topic2senitment.py&start&' + log_time_date
    del_topic_redis()
    scan_topic2redis()
    log_time_ts = time.time()
    log_time_date = ts2datetime(log_time_ts)
    print 'cron/flow4/scan_topic2senitment.py&end&' + log_time_date
    #topic_string = R_TOPIC.hget(r_topic_name, '2010832710')
    #print 'topic_string:', topic_string, type(topic_string)
def xnr_keywords_compute(xnr_user_no): #查询好友列表 friends_list = lookup_xnr_friends(xnr_user_no) lookup_condition_list = [] print 'xnr_user_no, friends_list:', xnr_user_no, friends_list lookup_condition_list.append({ 'filtered': { 'filter': { 'bool': { 'must': { 'terms': { 'uid': friends_list } } } } } }) #根据日期确定查询表 if S_TYPE == 'test': date_time = test_date else: now_time = int(time.time()) date_time = ts2datetime(now_time) flow_text_index_name = facebook_flow_text_index_name_pre + date_time #按日期统计 # print lookup_condition_list for item_condition in lookup_condition_list: query_body = { # 'query':item_condition, 'aggs': { 'keywords': { 'terms': { 'field': 'keywords_string', 'size': 1000 } } } } flow_text_exist=es_xnr_2.search(index=flow_text_index_name,doc_type=facebook_flow_text_index_type,\ body=query_body)['aggregations']['keywords']['buckets'] # print 'flow_text_exist:',flow_text_exist word_dict = dict() word_dict_new = dict() keywords_string = '' for item in flow_text_exist: word = item['key'] count = item['doc_count'] word_dict[word] = count keywords_string += '&' keywords_string += item['key'] k_dict = extract_keywords(keywords_string) for item_item in k_dict: keyword = item_item.word # print 'keyword::',type(keyword) word_dict_new[keyword] = word_dict[keyword] return word_dict_new
def scan_retweet():
    count = 0
    scan_cursor = 0
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    #get redis db number
    db_number = get_db_num(now_date_ts)
    #retweet/be_retweet es mappings
    '''
    retweet_es_mappings(str(db_number))
    be_retweet_es_mappings(str(db_number))
    '''
    #get redis db
    retweet_redis = retweet_redis_dict[str(db_number)]
    retweet_bulk_action = []
    be_retweet_bulk_action = []
    start_ts = time.time()
    #retweet count/be_retweet count
    retweet_count = 0
    be_retweet_count = 0
    while True:
        re_scan = retweet_redis.scan(scan_cursor, count=100)
        re_scan_cursor = re_scan[0]
        '''
        if re_scan_cursor == 0:
            print 'scan finish'
            if retweet_bulk_action != []:
                es.bulk(retweet_bulk_action, index='retweet_'+str(db_number), doc_type='user')
            if be_retweet_bulk_action != []:
                es.bulk(be_retweet_bulk_action, index='be_retweet_'+str(db_number), doc_type='user')
            break
        '''
        for item in re_scan[1]:
            count += 1
            item_list = item.split('_')
            save_dict = {}
            if len(item_list) == 2:
                retweet_count += 1
                uid = item_list[1]
                item_result = retweet_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_retweet'] = json.dumps(item_result)
                retweet_bulk_action.extend([{'index': {'_id': uid}}, save_dict])
            elif len(item_list) == 3:
                be_retweet_count += 1
                uid = item_list[2]
                item_result = retweet_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_be_retweet'] = json.dumps(item_result)
                be_retweet_bulk_action.extend([{'index': {'_id': uid}}, save_dict])
        es.bulk(retweet_bulk_action, index='1225_retweet_' + str(db_number), doc_type='user')
        es.bulk(be_retweet_bulk_action, index='1225_be_retweet_' + str(db_number), doc_type='user')
        retweet_bulk_action = []
        be_retweet_bulk_action = []
        end_ts = time.time()
        print '%s sec scan %s count user' % (end_ts - start_ts, count)
        start_ts = end_ts
        #advance the cursor; a cursor of 0 means the scan has wrapped around
        scan_cursor = re_scan[0]
        if scan_cursor == 0:
            break
    print 'count:', count
    print 'end'
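The bulk payloads built above alternate an action dict with its document dict, which is the pairing the low-level es.bulk() call is fed (the client serializes each element onto its own bulk-API line). A generic sketch of that pairing:

def make_bulk_action(docs_by_id):
    # docs_by_id: {uid: source_dict}
    bulk_action = []
    for doc_id, source in docs_by_id.items():
        bulk_action.append({'index': {'_id': doc_id}})
        bulk_action.append(source)
    return bulk_action

# usage, mirroring scan_retweet():
# es.bulk(make_bulk_action({'123456': {'uid': '123456', 'uid_retweet': '{}'}}),
#         index='1225_retweet_' + str(db_number), doc_type='user')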
def scan_comment(): count = 0 scan_cursor = 0 now_ts = time.time() now_date_ts = datetime2ts(ts2datetime(now_ts)) #get redis db number db_number = get_db_num(now_date_ts) #comment/be_comment es mappings ''' comment_es_mappings(str(db_number)) be_comment_es_mappings(str(db_number)) ''' #get redis db comment_redis = comment_redis_dict[str(db_number)] comment_bulk_action = [] be_comment_bulk_action = [] start_ts = time.time() #comment count/be_comment count comment_count = 0 be_comment_count = 0 while True: re_scan = comment_redis.scan(scan_cursor, count=100) re_scan_cursor = re_scan[0] for item in re_scan[1]: count += 1 item_list = item.split('_') save_dict = {} if len(item_list) == 2: comment_count += 1 uid = item_list[1] item_result = comment_redis.hgetall(item) save_dict['uid'] = uid save_dict['uid_comment'] = json.dumps(item_result) comment_bulk_action.extend([{ 'index': { '_id': uid } }, save_dict]) ''' elif len(item_list)==3: be_comment_count += 1 uid = item_list[2] item_result = comment_redis.hgetall(item) save_dict['uid'] = uid save_dict['uid_be_comment'] = json.dumps(item_result) be_comment_bulk_action.extend([{'index':{'_id': uid}}, save_dict]) ''' try: es.bulk(comment_bulk_action, index='1225_comment_' + str(db_number), doc_type='user') except: index_name = '1225_comment_' + str(db_number) split_bulk_action(comment_bulk_action, index_name) ''' try: es.bulk(be_comment_bulk_action, index='1225_be_comment_'+str(db_number), doc_type='user') except: index_name = '1225_be_comment_'+str(db_number) split_bulk_action(be_comment_bulk_action, index_name) ''' comment_bulk_action = [] #be_comment_bulk_action = [] end_ts = time.time() print '%s sec scan %s count user' % (end_ts - start_ts, count) start_ts = end_ts scan_cursor = re_scan[0] if scan_cursor == 0: break print 'count:', count print 'end'
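split_bulk_action() is called above but not defined in this file; a plausible sketch (illustrative only, not the project's actual helper) retries the write in smaller action/document slices so one oversized batch does not abort the whole scan:

def split_bulk_action_sketch(bulk_action, index_name, chunk_docs=500, doc_type='user'):
    # bulk_action alternates action and source dicts, so step two entries at a time
    for start in range(0, len(bulk_action), chunk_docs * 2):
        chunk = bulk_action[start:start + chunk_docs * 2]
        try:
            es.bulk(chunk, index=index_name, doc_type=doc_type)
        except Exception:
            # still too large or partially invalid: skip this chunk rather than stop the scan
            pass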
def create_personal_warning(xnr_user_no,today_datetime): #查询关注列表 lookup_type='followers_list' followers_list=lookup_xnr_fans_followers(xnr_user_no,lookup_type) #查询虚拟人uid xnr_uid=lookup_xnr_uid(xnr_user_no) #计算敏感度排名靠前的用户 query_body={ # 'query':{ # 'filtered':{ # 'filter':{ # 'terms':{'uid':followers_list} # } # } # }, 'aggs':{ 'followers_sensitive_num':{ 'terms':{'field':'uid'}, 'aggs':{ 'sensitive_num':{ 'sum':{'field':'sensitive'} } } } }, 'size':MAX_SEARCH_SIZE } flow_text_index_name=get_day_flow_text_index_list(today_datetime) try: first_sum_result=es_flow_text.search(index=flow_text_index_name,doc_type=flow_text_index_type,\ body=query_body)['aggregations']['followers_sensitive_num']['buckets'] except: first_sum_result=[] #print first_sum_result top_userlist=[] for i in xrange(0,len(first_sum_result)): user_sensitive=first_sum_result[i]['sensitive_num']['value'] if user_sensitive > 0: user_dict=dict() user_dict['uid']=first_sum_result[i]['key'] followers_mark=judge_user_type(user_dict['uid'],followers_list) user_dict['sensitive']=user_sensitive*followers_mark top_userlist.append(user_dict) else: pass #################################### #如果是关注者则敏感度提升 #################################### #查询敏感用户的敏感微博内容 results=[] for user in top_userlist: #print user user_detail=dict() user_detail['uid']=user['uid'] user_detail['user_sensitive']=user['sensitive'] # user_lookup_id=xnr_uid+'_'+user['uid'] # print user_lookup_id # try: # #user_result=es_xnr.get(index=weibo_feedback_follow_index_name,doc_type=weibo_feedback_follow_index_type,id=user_lookup_id)['_source'] # user_result=es_user_profile.get(index=profile_index_name,doc_type=profile_index_type,id=user['uid'])['_source'] # user_detail['user_name']=user_result['nick_name'] # except: user_detail['user_name']=get_user_nickname(user['uid']) query_body={ 'query':{ 'filtered':{ 'filter':{ 'bool':{ 'must':[ {'term':{'uid':user['uid']}}, {'range':{'sensitive':{'gte':1}}} ] } } } }, 'size':MAX_WARMING_SIZE, 'sort':{'sensitive':{'order':'desc'}} } try: second_result=es_flow_text.search(index=flow_text_index_name,doc_type=flow_text_index_type,body=query_body)['hits']['hits'] except: second_result=[] s_result=[] #tem_word_one = '静坐' #tem_word_two = '集合' for item in second_result: #sensitive_words=item['_source']['sensitive_words_string'] #if ((sensitive_words==tem_word_one) or (sensitive_words==tem_word_two)): # pass #else: #查询用户昵称 item['_source']['nick_name']=get_user_nickname(item['_source']['uid']) s_result.append(item['_source']) s_result.sort(key=lambda k:(k.get('sensitive',0)),reverse=True) user_detail['content']=json.dumps(s_result) user_detail['xnr_user_no']=xnr_user_no user_detail['validity']=0 user_detail['timestamp']=today_datetime #写入数据库 today_date=ts2datetime(today_datetime) weibo_user_warning_index_name=weibo_user_warning_index_name_pre+today_date task_id=xnr_user_no+'_'+user_detail['uid'] #print weibo_user_warning_index_name #print user_detail if s_result: try: es_xnr.index(index=weibo_user_warning_index_name,doc_type=weibo_user_warning_index_type,body=user_detail,id=task_id) mark=True except: mark=False else: pass results.append(mark) return results
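judge_user_type() is defined elsewhere in the project; judging from the comment above ("boost sensitivity when the user is a follower") and the 1.5x figure noted in the Twitter variant further down, it returns a multiplier applied to the aggregated sensitivity. A purely illustrative sketch:

def judge_user_type_sketch(uid, followers_list, boost=1.5):
    # boost value taken from the comment in the Twitter version of this function
    return boost if uid in followers_list else 1.0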
def create_date_warning(today_datetime): query_body = { 'query': { 'match_all': {} }, 'size': MAX_VALUE, 'sort': { 'date_time': { 'order': 'asc' } } } try: result = es_xnr.search(index=weibo_date_remind_index_name, doc_type=weibo_date_remind_index_type, body=query_body)['hits']['hits'] date_result = [] for item in result: #计算距离日期 date_time = item['_source']['date_time'] year = ts2yeartime(today_datetime) warming_date = year + '-' + date_time today_date = ts2datetime(today_datetime) countdown_num = (datetime2ts(warming_date) - datetime2ts(today_date)) / DAY if abs(countdown_num) < WARMING_DAY: #根据给定的关键词查询预警微博 print 'date_time:', date_time keywords = item['_source']['keywords'] date_warming = lookup_twitter_date_warming( keywords, today_datetime) item['_source']['twitter_date_warming_content'] = json.dumps( date_warming) item['_source']['validity'] = 0 item['_source']['timestamp'] = today_datetime task_id = str( item['_source']['create_time']) + '_' + str(today_datetime) #print 'task_id',task_id #print 'date_warming',date_warming #写入数据库 twitter_timing_warning_index_name = twitter_timing_warning_index_name_pre + warming_date if date_warming: print twitter_timing_warning_index_name try: es_xnr_2.index( index=twitter_timing_warning_index_name, doc_type=twitter_timing_warning_index_name, body=item['_source'], id=task_id) mark = True except: mark = False else: pass date_result.append(mark) else: pass except: date_result = [] return date_result
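The countdown test above is plain midnight-timestamp arithmetic: rebuild the reminder date in the current year, subtract today's midnight timestamp and divide by DAY (86400 seconds). A worked sketch using the same helpers:

def days_until(date_time, today_datetime):
    # date_time is a month-day string such as '11-17'
    year = ts2yeartime(today_datetime)                # e.g. '2016'
    warming_date = year + '-' + date_time             # e.g. '2016-11-17'
    today_date = ts2datetime(today_datetime)          # e.g. '2016-11-15'
    return (datetime2ts(warming_date) - datetime2ts(today_date)) / DAY   # e.g. 2

# a date reminder fires only while abs(days_until(date_time, today_datetime)) < WARMING_DAY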
def active_social_recommend_daily(current_date):
    # 1. get all completed virtual personas (xnr)
    all_xnrs = get_all_xnrs()
    print 'all_xnrs', all_xnrs
    # 2. for each virtual persona, compute and save three result sets:
    #    by follower count (influence), by sensitivity and by friend circle
    for xnr_user_no in all_xnrs:
        for sort_item in ['influence', 'sensitive', 'friend']:
            task_detail = {}
            print 'sort_item..', sort_item
            task_detail['xnr_user_no'] = xnr_user_no
            task_detail['sort_item'] = sort_item
            # compute
            result = get_related_recommendation(task_detail)
            print 'result', len(result)
            # save
            save_results_to_es(xnr_user_no, current_date, sort_item, result)


if __name__ == '__main__':
    current_time = time.time()
    current_date = ts2datetime(current_time)
    start_ts = time.time()
    active_social_recommend_daily(current_date)
    end_ts = time.time()
    print 'cost..', end_ts - start_ts
def create_personal_warning(xnr_user_no, today_datetime): #查询关注列表 lookup_type = 'followers_list' followers_list = lookup_xnr_fans_followers(xnr_user_no, lookup_type) #查询虚拟人uid xnr_uid = lookup_xnr_uid(xnr_user_no) #计算敏感度排名靠前的用户 query_body = { # 'query':{ # 'filtered':{ # 'filter':{ # 'terms':{'uid':followers_list} # } # } # }, 'aggs': { 'friends_sensitive_num': { 'terms': { 'field': 'uid' }, 'aggs': { 'sensitive_num': { 'sum': { 'field': 'sensitive' } } } } }, 'size': MAX_SEARCH_SIZE } twitter_flow_text_index_name = get_timets_set_indexset_list( twitter_flow_text_index_name_pre, today_datetime, today_datetime) try: first_sum_result=es_xnr_2.search(index=twitter_flow_text_index_name,doc_type=twitter_flow_text_index_type,\ body=query_body)['aggregations']['friends_sensitive_num']['buckets'] except: first_sum_result = [] #print 'first_sum_result',first_sum_result top_userlist = [] for i in xrange(0, len(first_sum_result)): user_sensitive = first_sum_result[i]['sensitive_num']['value'] if user_sensitive > 0: user_dict = dict() user_dict['uid'] = first_sum_result[i]['key'] followers_mark = judge_user_type(user_dict['uid'], followers_list) user_dict['sensitive'] = user_sensitive * followers_mark top_userlist.append(user_dict) else: pass ##################### #如果是关注者,则用户敏感度计算值增加1.5倍 ##################### #查询敏感用户的敏感内容 results = [] for user in top_userlist: #print user user_detail = dict() user_detail['uid'] = user['uid'] user_detail['user_sensitive'] = user['sensitive'] user_lookup_id = user['uid'] print user_lookup_id #查询用户昵称 user_detail['user_name'] = get_user_nickname(user['uid']) query_body = { 'query': { 'filtered': { 'filter': { 'bool': { 'must': [{ 'term': { 'uid': user['uid'] } }, { 'range': { 'sensitive': { 'gte': 1 } } }] } } } }, 'size': MAX_WARMING_SIZE, 'sort': { 'sensitive': { 'order': 'desc' } } } try: second_result = es_xnr_2.search( index=twitter_flow_text_index_name, doc_type=twitter_flow_text_index_type, body=query_body)['hits']['hits'] except: second_result = [] s_result = [] for item in second_result: #查询三个指标字段 tid_result = lookup_tid_attend_index(item['_source']['tid'], today_datetime) if tid_result: item['_source']['comment'] = tid_result['comment'] item['_source']['share'] = tid_result['share'] item['_source']['favorite'] = tid_result['favorite'] else: item['_source']['comment'] = 0 item['_source']['share'] = 0 item['_source']['favorite'] = 0 #查询用户昵称 item['_source']['nick_name'] = get_user_nickname( item['_source']['uid']) s_result.append(item['_source']) s_result.sort(key=lambda k: (k.get('sensitive', 0)), reverse=True) user_detail['content'] = json.dumps(s_result) user_detail['xnr_user_no'] = xnr_user_no user_detail['validity'] = 0 user_detail['timestamp'] = today_datetime #写入数据库 today_date = ts2datetime(today_datetime) twitter_user_warning_index_name = twitter_user_warning_index_name_pre + today_date task_id = xnr_user_no + '_' + user_detail['uid'] if s_result: try: es_xnr_2.index(index=twitter_user_warning_index_name, doc_type=twitter_user_warning_index_type, body=user_detail, id=task_id) mark = True except: mark = False else: pass results.append(mark) return results
def group_evaluate_trace(xnr_user_no, nodes, all_influence, all_sensitive, date_time, G=None): result = {} result['xnr_user_no'] = xnr_user_no result['nodes'] = nodes result['num'] = len(nodes) #从redis中获取社区转发网络 count = 0 scan_cursor = 1 now_ts = time.time() now_date_ts = datetime2ts(ts2datetime(now_ts)) #get redis db number db_number = get_db_num(now_date_ts) print 'db_number:', str(db_number) #get redis db print 'retweet_dict::', retweet_redis_dict retweet_redis = retweet_redis_dict[str(db_number)] comment_redis = comment_redis_dict[str(db_number)] retweet_result = [] for uid in nodes: item_1 = str('retweet_' + uid) # print 'item_lookup::',item_1,type(item_1) re_result = retweet_redis.hgetall(item_1) if re_result: save_dict = dict() save_dict['uid'] = uid save_dict['uid_retweet'] = re_result retweet_result.append(save_dict) # print 'test_result::',retweet_result # print 'aaa:::', retweet_redis.hgetall('retweet_'+str(nodes[-1])) #print 'retweet_redis::',retweet_redis #print 'comment_redis::',comment_redis ''' re_scan = retweet_redis.scan(scan_cursor,count=10) for item in re_scan[1]: # item_list = item.split('_') print 'item::',item,type(item) item_result = retweet_redis.hgetall(item) print 'item_result::',item_result # print 'hlen::',retweet_redis.hlen() # print 'hgetall::',retweet_redis.hgetall() retweet_result = retweet_redis.hgetall(nodes) comment_result = comment_redis.hgetall(nodes) ''' # print 'retweet_result:::',retweet_result #print 'comment_result:::',comment_result G_i = nx.Graph() for i in retweet_result: # print 'i:',i # if not i['found']: # continue uid_retweet = i['uid_retweet'] max_count = max([int(n) for n in uid_retweet.values()]) G_i.add_weighted_edges_from([ (i['uid'], j, float(uid_retweet[j]) / max_count) for j in uid_retweet.keys() if j != i['uid'] and j and i['uid'] ]) ''' for i in comment_result: # print 'comment_i:',i if not i['found']: continue uid_comment = json.loads(i['_source']['uid_comment']) max_count = max([int(n) for n in uid_comment.values()]) G_i.add_weighted_edges_from([(i['_source']['uid'],j,float(uid_comment[j])/max_count) for j in uid_comment.keys() if j != i['_source']['uid'] and j and i['_source']['uid']]) ''' sub_g = G_i.subgraph(nodes) result['density'] = round(nx.density(sub_g), 4) #print 'ave_cluster::',nx.average_clustering(sub_g) try: result['cluster'] = round(nx.average_clustering(sub_g), 4) except: result['cluster'] = 0 result['transitivity'] = round(nx.transitivity(sub_g), 4) ##将结果换成当天的计算结果 influence_field = 'user_index' sensitive_field = 'sensitive' influence_result = get_influence_value(date_time, influence_field, nodes) sensitive_result = get_sensitive_value(date_time, sensitive_field, nodes) result['max_influence'] = round( (max(influence_result) / float(all_influence)) * 100, 4) result['mean_influence'] = round( ((sum(influence_result) / len(influence_result)) / float(all_influence)) * 100, 4) max_sensitive = round((max(sensitive_result) / float(all_sensitive)) * 1, 4) if max_sensitive > 100: result['max_sensitive'] = 100.0000 else: result['max_sensitive'] = max_sensitive result['mean_sensitive'] = round( ((sum(sensitive_result) / len(sensitive_result)) / float(all_sensitive)) * 1, 4) return result
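For reference, the structural metrics above reduce to: build a weighted graph from each user's retweet counters, induce the community subgraph, and read density, average clustering and transitivity from networkx. A self-contained condensation of that step:

import networkx as nx

def community_structure(retweet_result, nodes):
    # retweet_result: [{'uid': uid, 'uid_retweet': {other_uid: count, ...}}, ...]
    G = nx.Graph()
    for rec in retweet_result:
        uid_retweet = rec['uid_retweet']
        max_count = max(int(n) for n in uid_retweet.values())
        G.add_weighted_edges_from(
            (rec['uid'], j, float(uid_retweet[j]) / max_count)
            for j in uid_retweet if j and j != rec['uid'])
    sub_g = G.subgraph(nodes)
    try:
        cluster = round(nx.average_clustering(sub_g), 4)
    except Exception:
        cluster = 0
    return {'density': round(nx.density(sub_g), 4),
            'cluster': cluster,
            'transitivity': round(nx.transitivity(sub_g), 4)}

# e.g. community_structure([{'uid': 'a', 'uid_retweet': {'b': '3', 'c': '1'}}], ['a', 'b', 'c'])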
s_re = scan(es_9200, query=es_query, index=BCIHIS_INDEX_NAME, doc_type=BCIHIS_INDEX_TYPE) count = 0 array = [] while 1: try: temp = s_re.next() one_item = {} one_item['id'] = temp['_id'].encode("utf-8") one_item['total_num'] = 0 one_item['today_bci'] = 0 one_item['update_time'] = TODAY_TIME array.append(one_item) count += 1 if count % 1000 == 0: r_flow.lpush('update_bci_list', json.dumps(array)) array = [] count = 0 except StopIteration: print "all done" r_flow.lpush('update_bci_list', json.dumps(array)) break if __name__ == '__main__': todaydate = ts2datetime(time.time()) mapper_bci_today(todaydate) mapper_bci_history(todaydate)
word_dict example: {uid1:{'w1':f1,'w2':f2...}...}
weibo_list example: [[uid1,text1,ts1],[uid2,text2,ts2],...] (each record holds three values: uid, text, timestamp)
'''
word_dict = dict()  # word-frequency dict
weibo_list = []  # weibo list
now_ts = time.time()
#run_type
if RUN_TYPE == 1:
    now_date_ts = datetime2ts(ts2datetime(now_ts))
else:
    now_date_ts = datetime2ts(RUN_TEST_TIME)
start_date_ts = now_date_ts - DAY * WEEK
for i in range(0, WEEK):
    iter_date_ts = start_date_ts + DAY * i
    flow_text_index_date = ts2datetime(iter_date_ts)
    flow_text_index_name = flow_text_index_name_pre + flow_text_index_date
    print flow_text_index_name
    try:
        flow_text_exist = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\
            body={'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}}, 'size': MAX_VALUE},\
            _source=False, fields=['text', 'uid', 'keywords_dict', 'timestamp'])['hits']['hits']
    except:
        flow_text_exist = []
    for flow_text_item in flow_text_exist:
        uid = flow_text_item['fields']['uid'][0].encode('utf-8')
        text = flow_text_item['fields']['text'][0].encode('utf-8')
        ts = flow_text_item['fields']['timestamp'][0]
        keywords_dict = json.loads(flow_text_item['fields']['keywords_dict'][0])
        keywords_dict = json.dumps(keywords_dict, encoding="UTF-8", ensure_ascii=False)
        keywords_dict = eval(keywords_dict)
def get_flow_information_v2(uid_list, all_user_keywords_dict): results = {} #results = {uid:{'hashtag_dict':{},'hashtag':'', 'keywords_dict':{}, 'keywords_string':'', 'activity_geo':'', 'activity_geo_dict':dict, 'activity_geo_aggs':''}} iter_results = { } # iter_results = {uid:{'hashtag': hashtag_dict, 'geo':geo_dict, 'keywords':keywords_dict}} now_ts = time.time() #run_type today_sensitive_results = {} if RUN_TYPE == 1: now_date_ts = datetime2ts(ts2datetime(now_ts)) else: now_date_ts = test_ts print 'run_type:', RUN_TYPE for i in range(WEEK, 0, -1): ts = now_date_ts - DAY * i uid_day_geo = {} #compute hashtag and geo hashtag_results = r_cluster_3.hmget('hashtag_' + str(ts), uid_list) ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list) print 'ip_results:', ip_results #compute sensitive_words sensitive_results = r_cluster_3.hmget('sensitive_' + str(ts), uid_list) count = 0 for uid in uid_list: #init iter_results[uid] if uid not in iter_results: iter_results[uid] = { 'hashtag': {}, 'geo': {}, 'geo_track': [], 'keywords': {}, 'sensitive': {}, 'school': {}, 'week_ip': { 0: {}, 1: {}, 2: {}, 3: {}, 4: {}, 5: {} }, 'ip': {} } if uid not in today_sensitive_results: today_sensitive_results[uid] = {} #compute hashtag hashtag_item = hashtag_results[count] if hashtag_item: uid_hashtag_dict = json.loads(hashtag_item) else: uid_hashtag_dict = {} for hashtag in uid_hashtag_dict: try: iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[ hashtag] except: iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[ hashtag] #compute sensitive sensitive_item = sensitive_results[count] if sensitive_item: uid_sensitive_dict = json.loads(sensitive_item) else: uid_sensitive_dict = {} for sensitive_word in uid_sensitive_dict: try: iter_results[uid]['sensitive'][ sensitive_word] += uid_sensitive_dict[sensitive_word] except: iter_results[uid]['sensitive'][ sensitive_word] = uid_sensitive_dict[sensitive_word] if ts == now_date_ts - DAY: try: today_sensitive_results[uid][ sensitive_word] += uid_sensitive_dict[ sensitive_word] except: today_sensitive_results[uid][ sensitive_word] = uid_sensitive_dict[ sensitive_word] #compute geo uid_day_geo[uid] = {} ip_item = ip_results[count] if ip_item: uid_ip_dict = json.loads(ip_item) else: uid_ip_dict = {} for ip in uid_ip_dict: ip_count = len(uid_ip_dict[ip].split('&')) geo, school = ip2city(ip) if geo: try: iter_results[uid]['geo'][geo] += ip_count except: iter_results[uid]['geo'][geo] = ip_count try: uid_day_geo[uid][geo] += ip_count except: uid_day_geo[uid][geo] = ip_count if school: try: iter_results[uid]['school'][school] += ip_count except: iter_results[uid]['school'][school] = ip_count #deal ip: job_ip&home_ip&active_ip ip_time_list = uid_ip_dict[ip].split('&') try: iter_results[uid]['ip'][ip] += ip_count except: iter_results[uid]['ip'] = {ip: ip_count} for ip_time_item in ip_time_list: ip_timesegment = (int(ip_time_item) - ts) / IP_TIME_SEGMENT try: iter_results[uid]['week_ip'][ip_timesegment][ip] += 1 except: iter_results[uid]['week_ip'][ip_timesegment][ip] = 1 #end deal ip iter_results[uid]['geo_track'].append(uid_day_geo[uid]) count += 1 #get keywords top for uid in uid_list: #print 'test iter_results_ip:', iter_results[uid]['week_ip'] results[uid] = {} #hashtag hashtag_dict = iter_results[uid]['hashtag'] results[uid]['hashtag_dict'] = json.dumps(hashtag_dict) results[uid]['hashtag'] = '&'.join(hashtag_dict.keys()) #sensitive words sensitive_word_dict = iter_results[uid]['sensitive'] results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict) 
results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys()) sensitive_score = 0 today_sensitive_results_user = today_sensitive_results[uid] for sensitive_item in today_sensitive_results_user: k = sensitive_item v = today_sensitive_results_user[sensitive_item] tmp_stage = r_sensitive.hget('sensitive_words', k) if tmp_stage: sensitive_score += v * sensitive_score_dict[str(tmp_stage)] results[uid]['sensitive'] = sensitive_score #geo geo_dict = iter_results[uid]['geo'] geo_track_list = iter_results[uid]['geo_track'] results[uid]['activity_geo_dict'] = json.dumps(geo_track_list) geo_dict_keys = geo_dict.keys() results[uid]['activity_geo'] = '&'.join( ['&'.join(item.split('\t')) for item in geo_dict_keys]) try: results[uid]['activity_geo_aggs'] = '&'.join( [item.split('\t')[-1] for item in geo_dict_keys]) except: results[uid]['activity_geo_aggs'] = '' #keywords keywords_dict = all_user_keywords_dict[uid] keywords_top50 = sorted(keywords_dict.items(), key=lambda x: x[1], reverse=True)[:50] keywords_top50_string = '&'.join( [keyword_item[0] for keyword_item in keywords_top50]) results[uid]['keywords'] = json.dumps(keywords_top50) results[uid]['keywords_string'] = keywords_top50_string #school dict school_dict = iter_results[uid]['school'] school_string = '&'.join(school_dict.keys()) if school_dict != {}: is_school = '1' else: is_school = '0' results[uid]['is_school'] = is_school results[uid]['school_string'] = school_string results[uid]['school_dict'] = json.dumps(school_dict) #ip: job_ip&home_ip&activity_ip #activity_ip all_ip_dict = iter_results[uid]['ip'] sort_all_ip = sorted(all_ip_dict.items(), key=lambda x: x[1], reverse=True) try: activity_ip = sort_all_ip[0][0] except: activity_ip = '' results[uid]['activity_ip'] = str(activity_ip) #job_ip & home_ip week_time_ip_dict = iter_results[uid]['week_ip'] for i in range(0, 6): try: segment_dict = week_time_ip_dict[i] except: week_time_ip_dict[i] = {} home_ip, job_ip = get_ip_description(week_time_ip_dict) results[uid]['home_ip'] = str(home_ip) results[uid]['job_ip'] = str(job_ip) return results
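get_ip_description() is not shown in this file; given the structure built above (six IP_TIME_SEGMENT buckets per day, each mapping ip -> visit count), one plausible reading, purely illustrative, is that the night-time buckets vote for home_ip and the working-hour buckets for job_ip:

def get_ip_description_sketch(week_time_ip_dict,
                              night_segments=(0, 5),   # assumed: roughly 00:00-04:00 and 20:00-24:00
                              work_segments=(2, 3)):   # assumed: roughly 08:00-16:00
    def top_ip(segments):
        merged = {}
        for seg in segments:
            for ip, cnt in week_time_ip_dict.get(seg, {}).items():
                merged[ip] = merged.get(ip, 0) + cnt
        return max(merged, key=merged.get) if merged else ''
    # returned in the same order the caller above unpacks: home_ip, job_ip
    return top_ip(night_segments), top_ip(work_segments)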
import sys
import time
reload(sys)
sys.path.append('../../')
from time_utils import ts2datetime, ts2yeartime, datetime2ts
from parameter import WARMING_DAY, MAX_VALUE, DAY
from global_config import S_TYPE, S_DATE_BCI, S_DATE_WARMING
from elasticsearch import Elasticsearch
from global_utils import es_xnr as es
from global_utils import weibo_user_warning_index_name_pre, weibo_user_warning_index_type,\
    weibo_event_warning_index_name_pre, weibo_event_warning_index_type,\
    weibo_speech_warning_index_name_pre, weibo_speech_warning_index_type,\
    weibo_timing_warning_index_name_pre, weibo_timing_warning_index_type,\
    weibo_date_remind_index_name, weibo_date_remind_index_type,\
    weibo_warning_corpus_index_name, weibo_warning_corpus_index_type

NOW_DATE = ts2datetime(int(time.time()) - DAY)

def weibo_user_warning_mappings(index_name):
    index_info = {
        'settings': {
            'number_of_replicas': 0,
            'number_of_shards': 5
        },
        'mappings': {
            weibo_user_warning_index_type: {
                'properties': {
                    'xnr_user_no': {  # virtual persona (xnr) id
                        'type': 'string',
                        'index': 'not_analyzed'
                    },
                    'user_name': {  # nickname of the warned user
def organize_feature(mid, topic): if RUN_TYPE: ts = time.time() else: ts = datetime2ts("2016-11-17") index_list = [] for i in range(7): index_list.append("flow_text_" + ts2datetime(ts - i * 24 * 3600)) result = dict() for iter_index in index_list: if not es.indices.exists(index=iter_index): continue try: result = es.get(index=iter_index, doc_type="text", id=mid)["_source"] break except: pass if not result: return [0, 0, 0, 0, 0, 0, 0] ts = result["timestamp"] query_body = {"query": {"term": {"root_mid": mid}}} #total_weibo #count = es.count(index=index_list, doc_type="text", body=query_body)["count"] query_body_uid = { "query": { "term": { "root_mid": mid } }, "aggs": { "uid_count": { "cardinality": { "field": "uid" } } } } # total_uid #total_uid_count = es.search(index=index_list, doc_type="text", body=query_body_uid)['aggregations']["uid_count"]["value"] feature_list = [] feature_list.append(math.log(result["user_fansnum"] + 1)) query_body_ts = { "query": { "bool": { "must": [{ "term": { "root_mid": mid } }, { "range": { "timestamp": { "lt": ts + 3600 * 10 } } }] } }, "aggs": { "weibo_type": { "terms": { "field": "message_type" } } } } comment = 0 retweet = 0 tmp_count = es.search( index=index_list, doc_type="text", body=query_body_ts)['aggregations']["weibo_type"]["buckets"] if tmp_count: for item in tmp_count: if int(item["key"]) == 2: comment = item["doc_count"] elif int(item["key"]) == 3: retweet = item["doc_count"] feature_list.append(comment + retweet) feature_list.append(retweet) feature_list.append(comment) feature_list.append(retweet / float(comment + retweet + 1)) feature_list.append(comment / float(comment + retweet + 1)) query_body_uid = { "query": { "bool": { "must": [{ "term": { "root_mid": mid } }, { "range": { "timestamp": { "lt": ts + 3600 * 10 } } }] } }, "aggs": { "uid_count": { "cardinality": { "field": "uid" } } } } uid_count = es.search( index=index_list, doc_type="text", body=query_body_uid)['aggregations']["uid_count"]["value"] feature_list.append(uid_count) #feature_list.append(topic_field_dict[topic]) return feature_list
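The feature vector returned above has a fixed positional order (log follower count, interaction counts within the first ten hours, ratios, distinct spreaders). A small wrapper can make that order explicit for downstream models; the names here are descriptive labels of mine, not the project's:

FEATURE_NAMES = ['log_fansnum', 'interactions_10h', 'retweets_10h', 'comments_10h',
                 'retweet_ratio', 'comment_ratio', 'distinct_uids_10h']

def organize_feature_named(mid, topic):
    return dict(zip(FEATURE_NAMES, organize_feature(mid, topic)))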
k = int(len(g.nodes()) * ratio_pm) items = db.session.query(BetweenessCentralityUser).filter(BetweenessCentralityUser.topic==topic ,\ BetweenessCentralityUser.rank<=k ,\ BetweenessCentralityUser.windowsize==windowsize ,\ BetweenessCentralityUser.date==date).all() positive_m = [] for item in items: positive_m.append(item.userid) print 'len(positive_m):', k print 'positive_m:', positive_m return positive_m if __name__=='__main__': g = nx.read_gexf('test_graph.gexf') positive_m = [] source = [] topic = u'东盟,博览会' windowsize = 6 end_ts = datetime2ts('2013-09-08') date = ts2datetime(end_ts) ratio_pm = 1 / float(30) positive_m = Get_pm(g, ratio_pm, topic, windowsize, date) while positive_m: #print 'positive_m, source:', len(positive_m), len(source) source, positive_m = Get_s(g, positive_m, source) print 'positive_m, source:', len(positive_m), len(source) save_source(source)
def delete_files():
    localtime = int(time.time()) - 24 * 3600  # delete files left over from the previous day
    print "time to delete files ..."
    count = 0
    file_list = os.listdir(BIN_FILE_PATH)
    for each in file_list:
        file_name = each.split('.')[0]
        file_timestamp = int(file_name.split('_')[0])
        if file_timestamp < localtime:
            os.remove(os.path.join(BIN_FILE_PATH, each))
            count += 1
    print 'deleted %s files at time %s' % (count, localtime)


if __name__ == "__main__":
    context = zmq.Context()
    controller = context.socket(zmq.PUB)
    controller.bind("tcp://%s:%s" % (ZMQ_CTRL_HOST_FLOW1, ZMQ_CTRL_VENT_PORT_FLOW5))
    for i in range(20):
        time.sleep(0.1)
        controller.send("PAUSE")  # send repeatedly so subscribers are sure to receive it
    ts = ts2datetime(time.time())
    print "stop_zmq&stop-flow2*%s" % ts
    #delete_files()
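The __main__ block above PUBlishes the "PAUSE" token to stop downstream workers; the receiving side (which lives in other scripts) would subscribe to the same endpoint and exit when it sees the token. A minimal, illustrative sketch of such a receiver:

import zmq

def wait_for_pause(host, port):
    context = zmq.Context()
    controller = context.socket(zmq.SUB)
    controller.connect("tcp://%s:%s" % (host, port))
    controller.setsockopt(zmq.SUBSCRIBE, "")   # no topic filter: receive every message
    while True:
        if controller.recv() == "PAUSE":
            return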
def create_user_log(): now_time=int(time.time()) today_datetime=datetime2ts(ts2datetime(now_time)) start_time=today_datetime-DAY #前一天0点 end_time=today_datetime #定时文件启动的0点 operate_date=ts2datetime(start_time) #查询账户列表 user_name_list=get_user_account_list() mark_list=[] #查询账户所管理的虚拟人 for user_account in user_name_list: #对应账户的日志ID #print 'user_account',user_account,type(user_account) #print 'operate_date',operate_date,type(operate_date) user_account=list(user_account)[0] log_id=str(user_account)+'_'+operate_date print log_id log_content_dict=dict() ########################################################################### #微博部分日志 ########################################################################### #账户是否创建虚拟人 xnr_number=create_xnr_number(user_account,start_time,end_time) if xnr_number > 0: log_content_dict[u'创建微博虚拟人']=xnr_number else: pass xnr_user_no_list=get_user_xnr_list(user_account) xnr_uid_list=get_xnr_uid_list(user_account) #遍历各个模块,验证所管理虚拟人是否进行操作 ##################发帖操作################# #日常发帖 daily_post_type='daily_post' daily_post_num=count_type_posting(daily_post_type,operate_date,xnr_user_no_list) if daily_post_num > 0: log_content_dict[u'微博-日常发帖']=daily_post_num else: pass #业务发帖 business_post_type='business_post' business_post_num=count_type_posting(business_post_type,operate_date,xnr_user_no_list) if business_post_num > 0: log_content_dict[u'微博-业务发帖']=business_post_num else: pass #热点跟随 hot_post_type='hot_post' hot_post_num=count_type_posting(hot_post_type,operate_date,xnr_user_no_list) if hot_post_num > 0: log_content_dict[u'微博-热点跟随']=hot_post_num else: pass #跟踪转发 retweet_timing_num=count_tweet_retweet(start_time,end_time,xnr_user_no_list) if retweet_timing_num > 0: log_content_dict[u'微博-跟踪转发']=retweet_timing_num else: pass ##################社交操作:转发、评论、点赞################# #转发 retweet_type='3' retweet_num=count_retweet_comment_operate(retweet_type,operate_date,xnr_uid_list) if retweet_num > 0: log_content_dict[u'微博-转发']=retweet_num else: pass #评论 comment_type='2' comment_num=count_retweet_comment_operate(comment_type,operate_date,xnr_uid_list) if comment_num > 0: log_content_dict[u'微博-评论']=comment_num else: pass #点赞 like_num=count_like_operate(start_time,end_time,xnr_uid_list) if like_num > 0: log_content_dict[u'微博-点赞']=like_num else: pass #私信 private_message_num=count_private_message(start_time,end_time,xnr_uid_list) if private_message_num > 0: log_content_dict[u'微博-私信']=private_message_num else: pass ##################加入语料################# add_corpus_num=count_add_corpus(start_time,end_time,xnr_user_no_list) if add_corpus_num > 0: log_content_dict[u'微博-加入语料']=add_corpus_num else: pass ##################上报操作################# report_num=count_report_type(start_time,end_time,xnr_user_no_list) if report_num > 0: log_content_dict[u'微博-上报']=report_num else: pass ##################加入预警库################# add_warming_num=count_add_warming_speech(start_time,end_time,xnr_user_no_list) if add_warming_num > 0: log_content_dict[u'微博-加入预警库']=add_warming_num else: pass ##################定时任务################# timing_task_num=count_add_timing_task(start_time,end_time,xnr_user_no_list) if timing_task_num > 0: log_content_dict[u'微博-创建定时任务']=timing_task_num else: pass ########################################################################### ########################################################################### ##################领域创建################# create_domain_num=count_create_domain(user_account,start_time,end_time) if create_domain_num > 0: log_content_dict[u'领域创建']=create_domain_num 
else: pass ##################业务知识库################# #敏感词创建 create_sensitive_words_num=count_create_business(user_account,start_time,end_time,weibo_sensitive_words_index_name,weibo_sensitive_words_index_type) if create_sensitive_words_num > 0: log_content_dict[u'创建敏感词']=create_sensitive_words_num else: pass #时间节点创建 create_date_remind_num=count_create_business(user_account,start_time,end_time,weibo_date_remind_index_name,weibo_date_remind_index_type) if create_date_remind_num > 0: log_content_dict[u'创建时间节点']=create_date_remind_num else: pass #隐喻式表达创建 create_hidden_expression_num=count_create_business(user_account,start_time,end_time,weibo_hidden_expression_index_name,weibo_hidden_expression_index_type) if create_hidden_expression_num > 0: log_content_dict[u'创建隐喻式表达']=create_hidden_expression_num else: pass ########################################################################### #QQ部分日志 ########################################################################### #账户是否创建QQ虚拟人 qq_xnr_number=create_qqxnr_number(user_account,start_time,end_time) if qq_xnr_number > 0: log_content_dict[u'创建QQ虚拟人']=qq_xnr_number else: pass qq_xnr_user_no_list=get_user_qqxnr_list(user_account) #今日发帖量 qqxnr_daily_post = count_qqxnr_daily_post(operate_date,qq_xnr_user_no_list) if qqxnr_daily_post > 0: log_content_dict[u'QQ-发言']=qqxnr_daily_post else: pass #上报数量 qq_report_number=count_qq_report_number(start_time,end_time,qq_xnr_user_no_list) if qq_report_number > 0: log_content_dict[u'QQ-上报']=qq_report_number else: pass log_content=json.dumps(log_content_dict) #写入日志 #日志ID存在判断 try: es_xnr.update(index=weibo_log_management_index_name,doc_type=weibo_log_management_index_type,id=log_id,body={'doc':{'operate_content':log_content}}) mark=True except: mark=False mark_list.append(mark) return mark_list
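Every block in create_user_log() repeats the same "record the count only if it is non-zero" check; a tiny helper (sketch only) captures that pattern once:

def record_count(log_content_dict, label, num):
    # only non-zero operation counts end up in the daily log
    if num > 0:
        log_content_dict[label] = num

# e.g. record_count(log_content_dict, u'微博-日常发帖',
#                   count_type_posting('daily_post', operate_date, xnr_user_no_list))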
def scan_retweet(ft_type):
    count = 0
    scan_cursor = 0
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    #get redis db number
    db_number = get_db_num(now_date_ts)
    if ft_type == 'fb':
        retweet_redis_dict = fb_retweet_dict
        index_name = 'fb_be_retweet_' + str(db_number)
    else:
        retweet_redis_dict = tw_retweet_dict
        index_name = 'tw_be_retweet_' + str(db_number)
    #get redis db
    retweet_redis = retweet_redis_dict[str(db_number)]
    # # 1. check whether the db we are about to switch to already has data
    # while 1:
    #     redis_host_list.pop(db_number)
    #     other_db_number = retweet_redis_dict[redis_host_list[0]]  # get the corresponding redis connection
    #     current_dbsize = other_db_number.dbsize()
    #     if current_dbsize:
    #         break  # writing to the new db has started, so the previous day's data is complete
    #     else:
    #         time.sleep(60)
    # 2. drop the previous es index (recreate the mapping)
    be_retweet_es_mappings(str(db_number), ft_type)
    # 3. scan
    retweet_bulk_action = []
    be_retweet_bulk_action = []
    start_ts = time.time()
    #retweet count/be_retweet count
    #retweet_count = 0
    be_retweet_count = 0
    while True:
        re_scan = retweet_redis.scan(scan_cursor, count=100)
        re_scan_cursor = re_scan[0]
        '''
        if re_scan_cursor == 0:
            print 'scan finish'
            if retweet_bulk_action != []:
                es.bulk(retweet_bulk_action, index='retweet_'+str(db_number), doc_type='user')
            if be_retweet_bulk_action != []:
                es.bulk(be_retweet_bulk_action, index='be_retweet_'+str(db_number), doc_type='user')
            break
        '''
        for item in re_scan[1]:
            count += 1
            item_list = item.split('_')
            save_dict = {}
            if len(item_list) == 3:
                be_retweet_count += 1
                uid = item_list[2]
                item_result = retweet_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_be_retweet'] = json.dumps(item_result)
                be_retweet_bulk_action.extend([{'index': {'_id': uid}}, save_dict])
        #print 'be_retweet_bulk_action...',be_retweet_bulk_action
        if be_retweet_bulk_action:
            es.bulk(be_retweet_bulk_action, index=index_name, doc_type='user')
        else:
            break
        retweet_bulk_action = []
        be_retweet_bulk_action = []
        end_ts = time.time()
        print '%s sec scan %s count user' % (end_ts - start_ts, count)
        start_ts = end_ts
        #advance the cursor; a cursor of 0 means the scan has wrapped around
        scan_cursor = re_scan[0]
        if scan_cursor == 0:
            break
    print 'count:', count
    # flushdb
    retweet_redis.flushdb()
    print 'end'
'type':'long' }, 'socail_keyword':{ #社区初始关键词 'type':'string', 'index':'not_analyzed' }, 'warning_type':{ #预警类型:人物突增预警;影响力剧增预警;敏感度剧增预警;社区聚集预警 'type':'string', 'index':'not_analyzed' } } } } } weibo_community_index_name = weibo_community_index_name_pre + date_name if not es.indices.exists(index=weibo_community_index_name): es.indices.create(index=weibo_community_index_name,body=index_info,ignore=400) if __name__ == '__main__': if S_TYPE == 'test': date_name = WEIBO_COMMUNITY_DATE else: now_time = int(time.time()) date_name = ts2datetime(now_time) weibo_community_mappings(date_name)
f.write(json.dumps(group_evaluate(xnr_user_no,v,all_influence,all_sensitive,G))+'\n') #print 'total time:',time.time()-s #print 'eq:',ExtendQ(allG,coms_list) mark = True except: mark = False return mark if __name__ == '__main__': start_time = int(time.time()) if S_TYPE == 'test': today_time = datetime2ts(WEIBO_COMMUNITY_DATE) xnr_user_no_list = ['WXNR0004'] else: today_time = time.time() - 1*DAY #today_time = datetime2ts('2018-07-15') print ts2datetime(today_time) xnr_user_no_list = get_compelete_wbxnr() # xnr_user_no_list = ['WXNR0004'] for xnr_user_no in xnr_user_no_list: create_weibo_community(xnr_user_no,today_time) end_time = int(time.time()) print 'cost_time:::',end_time - start_time
def get_hot_sensitive_recommend_at_user(sort_item): if S_TYPE == 'test': now_ts = datetime2ts(S_DATE_FB) else: now_ts = int(time.time()) datetime = ts2datetime(now_ts-24*3600) #sort_item = 'sensitive' sort_item_2 = 'timestamp' index_name = facebook_flow_text_index_name_pre + datetime query_body = { 'query':{ 'match_all':{} }, 'sort':{sort_item:{'order':'desc'}}, 'size':HOT_EVENT_TOP_USER, '_source':['uid','user_fansnum','retweeted','timestamp'] } # if sort_item == 'retweeted': # sort_item_2 = 'timestamp' # else: # sort_item_2 = 'retweeted' es_results = es.search(index=index_name,doc_type=facebook_flow_text_index_type,body=query_body)['hits']['hits'] uid_fansnum_dict = dict() if es_results: for result in es_results: result = result['_source'] uid = result['uid'] uid_fansnum_dict[uid] = {} uid_fansnum_dict[uid][sort_item_2] = result[sort_item_2] uid_fansnum_dict_sort_top = sorted(uid_fansnum_dict.items(),key=lambda x:x[1][sort_item_2],reverse=True) uid_set = set() for item in uid_fansnum_dict_sort_top: uid_set.add(item[0]) uid_list = list(uid_set) ## 根据uid,从weibo_user中得到 nick_name uid_nick_name_dict = dict() # uid不会变,而nick_name可能会变 es_results_user = es.mget(index=facebook_user_index_name,doc_type=facebook_user_index_type,body={'ids':uid_list})['docs'] i = 0 for result in es_results_user: if result['found'] == True: result = result['_source'] uid = result['uid'] nick_name = result['name'] if nick_name: i += 1 uid_nick_name_dict[uid] = nick_name if i >= HOT_AT_RECOMMEND_USER_TOP: break return uid_nick_name_dict
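One detail worth noting above: the uids are sorted by sort_item_2 and then put into a set(), which discards that ordering before the HOT_AT_RECOMMEND_USER_TOP cut-off is applied later. If the ranking is meant to survive, keeping a list preserves it (sketch):

def ordered_uids(uid_fansnum_dict_sort_top):
    # keep the ranking produced by sorted(); set() would discard it
    return [item[0] for item in uid_fansnum_dict_sort_top]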