def get_user_geo(uid):
    result = []
    user_geo_result = {}
    user_ip_dict = {}
    user_ip_result = dict()
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    #run_type
    if RUN_TYPE == 1:
        ts = datetime2ts(now_date)
    else:
        ts = datetime2ts(RUN_TEST_TIME)
    for i in range(1, 8):
        ts = ts - 3600*24
        results = r_cluster.hget('new_ip_'+str(ts), uid)
        if results:
            ip_dict = json.loads(results)
            for ip in ip_dict:
                ip_count = len(ip_dict[ip].split('&'))
                try:
                    user_ip_result[ip] += ip_count
                except:
                    user_ip_result[ip] = ip_count
    user_geo_dict = ip2geo(user_ip_result)
    user_geo_result = sorted(user_geo_dict.items(), key=lambda x:x[1], reverse=True)
    return user_geo_result
def get_user_weibo(uid):
    result = []
    #use to test
    datestr = '2013-09-02'
    end_ts = datetime2ts(datestr)
    #real way to get datestr and ts_segment
    '''
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    now_date_ts = datetime2ts(now_date)
    ts_segment = (int((now_ts - now_date_ts) / 3600)) % 24
    end_ts = now_date_ts + ts_segment * 3600
    '''
    file_list = set(os.listdir(DEFAULT_LEVELDBPATH))
    for i in range(24 * 7, 0, -1):
        ts = end_ts - i * 3600
        datestr = ts2datetime(ts)
        ts_segment = (int((ts - datetime2ts(datestr)) / 3600)) % 24 + 1
        leveldb_folder = datestr + str(ts_segment)
        if leveldb_folder in file_list:
            leveldb_bucket = dynamic_leveldb(leveldb_folder)
            try:
                user_weibo = leveldb_bucket.Get(uid)
                weibo_list = json.loads(user_weibo)
                result.extend(weibo_list)
            except:
                pass
    return result
def get_user_ip(uid):
    flow_text_index_list = []
    now_timestamp = datetime2ts(ts2datetime(time.time()))
    if RUN_TYPE == 0:
        now_timestamp = datetime2ts(RUN_TEST_TIME)
    for i in range(7, 0, -1):
        iter_date = ts2datetime(now_timestamp - DAY * i)
        flow_text_index_list.append(flow_text_index_name_pre + iter_date)
    weibo_all = es_flow_text.search(index=flow_text_index_list,
                                    doc_type=flow_text_index_type,
                                    body={
                                        'query': {
                                            'filtered': {
                                                'filter': {
                                                    'term': {'uid': uid}
                                                }
                                            }
                                        },
                                        'size': 10,
                                    })['hits']['hits']
    ip = weibo_all[0]["_source"]["ip"]
    return ip
def search_sentiment_all_portrait(start_date, end_date, time_segment):
    sentiment_ts_count_dict = {}
    start_ts = datetime2ts(start_date)
    end_ts = datetime2ts(end_date)
    search_date_list = []
    domain_list = domain_en2ch_dict.keys()
    for i in range(start_ts, end_ts + DAY, DAY):
        iter_date = ts2datetime(i)
        search_date_list.append(iter_date)
    for sentiment in sentiment_type_list:
        sentiment_ts_count_dict[sentiment] = []
        for date_item in search_date_list:
            ts_count_result_list = []
            for domain in domain_list:
                iter_r_name = r_domain_sentiment_pre + date_item + '_' + sentiment + '_' + domain
                #get ts_count_dict in one day
                ts_count_result = R_DOMAIN_SENTIMENT.hgetall(iter_r_name)
                ts_count_result_list.append(ts_count_result)
            #union all domain to get all portrait
            all_ts_count_result = union_dict(ts_count_result_list)
            #get x and y list by timesegment
            new_ts_count_dict = get_new_ts_count_dict(all_ts_count_result, time_segment, date_item)
            sort_new_ts_count = sorted(new_ts_count_dict.items(), key=lambda x:x[0])
            sentiment_ts_count_dict[sentiment].extend(sort_new_ts_count)
    return sentiment_ts_count_dict
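# Hedged sketch: union_dict is used above (and in search_mention below) but not
# defined in this section. Assuming it merges a list of {field: count} dicts by
# summing the counts per key (redis hgetall returns string values, hence the
# int() casts), a minimal implementation could look like this:
def union_dict(dict_list):
    union_result = {}
    for item_dict in dict_list:
        for key in item_dict:
            try:
                union_result[key] += int(item_dict[key])
            except KeyError:
                union_result[key] = int(item_dict[key])
    return union_result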
def new_get_influence_trend(uid, time_segment):
    results = {}
    try:
        influence_history = ES_COPY_USER_PORTRAIT.get(index=COPY_USER_PORTRAIT_INFLUENCE, doc_type=COPY_USER_PORTRAIT_INFLUENCE_TYPE,\
                id=uid)['_source']
        print ES_COPY_USER_PORTRAIT, COPY_USER_PORTRAIT_INFLUENCE, COPY_USER_PORTRAIT_INFLUENCE_TYPE, uid
        print influence_history
    except:
        influence_history = {}
    if influence_history:
        results = get_evaluate_trend(influence_history, 'bci')
    else:
        results = {}
    print results
    #deal results for situation---server power off
    new_time_list = []
    new_count_list = []
    new_results = {}
    now_time_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_time_ts))
    if RUN_TYPE == 0:
        now_date_ts = datetime2ts(RUN_TEST_TIME)
    for i in range(time_segment, 0, -1):
        iter_date_ts = now_date_ts - i * DAY
        try:
            date_count = results[iter_date_ts]
        except:
            date_count = 0
        new_time_list.append(iter_date_ts)
        new_count_list.append(date_count)
    new_results = {'timeline': new_time_list, 'evaluate_index': new_count_list}
    return new_results
def get_user_detail(date, input_result, status, user_type="influence", auth=""):
    bci_date = ts2datetime(datetime2ts(date) - DAY)
    results = []
    if status=='show_in':
        uid_list = input_result
    if status=='show_compute':
        uid_list = input_result.keys()
    if status=='show_in_history':
        uid_list = input_result.keys()
    if date!='all':
        index_name = 'bci_' + ''.join(bci_date.split('-'))
    else:
        now_ts = time.time()
        now_date = ts2datetime(now_ts)
        index_name = 'bci_' + ''.join(now_date.split('-'))
    tmp_ts = str(datetime2ts(date) - DAY)
    sensitive_string = "sensitive_score_" + tmp_ts
    query_sensitive_body = {
        "query":{
            "match_all":{}
        },
        "size":1,
        "sort":{sensitive_string:{"order":"desc"}}
    }
    try:
        top_sensitive_result = es_bci_history.search(index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX,\
                body=query_sensitive_body, _source=False, fields=[sensitive_string])['hits']['hits']
        top_sensitive = top_sensitive_result[0]['fields'][sensitive_string][0]
    except Exception, reason:
        print Exception, reason
        top_sensitive = 400
def get_psycho_status(uid_list):
    results = {}
    uid_sentiment_dict = {}
    #time for es_flow_text
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    #run_type
    if RUN_TYPE == 0:
        now_date_ts = datetime2ts(RUN_TEST_TIME)
    start_date_ts = now_date_ts - DAY * WEEK
    for i in range(0, WEEK):
        iter_date_ts = start_date_ts + DAY * i
        flow_text_index_date = ts2datetime(iter_date_ts)
        flow_text_index_name = flow_text_index_name_pre + flow_text_index_date
        try:
            flow_text_exist = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\
                    body={'query':{'filtered':{'filter':{'terms':{'uid': uid_list}}}}, 'size': MAX_VALUE},\
                    _source=False, fields=['uid', 'sentiment'])['hits']['hits']
        except:
            flow_text_exist = []
        for flow_text_item in flow_text_exist:
            uid = flow_text_item['fields']['uid'][0]
            sentiment = flow_text_item['fields']['sentiment'][0]
            if uid in uid_sentiment_dict:
                try:
                    uid_sentiment_dict[uid][str(sentiment)] += 1
                except:
                    uid_sentiment_dict[uid][str(sentiment)] = 1
            else:
                uid_sentiment_dict[uid] = {str(sentiment): 1}
    #compute first and second psycho_status
    for uid in uid_list:
        results[uid] = {'first': {}, 'second': {}}
        try:
            user_sentiment_result = uid_sentiment_dict[uid]
        except:
            user_sentiment_result = {}
        all_count = sum(user_sentiment_result.values())
        #compute second level sentiment---negative type sentiment
        second_sentiment_count_list = [user_sentiment_result[item] for item in user_sentiment_result if item in SENTIMENT_SECOND]
        second_sentiment_all_count = sum(second_sentiment_count_list)
        for sentiment_item in SENTIMENT_SECOND:
            try:
                results[uid]['second'][sentiment_item] = float(user_sentiment_result[sentiment_item]) / all_count
            except:
                results[uid]['second'][sentiment_item] = 0
        #compute first level sentiment---middle, positive, negative
        user_sentiment_result['7'] = second_sentiment_all_count
        for sentiment_item in SENTIMENT_FIRST:
            try:
                sentiment_ratio = float(user_sentiment_result[sentiment_item]) / all_count
            except:
                sentiment_ratio = 0
            results[uid]['first'][sentiment_item] = sentiment_ratio
    return results
def get_user_geo(uid):
    result = []
    user_geo_result = {}
    user_ip_dict = {}
    user_ip_result = dict()
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    #run_type
    if RUN_TYPE == 1:
        ts = datetime2ts(now_date)
    else:
        ts = datetime2ts(RUN_TEST_TIME)
    for i in range(1, 8):
        ts = ts - 3600*24
        results = r_cluster2.hget('new_ip_'+str(ts), uid)
        if results:
            ip_dict = json.loads(results)
            for ip in ip_dict:
                ip_count = len(ip_dict[ip].split('&'))
                try:
                    user_ip_result[ip] += ip_count
                except:
                    user_ip_result[ip] = ip_count
    user_geo_dict = ip2geo(user_ip_result)
    user_geo_result = sorted(user_geo_dict.items(), key=lambda x:x[1], reverse=True)
    return user_geo_result
def get_user_geo(uid):
    result = []
    user_geo_result = {}
    user_ip_dict = {}
    user_ip_result = dict()
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    ts = datetime2ts(now_date)
    #test
    ts = datetime2ts('2013-09-08')
    for i in range(1, 8):
        ts = ts - 3600*24
        results = r_cluster.hget('ip_'+str(ts), uid)
        if results:
            ip_dict = json.loads(results)
            for ip in ip_dict:
                try:
                    user_ip_result[ip] += ip_dict[ip]
                except:
                    user_ip_result[ip] = ip_dict[ip]
    #print 'user_ip_result:', user_ip_result
    user_geo_dict = ip2geo(user_ip_result)
    user_geo_result = sorted(user_geo_dict.items(), key=lambda x:x[1], reverse=True)
    return user_geo_result
def show_user_operation_index(admin_user, start_ts, end_ts):
    results = {}
    if admin_user == '' and start_ts == '' and end_ts == '':
        query_body = {
            'query': {
                'match_all': {}
            },
            'size': MAX_VALUE
        }
    elif admin_user and start_ts and end_ts:
        start = datetime2ts(start_ts) - 1
        end = datetime2ts(end_ts) + DAY
        query_body = {
            'query': {
                'bool': {
                    'must': [
                        {'term': {'admin_user': admin_user}},
                        {'range': {'timestamp': {'gte': start, 'lt': end}}}
                    ]
                }
            },
            'size': MAX_VALUE
        }
    elif admin_user and (start_ts == '' or end_ts == ''):
        query_body = {
            'query': {
                'term': {'admin_user': admin_user}
            },
            'size': MAX_VALUE
        }
    elif admin_user == '' and start_ts != '' and end_ts != '':
        start = datetime2ts(start_ts) - 1
        end = datetime2ts(end_ts) + DAY
        query_body = {
            'query': {
                'range': {'timestamp': {'gte': start, 'lt': end}}
            },
            'size': MAX_VALUE
        }
    try:
        results = es_operation.search(index=operation_index_name, doc_type=operation_index_type, body=query_body)['hits']['hits']
    except:
        results = []
    return_results = []
    all_operation_dict = {}
    stat_operation_list = ['rank_count', 'compute_count', 'sentiment_count',\
            'recomment_count', 'detect_count', 'analysis_count', 'tag_count', 'network_count', 'sensing_count']
    for item in results:
        source = item['_source']
        return_results.append(source)
        for stat_item in stat_operation_list:
            try:
                all_operation_dict[stat_item] += source[stat_item]
            except:
                all_operation_dict[stat_item] = source[stat_item]
    sort_return_results = sorted(return_results, key=lambda x:x['timestamp'], reverse=False)
    if sort_return_results:
        sort_return_results.append(all_operation_dict)
    return sort_return_results
def get_influence_content(uid, timestamp_from, timestamp_to):
    weibo_list = []
    #split timestamp range to new_range_dict_list
    from_date_ts = datetime2ts(ts2datetime(timestamp_from))
    to_date_ts = datetime2ts(ts2datetime(timestamp_to))
    new_range_dict_list = []
    if from_date_ts != to_date_ts:
        iter_date_ts = from_date_ts
        while iter_date_ts < to_date_ts:
            iter_next_date_ts = iter_date_ts + DAY
            new_range_dict_list.append({'range':{'timestamp':{'gte':iter_date_ts, 'lt':iter_next_date_ts}}})
            iter_date_ts = iter_next_date_ts
        if new_range_dict_list[0]['range']['timestamp']['gte'] < timestamp_from:
            new_range_dict_list[0]['range']['timestamp']['gte'] = timestamp_from
        if new_range_dict_list[-1]['range']['timestamp']['lt'] > timestamp_to:
            new_range_dict_list[-1]['range']['timestamp']['lt'] = timestamp_to
    else:
        new_range_dict_list = [{'range':{'timestamp':{'gte':timestamp_from, 'lt':timestamp_to}}}]
    #iter date to search flow_text
    iter_result = []
    for range_item in new_range_dict_list:
        range_from_ts = range_item['range']['timestamp']['gte']
        range_from_date = ts2datetime(range_from_ts)
        flow_text_index_name = flow_text_index_name_pre + range_from_date
        query = []
        query.append({'term':{'uid':uid}})
        query.append(range_item)
        try:
            flow_text_exist = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\
                    body={'query':{'bool':{'must': query}}, 'sort':[{'timestamp':'asc'}]})['hits']['hits']
        except:
            flow_text_exist = []
        iter_result.extend(flow_text_exist)
        # get weibo list
        for item in flow_text_exist:
            source = item['_source']
            weibo = {}
            weibo['timestamp'] = ts2date(source['timestamp'])
            weibo['ip'] = source['ip']
            weibo['text'] = source['text']
            weibo['geo'] = '\t'.join(source['geo'].split('&'))
            weibo_list.append(weibo)
    return weibo_list
def adsRec(uid, queryInterval=HOUR * 24):
    '''
    Read the ads weibo posted within queryInterval before the current time from
    the ads table, then recommend ads according to the user's key_word information.
    :param uid: user ID
    :param queryInterval: how far back to query for ads
    :return: list of ads weibo, sorted by relevance (degree of interest)
    '''
    # run type:
    # 0 -> "now" is pinned to 2013-9-8 00:00:00
    # 1 -> real current time
    now_date = ts2datetime(time.time()) if RUN_TYPE == 1 else ts2datetime(datetime2ts(RUN_TEST_TIME) - DAY)
    # get the user's preference
    try:
        print uid
        user_portrait_result = es_user_portrait. \
            get_source(index=portrait_index_name, doc_type=profile_index_type, id=uid)
    except:
        return None
    user_key_words = set(user_portrait_result["keywords_string"].split("&"))
    # read directly from the ads table and compute
    ads_weibo_all = es_ads_weibo.search(
        index=ads_weibo_index_name,
        doc_type=ads_weibo_index_type,
        body={
            'query': {
                "filtered": {
                    "filter": {
                        "range": {
                            "timestamp": {
                                "gte": datetime2ts(now_date) - queryInterval
                            }
                        }
                    }
                }
            },
            'size': 2000,
        })['hits']['hits']
    random.shuffle(ads_weibo_all)
    ads_weibo_all = ads_weibo_all[:800]
    # get per-topic word weights (TF-IDF)
    topic_word_weight_dic = construct_topic_word_weight_dic(ADS_TOPIC_TFIDF_DIR)
    # derive the user's distribution over ad topics from the keywords of the
    # user's weibo, since the existing topics are not well suited to ad categories
    user_topic_dic = construct_topic_feature_dic(user_key_words, topic_word_weight_dic)
    ads_weibo_prefer = adsPreferred(user_topic_dic, ads_weibo_all, topic_word_weight_dic, 30)
    return ads_weibo_prefer
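# Hedged usage sketch for adsRec; the uid is a placeholder and the item format
# of the returned list is an assumption (adsPreferred is not shown in this section).
ads = adsRec('1234567890', queryInterval=HOUR * 24)
if ads is None:
    print 'uid not found in user_portrait'
else:
    print len(ads)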
def compare_user_activity(uid_list):
    result = {}  # output data: {user:[weibo_status]}, {user:[(date,weibo)]}, ts_list
    timesegment_result = {}
    now_ts = time.time()
    date = ts2datetime(now_ts)
    #run_type
    if RUN_TYPE == 1:
        ts = datetime2ts(date)
    else:
        ts = datetime2ts(RUN_TEST_TIME)
    for i in range(1, 8):
        ts = ts - DAY
        hash_name = 'activity_' + str(ts)
        r_result = r_cluster.hmget(hash_name, uid_list)
        if r_result:
            count = 0
            for r_item in r_result:
                if r_item:
                    r_item = json.loads(r_item)
                    if uid_list[count] not in result:
                        result[uid_list[count]] = {}
                    if uid_list[count] not in timesegment_result:
                        timesegment_result[uid_list[count]] = {}
                count += 1
                if r_item:
                    time_result = dict()
                    for segment in r_item:
                        try:
                            result[uid_list[count-1]][int(segment)/16*15*60*16+ts] += r_item[segment]
                        except:
                            result[uid_list[count-1]][int(segment)/16*15*60*16+ts] = r_item[segment]
                        try:
                            timesegment_result[uid_list[count-1]][int(segment)/16*15*60*16] += r_item[segment]
                        except:
                            timesegment_result[uid_list[count-1]][int(segment)/16*15*60*16] = r_item[segment]
    user_list = {}
    user_timesegment_list = {}
    ts_list = []
    for user in result:
        timesegment_dict = timesegment_result[user]
        sort_segment = sorted(timesegment_dict.items(), key=lambda x:x[1], reverse=True)
        segment_top = sort_segment[:3]
        user_timesegment_list[user] = segment_top
        user_dict = result[user]
        for i in range(0, 42):
            timestamp = ts + 15*60*16*i
            if len(ts_list) < 42:
                ts_list.append(timestamp)
            try:
                count = user_dict[timestamp]
            except:
                count = 0
            try:
                user_list[user].append(count)
            except:
                user_list[user] = [count]
    return user_list, user_timesegment_list, ts_list
def get_db_num(timestamp):
    date = ts2datetime(timestamp)
    date_ts = datetime2ts(date)
    r_beigin_ts = datetime2ts(R_BEGIN_TIME)
    db_number = ((date_ts - r_beigin_ts) / (DAY * 7)) % 2 + 1
    # run_type
    if RUN_TYPE == 0:
        db_number = 1
    return db_number
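# Hedged usage sketch for get_db_num: the result alternates between 1 and 2
# every 7 days counted from R_BEGIN_TIME (and is pinned to 1 when RUN_TYPE == 0).
# Assuming R_BEGIN_TIME = '2013-09-01', DAY = 86400 and RUN_TYPE == 1:
#   get_db_num(datetime2ts('2013-09-03'))  # -> 1 (first week after R_BEGIN_TIME)
#   get_db_num(datetime2ts('2013-09-10'))  # -> 2 (second week)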
def get_social_inter_content(uid1, uid2, type_mark):
    weibo_list = []
    #get two type relation about uid1 and uid2
    #search weibo list
    now_ts = int(time.time())
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts(RUN_TEST_TIME)
    #uid2uname
    uid2uname = {}
    try:
        portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,\
                body={'ids': [uid1, uid2]}, _source=False, fields=['uid', 'uname'])['docs']
    except:
        portrait_result = []
    for item in portrait_result:
        uid = item['_id']
        if item['found'] == True:
            uname = item['fields']['uname'][0]
            uid2uname[uid] = uname
        else:
            uid2uname[uid] = 'unknown'
    #iter date to search weibo list
    for i in range(7, 0, -1):
        iter_date_ts = now_date_ts - i*DAY
        iter_date = ts2datetime(iter_date_ts)
        flow_text_index_name = flow_text_index_name_pre + str(iter_date)
        query = []
        query.append({'bool':{'must':[{'term':{'uid':uid1}}, {'term':{'directed_uid': int(uid2)}}]}})
        if type_mark=='out':
            query.append({'bool':{'must':[{'term':{'uid':uid2}}, {'term':{'directed_uid': int(uid1)}}]}})
        try:
            flow_text_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\
                    body={'query': {'bool':{'should': query}}, 'sort':[{'timestamp':{'order': 'asc'}}], 'size':MAX_VALUE})['hits']['hits']
        except:
            flow_text_result = []
        for flow_text in flow_text_result:
            source = flow_text['_source']
            weibo = {}
            weibo['timestamp'] = source['timestamp']
            weibo['ip'] = source['ip']
            weibo['geo'] = source['geo']
            weibo['text'] = '\t'.join(source['text'].split('&'))
            weibo['uid'] = source['uid']
            weibo['uname'] = uid2uname[weibo['uid']]
            weibo['directed_uid'] = str(source['directed_uid'])
            weibo['directed_uname'] = uid2uname[str(source['directed_uid'])]
            weibo_list.append(weibo)
    return weibo_list
def new_get_user_location(uid):
    results = {}
    now_date = ts2datetime(time.time())
    now_date_ts = datetime2ts(now_date)
    #run type
    if RUN_TYPE == 0:
        now_date_ts = datetime2ts(RUN_TEST_TIME) - DAY
        now_date = ts2datetime(now_date_ts)
    #now ip
    try:
        ip_time_string = r_cluster.hget('new_ip_'+str(now_date_ts), uid)
    except Exception, e:
        raise e
def read_flow_text_sentiment(uid_list):
    """
    Read user weibo (results carry the weibo sentiment label):
    input: uid_list (list of strings)
    output: word_dict (per-user word-frequency dict), weibo_list (list of user weibo)
    word_dict example: {uid1:{'w1':f1,'w2':f2...}...}
    weibo_list example: [[uid1,text1,s1,ts1],[uid2,text2,s2,ts2],...]
    (each record holds four values: uid, text, sentiment, timestamp)
    """
    word_dict = dict()  # word-frequency dict
    weibo_list = []  # weibo list
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    now_date_ts = datetime2ts("2013-09-08")
    start_date_ts = now_date_ts - DAY * WEEK
    for i in range(0, WEEK):
        iter_date_ts = start_date_ts + DAY * i
        flow_text_index_date = ts2datetime(iter_date_ts)
        flow_text_index_name = flow_text_index_name_pre + flow_text_index_date
        print flow_text_index_name
        try:
            flow_text_exist = es_flow_text.search(
                index=flow_text_index_name,
                doc_type=flow_text_index_type,
                body={"query": {"filtered": {"filter": {"terms": {"uid": uid_list}}}}, "size": MAX_VALUE},
                _source=False,
                fields=["text", "uid", "sentiment", "keywords_dict", "timestamp"],
            )["hits"]["hits"]
        except:
            flow_text_exist = []
        for flow_text_item in flow_text_exist:
            uid = flow_text_item["fields"]["uid"][0].encode("utf-8")
            text = flow_text_item["fields"]["text"][0].encode("utf-8")
            sentiment = int(flow_text_item["fields"]["sentiment"][0])
            ts = flow_text_item["fields"]["timestamp"][0]
            keywords_dict = json.loads(flow_text_item["fields"]["keywords_dict"][0])
            keywords_dict = json.dumps(keywords_dict, encoding="UTF-8", ensure_ascii=False)
            keywords_dict = eval(keywords_dict)
            if word_dict.has_key(uid):
                item_dict = Counter(word_dict[uid])
                keywords_dict = Counter(keywords_dict)
                item_dict = dict(item_dict + keywords_dict)
                word_dict[uid] = item_dict
            else:
                word_dict[uid] = keywords_dict
            weibo_list.append([uid, text, sentiment, ts])
    return word_dict, weibo_list
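# Hedged usage sketch for read_flow_text_sentiment; the uids are placeholders
# and the flow_text indices for the hard-coded test week around 2013-09-08 are
# assumed to exist.
word_dict, weibo_list = read_flow_text_sentiment(['1234567890', '2345678901'])
for uid, text, sentiment, ts in weibo_list[:3]:
    print uid, sentiment, ts  # one record per weibo: uid, text, sentiment, timestamp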
def get_group_user_track(uid):
    results = []
    #step1:get user_portrait activity_geo_dict
    try:
        portrait_result = es_user_portrait.get(index=portrait_index_name, doc_type=portrait_index_type,\
                id=uid, _source=False, fields=['activity_geo_dict'])
    except:
        portrait_result = {}
    if portrait_result == {}:
        return 'uid is not in user_portrait'
    activity_geo_dict = json.loads(portrait_result['fields']['activity_geo_dict'][0])
    now_date_ts = datetime2ts(ts2datetime(int(time.time())))
    start_ts = now_date_ts - DAY * len(activity_geo_dict)
    #step2: iter date to get month track
    for geo_item in activity_geo_dict:
        iter_date = ts2datetime(start_ts)
        sort_day_dict = sorted(geo_item.items(), key=lambda x: x[1], reverse=True)
        if sort_day_dict:
            results.append([iter_date, sort_day_dict[0][0]])
        else:
            results.append([iter_date, ''])
        start_ts = start_ts + DAY
    return results
def search_mention(now_ts, uid):
    date = ts2datetime(now_ts)
    ts = datetime2ts(date)
    #print 'at date-ts:', ts
    stat_results = dict()
    results = dict()
    for i in range(1, 8):
        ts = ts - 24 * 3600
        try:
            result_string = r_cluster.hget('at_' + str(ts), str(uid))
        except:
            result_string = ''
        if not result_string:
            continue
        result_dict = json.loads(result_string)
        for at_uid in result_dict:
            try:
                stat_results[at_uid] += result_dict[at_uid]
            except:
                stat_results[at_uid] = result_dict[at_uid]
    for at_uid in stat_results:
        # search uid
        '''
        uname = search_uid2uname(at_uid)
        if not uname:
        '''
        uid = ''
        count = stat_results[at_uid]
        results[at_uid] = [uid, count]
    if results:
        sort_results = sorted(results.items(), key=lambda x:x[1][1], reverse=True)
        return [sort_results[:20], len(results)]
    else:
        return [None, 0]
def ajax_revise_task():
    task_name = request.args.get('task_name', '')  # must
    finish = request.args.get("finish", "10")
    stop_time = request.args.get('stop_time', '')  # timestamp
    now_ts = datetime2ts("2013-09-06")
    #now_ts = time.time()
    if stop_time and int(stop_time) < now_ts:  # cast to int: request args are strings, and a str/int comparison is always False in Python 2
        return json.dumps([])
    if task_name:
        task_detail = es.get(index=index_manage_sensing_task, doc_type=task_doc_type, id=task_name)['_source']
        if stop_time:
            task_detail['stop_time'] = stop_time
        if int(finish) == 0:
            task_detail['finish'] = finish
            task_detail['processing_status'] = "1"  # reset the processing status on restart
        if stop_time or int(finish) == 0:
            es.index(index=index_manage_sensing_task, doc_type=task_doc_type, id=task_name, body=task_detail)
            return json.dumps(['1'])
    return json.dumps([])
def get_db_num(timestamp):
    date = ts2datetime(timestamp)
    date_ts = datetime2ts(date)
    db_number = ((date_ts - r_beigin_ts) / (DAY * 7)) % 2 + 1
    if RUN_TYPE == 0:
        db_number = 1
    return db_number
def recommentation_in_auto(seatch_date, submit_user):
    results = []
    #run type
    if RUN_TYPE == 1:
        now_date = ts2datetime(time.time() - DAY)
    else:
        now_date = ts2datetime(datetime2ts(RUN_TEST_TIME) - DAY)
    recomment_hash_name = 'recomment_' + now_date + '_auto'
    recomment_influence_hash_name = 'recomment_' + now_date + '_influence'
    recomment_sensitive_hash_name = 'recomment_' + now_date + '_sensitive'
    recomment_compute_hash_name = 'compute'
    #step1: get auto
    auto_result = r.hget(recomment_hash_name, 'auto')
    if auto_result:
        auto_user_list = json.loads(auto_result)
    else:
        auto_user_list = []
    #step2: get admin user result
    admin_result = r.hget(recomment_hash_name, submit_user)
    if admin_result:
        admin_user_list = json.loads(admin_result)
    else:
        admin_user_list = []
    #step3: get union user and filter compute/influence/sensitive
    union_user_auto_set = set(auto_user_list) | set(admin_user_list)
    influence_user = set(r.hkeys(recomment_influence_hash_name))
    sensitive_user = set(r.hkeys(recomment_sensitive_hash_name))
    compute_user = set(r.hkeys(recomment_compute_hash_name))
    filter_union_user = union_user_auto_set - (influence_user | sensitive_user | compute_user)
    auto_user_list = list(filter_union_user)
    #step4: get user detail
    results = get_user_detail(now_date, auto_user_list, 'show_in', 'auto')
    return results
def new_get_activeness_trend(uid, time_segment):
    results = {}
    try:
        activeness_history = ES_COPY_USER_PORTRAIT.get(index=COPY_USER_PORTRAIT_ACTIVENESS, doc_type=COPY_USER_PORTRAIT_ACTIVENESS_TYPE,\
                id=uid)['_source']
    except:
        activeness_history = {}
    if activeness_history:
        results = get_evaluate_trend(activeness_history, 'activeness')
    else:
        results = {}
    #deal results for situation---server power off
    new_time_list = []
    new_count_list = []
    new_results = {}
    now_time_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_time_ts))
    for i in range(time_segment, 0, -1):
        iter_date_ts = now_date_ts - i * DAY
        try:
            date_count = results[iter_date_ts]
        except:
            date_count = 0
        new_time_list.append(iter_date_ts)
        new_count_list.append(date_count)
    new_results = {'timeline': new_time_list, 'evaluate_index': new_count_list}
    return new_results
def get_geo_track(uid):
    date_results = []  # {'2013-09-01':[(geo1, count1),(geo2, count2)], '2013-09-02'...}
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    #test
    now_date = '2013-09-08'
    ts = datetime2ts(now_date)
    city_list = []
    city_set = set()
    for i in range(7, 0, -1):
        timestamp = ts - i*24*3600
        #print 'timestamp:', ts2datetime(timestamp)
        ip_dict = dict()
        results = r_cluster.hget('ip_'+str(timestamp), uid)
        date = ts2datetime(timestamp)
        date_key = '-'.join(date.split('-')[1:])
        if results:
            ip_dict = json.loads(results)
            geo_dict = ip_dict2geo(ip_dict)
            city_list.extend(geo_dict.keys())
            sort_geo_dict = sorted(geo_dict.items(), key=lambda x:x[1], reverse=True)
            date_results.append([date_key, sort_geo_dict[:2]])
        else:
            date_results.append([date_key, []])
    print 'results:', date_results
    city_set = set(city_list)
    geo_conclusion = get_geo_conclusion(uid, city_set)
    return [date_results, geo_conclusion]
def search_location(now_ts, uid):
    date = ts2datetime(now_ts)
    #print 'date:', date
    ts = datetime2ts(date)
    #print 'date-ts:', ts
    stat_results = dict()
    results = dict()
    for i in range(1, 8):
        ts = ts - 24 * 3600
        #print 'for-ts:', ts
        result_string = r_cluster.hget('ip_' + str(ts), str(uid))
        if not result_string:
            continue
        result_dict = json.loads(result_string)
        for ip in result_dict:
            try:
                stat_results[ip] += result_dict[ip]
            except:
                stat_results[ip] = result_dict[ip]
    for ip in stat_results:
        city = ip2city(ip)
        if city:
            try:
                results[city][ip] = stat_results[ip]
            except:
                results[city] = {ip: stat_results[ip]}
    description = active_geo_description(results)
    results['description'] = description
    #print 'location results:', results
    return results
def search_sentiment_all_keywords_task(submit_date, keywords_string, submit_user, start_date, end_date, status):
    results = []
    query_list = []
    if submit_date:
        submit_ts_start = datetime2ts(submit_date)
        submit_ts_end = submit_ts_start + DAY
        query_list.append({'range': {'submit_ts': {'gte': submit_ts_start, 'lt': submit_ts_end}}})
    if keywords_string:
        keywords_list = keywords_string.split(',')
        query_list.append({'terms':{'query_keywords': keywords_list}})
    if submit_user:
        query_list.append({'term': {'submit_user': submit_user}})
    if start_date:
        start_s_ts = datetime2ts(start_date)
        if end_date:
            start_e_ts = datetime2ts(end_date)
        else:
            start_e_ts = start_s_ts + DAY * 30
        start_date_nest_body_list = [ts2datetime(ts) for ts in range(start_s_ts, start_e_ts + DAY, DAY)]
        query_list.append({'terms':{'start_date': start_date_nest_body_list}})
    if end_date:
        end_e_ts = datetime2ts(end_date)
        if start_date:
            end_s_ts = datetime2ts(start_date)
        else:
            end_s_ts = end_e_ts - DAY * 30
        end_date_nest_body_list = [ts2datetime(ts) for ts in range(end_s_ts, end_e_ts + DAY, DAY)]
        query_list.append({'terms': {'end_date': end_date_nest_body_list}})
    if status:
        query_list.append({'term': {'status': status}})
    try:
        task_results = es_sentiment_task.search(index=sentiment_keywords_index_name, \
                doc_type=sentiment_keywords_index_type, body={'query':{'bool':{'must':query_list}}})['hits']['hits']
    except:
        task_results = []
    for task_item in task_results:
        task_source = task_item['_source']
        task_id = task_source['task_id']
        start_date = task_source['start_date']
        end_date = task_source['end_date']
        keywords = task_source['query_keywords']
        submit_ts = ts2date(task_source['submit_ts'])
        status = task_source['status']
        segment = task_source['segment']
        results.append([task_id, start_date, end_date, keywords, submit_ts, status, segment])
    return results
def get_influence_content(uid, timestamp_from, timestamp_to):
    weibo_list = []
    #split timestamp range to new_range_dict_list
    from_date_ts = datetime2ts(ts2datetime(timestamp_from))
    to_date_ts = datetime2ts(ts2datetime(timestamp_to))
    new_range_dict_list = []
    if from_date_ts != to_date_ts:
        iter_date_ts = from_date_ts
        while iter_date_ts < to_date_ts:
            iter_next_date_ts = iter_date_ts + DAY
            new_range_dict_list.append({'range':{'timestamp':{'gte':iter_date_ts, 'lt':iter_next_date_ts}}})
            iter_date_ts = iter_next_date_ts
        if new_range_dict_list[0]['range']['timestamp']['gte'] < timestamp_from:
            new_range_dict_list[0]['range']['timestamp']['gte'] = timestamp_from
        if new_range_dict_list[-1]['range']['timestamp']['lt'] > timestamp_to:
            new_range_dict_list[-1]['range']['timestamp']['lt'] = timestamp_to
    else:
        new_range_dict_list = [{'range':{'timestamp':{'gte':timestamp_from, 'lt':timestamp_to}}}]
    #iter date to search flow_text
    iter_result = []
    for range_item in new_range_dict_list:
        range_from_ts = range_item['range']['timestamp']['gte']
        range_from_date = ts2datetime(range_from_ts)
        flow_text_index_name = flow_text_index_name_pre + range_from_date
        query = []
        query.append({'term':{'uid':uid}})
        query.append(range_item)
        try:
            flow_text_exist = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\
                    body={'query':{'bool':{'must': query}}, 'sort':[{'timestamp':'asc'}]})['hits']['hits']
        except:
            flow_text_exist = []
        iter_result.extend(flow_text_exist)
        # get weibo list
        for item in flow_text_exist:
            source = item['_source']
            weibo = {}
            weibo['timestamp'] = ts2date(source['timestamp'])
            weibo['ip'] = source['ip']
            weibo['text'] = source['text']
            if source['geo']:
                weibo['geo'] = '\t'.join(source['geo'].split('&'))
            else:
                weibo['geo'] = ''
            weibo_list.append(weibo)
    return weibo_list
def get_user_geo(uid, dropped_geos=u"中国&美国"):
    """
    :param uid: user id
    :param dropped_geos: '&'-separated places to drop, since every geo contains 中国
    :return: set of geo locations
    """
    dropped_geos = set(dropped_geos.split("&"))
    # get the user's preference
    try:
        user_portrait_result = es_user_portrait. \
            get_source(index=portrait_index_name, doc_type=profile_index_type, id=uid)
    except NotFoundError:
        user_portrait_result = None
    # geo information exists in the portrait index
    if user_portrait_result and len(user_portrait_result["activity_geo"]) > 0:
        # cast to set so the difference with dropped_geos is defined
        # (ES returns the field as a JSON list, not a set)
        geos = set(user_portrait_result["activity_geo"]) - dropped_geos
    # no geo information: extract it from the weibo posted before
    else:
        flow_text_index_list = []
        now_timestamp = datetime2ts(ts2datetime(time.time()))
        if RUN_TYPE == 0:
            now_timestamp = datetime2ts(RUN_TEST_TIME)
        for i in range(7, 0, -1):
            iter_date = ts2datetime(now_timestamp - DAY * i)
            flow_text_index_list.append(flow_text_index_name_pre + iter_date)
        weibo_all = es_flow_text.search(index=flow_text_index_list,
                                        doc_type=flow_text_index_type,
                                        body={
                                            'query': {
                                                'filtered': {
                                                    'filter': {
                                                        'term': {'uid': uid}
                                                    }
                                                }
                                            },
                                            'size': 2000,
                                        })['hits']['hits']
        geos = set()
        for temp in weibo_all:
            geos |= set(temp["_source"]["geo"].split("&"))
    return geos
def get_db_num(timestamp):
    date = ts2datetime(timestamp)
    date_ts = datetime2ts(date)
    db_number = 2 - ((date_ts - begin_ts) / (DAY * 7)) % 2
    #run_type
    if RUN_TYPE == 0:
        db_number = 1
    return db_number
def new_get_user_weibo(uid, sort_type):
    results = []
    weibo_list = []
    now_date = ts2datetime(time.time())
    #run_type
    if RUN_TYPE == 0:
        now_date = RUN_TEST_TIME
        sort_type = 'timestamp'
    #step1:get user name
    try:
        user_profile_result = es_user_profile.get(index=profile_index_name, doc_type=profile_index_type,\
                id=uid, _source=False, fields=['nick_name'])
    except:
        user_profile_result = {}
    if user_profile_result:
        uname = user_profile_result['fields']['nick_name'][0]
    else:
        uname = ''
    #step2:get user weibo
    for i in range(7, 0, -1):
        iter_date = ts2datetime(datetime2ts(now_date) - i * DAY)
        index_name = flow_text_index_name_pre + iter_date
        try:
            weibo_result = es_flow_text.search(index=index_name, doc_type=flow_text_index_type,\
                    body={'query':{'filtered':{'filter':{'term': {'uid': uid}}}}, 'sort':sort_type, 'size':100})['hits']['hits']
        except:
            weibo_result = []
        if weibo_result:
            weibo_list.extend(weibo_result)
    print 'weibo_list:', weibo_list[0]
    sort_weibo_list = sorted(weibo_list, key=lambda x: x['_source'][sort_type], reverse=True)[:100]
    for weibo_item in sort_weibo_list:
        source = weibo_item['_source']
        mid = source['mid']
        uid = source['uid']
        text = source['text']
        ip = source['geo']
        timestamp = source['timestamp']
        date = ts2date(timestamp)
        sentiment = source['sentiment']
        #run_type
        if RUN_TYPE == 1:
            retweet_count = source['retweet_count']
            comment_count = source['comment_count']
            sensitive_score = source['sensitive']
        else:
            retweet_count = 0
            comment_count = 0
            sensitive_score = 0
        city = ip2city(ip)
        results.append([mid, uid, text, ip, city, timestamp, date, retweet_count, comment_count, sensitive_score])
    return results
def get_recommentation(submit_user):
    if RUN_TYPE:
        now_ts = time.time()
    else:
        now_ts = datetime2ts(RUN_TEST_TIME)
    in_portrait_set = set(r.hkeys("compute"))
    result = []
    for i in range(7):
        iter_ts = now_ts - i*DAY
        iter_date = ts2datetime(iter_ts)
        submit_user_recomment = "recomment_" + submit_user + "_" + str(iter_date)
        bci_date = ts2datetime(iter_ts - DAY)
        submit_user_recomment = r.hkeys(submit_user_recomment)
        bci_index_name = "bci_" + bci_date.replace('-', '')
        exist_bool = es_cluster.indices.exists(index=bci_index_name)
        if not exist_bool:
            continue
        if submit_user_recomment:
            user_bci_result = es_cluster.mget(index=bci_index_name, doc_type="bci", body={'ids':submit_user_recomment}, _source=True)['docs']
            user_profile_result = es_user_profile.mget(index='weibo_user', doc_type='user', body={'ids':submit_user_recomment}, _source=True)['docs']
            max_evaluate_influ = get_evaluate_max(bci_index_name)
            for i in range(len(submit_user_recomment)):
                uid = submit_user_recomment[i]
                bci_dict = user_bci_result[i]
                profile_dict = user_profile_result[i]
                try:
                    bci_source = bci_dict['_source']
                except:
                    bci_source = None
                if bci_source:
                    influence = bci_source['user_index']
                    influence = math.log(influence/max_evaluate_influ['user_index'] * 9 + 1, 10)
                    influence = influence * 100
                else:
                    influence = ''
                try:
                    profile_source = profile_dict['_source']
                except:
                    profile_source = None
                if profile_source:
                    uname = profile_source['nick_name']
                    location = profile_source['user_location']
                    fansnum = profile_source['fansnum']
                    statusnum = profile_source['statusnum']
                else:
                    uname = ''
                    location = ''
                    fansnum = ''
                    statusnum = ''
                if uid in in_portrait_set:
                    in_portrait = "1"
                else:
                    in_portrait = "0"
                recomment_day = iter_date
                result.append([iter_date, uid, uname, location, fansnum, statusnum, influence, in_portrait])
    return result
def cctv_video_rec(uid, k=10):
    flow_text_index_list = []
    now_timestamp = datetime2ts(ts2datetime(time.time()))
    if RUN_TYPE == 0:
        now_timestamp = datetime2ts(RUN_TEST_TIME)
    for i in range(7, 0, -1):
        iter_date = ts2datetime(now_timestamp - DAY * i)
        flow_text_index_list.append(flow_text_index_name_pre + iter_date)
    weibo_all = es_flow_text.search(index=flow_text_index_list,
                                    doc_type=flow_text_index_type,
                                    body={
                                        'query': {
                                            'filtered': {
                                                'filter': {
                                                    'term': {'uid': uid}
                                                }
                                            }
                                        },
                                        'size': 100,
                                    })['hits']['hits']
    user_words = set()
    for weibo in weibo_all:
        # segment the weibo text (the original read the 'ip' field here, which
        # contradicts the variable name and the jieba.cut call)
        weibo_text = weibo["_source"]["text"]
        user_words |= set(jieba.cut(weibo_text))
    rio_dict = load_topic_video_dict(RIO_VIDEO_INFO_FILE)
    tiger_videos = load_videos(TIGER_VIDEO_INFO_FILE)
    ret_dict = dict()
    ret_dict["tiger"] = random.sample(tiger_videos, k)
    user_pref_topic = set(rio_dict.keys()) & user_words
    # if no matching topic is found, assign topics at random
    if len(user_pref_topic) == 0:
        user_pref_topic = set(random.sample(rio_dict.keys(), k))
    ret_dict["rio"] = list()
    for topic in user_pref_topic:
        ret_dict["rio"].extend(rio_dict[topic])
        if len(ret_dict["rio"]) >= k:
            ret_dict["rio"] = ret_dict["rio"][:k]
            break
    return ret_dict
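# Hedged usage sketch for cctv_video_rec: "tiger" holds a random sample of k
# videos and "rio" holds up to k videos matched against the user's segmented
# weibo words; the uid is a placeholder.
rec = cctv_video_rec('1234567890', k=5)
print len(rec['tiger']), len(rec['rio'])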
def get_text_index(date):
    now_ts = datetime2ts(date)
    index_list = []
    for i in range(7):
        ts = now_ts - i*DAY
        tmp_index = pre_text_index + ts2datetime(ts)
        index_list.append(tmp_index)
    return index_list
def search_weibo(root_uid, uid, mtype):
    query_body = {
        #'query':{
        'filter':{
            'bool':{
                'must':[{'term':{'uid':uid}},
                        {'term':{'message_type':mtype}}],
                'should':[{'term':{'root_uid':root_uid}},
                          {'term':{'directed_uid':root_uid}}],
            }
        }
        #}
    }
    index_list = []
    if RUN_TYPE == 1:
        # now_date was undefined in the original; derive it from the current time in live mode
        now_date = ts2datetime(time.time())
    else:
        now_date = RUN_TEST_TIME
    for i in range(7, 0, -1):
        iter_date = ts2datetime(datetime2ts(now_date) - i * DAY)
        index_list.append(flow_text_index_name_pre + iter_date)
    results = es_flow_text.search(index=index_list, doc_type=flow_text_index_type, body=query_body)['hits']['hits']
    weibo = {}
    f_result = []
    if len(results) > 0:
        for result in results:
            #print type(result), result
            weibo['last_text'] = [result['_source']['text'], result['_source']['text'], result['_source']['timestamp']]
            mid = result['_source']['root_mid']
            # print mid
            len_pre = len(flow_text_index_name_pre)
            index = result['_index'][len_pre:]
            root_index = []
            for j in range(0, 7):  # one week; for a month use range(0, 30)
                iter_date = ts2datetime(datetime2ts(index) - j * DAY)
                root_index.append(flow_text_index_name_pre + iter_date)
            results0 = es_flow_text.search(index=root_index, doc_type=flow_text_index_type, body={'query':{'term':{'mid':mid}}})['hits']['hits']
            if len(results0) > 0:
                for result0 in results0:
                    weibo['ori_text'] = [result0['_source']['text'], result0['_source']['timestamp']]
            f_result.append(weibo)
            weibo = {}
    return f_result
def ajax_show_in_history():
    results = {}
    date = request.args.get('date', '')
    input_ts = datetime2ts(date)
    now_ts = time.time()
    now_ts = test_time
    if now_ts - 24*3600*7 > input_ts:
        return None
    else:
        results = show_in_history(date)
    return json.dumps(results)
def get_user_hashtag(uid):
    user_hashtag_result = {}
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    ts = datetime2ts(now_date)
    #test
    ts = datetime2ts('2013-09-08')
    for i in range(1, 8):
        ts = ts - 3600*24
        results = r_cluster.hget('hashtag_'+str(ts), uid)
        if results:
            hashtag_dict = json.loads(results)
            for hashtag in hashtag_dict:
                try:
                    user_hashtag_result[hashtag] += hashtag_dict[hashtag]
                except:
                    user_hashtag_result[hashtag] = hashtag_dict[hashtag]
    sort_hashtag_dict = sorted(user_hashtag_result.items(), key=lambda x:x[1], reverse=True)
    return sort_hashtag_dict
def get_user_trend(uid):
    activity_result = dict()
    now_ts = time.time()
    date = ts2datetime(now_ts)
    #run_type
    if RUN_TYPE == 1:
        ts = datetime2ts(date)
    else:
        ts = datetime2ts(RUN_TEST_TIME)
    timestamp = ts
    results = dict()
    for i in range(1, 8):
        ts = timestamp - 24 * 3600 * i
        try:
            result_string = r_cluster.hget('activity_' + str(ts), str(uid))
        except:
            result_string = ''
        if not result_string:
            continue
        result_dict = json.loads(result_string)
        for time_segment in result_dict:
            try:
                results[int(time_segment) / 16 * 15 * 60 * 16 + ts] += result_dict[time_segment]
            except:
                results[int(time_segment) / 16 * 15 * 60 * 16 + ts] = result_dict[time_segment]
    trend_list = []
    for i in range(1, 8):
        ts = timestamp - i * 24 * 3600
        for j in range(0, 6):
            time_seg = ts + j * 15 * 60 * 16
            if time_seg in results:
                trend_list.append((time_seg, results[time_seg]))
            else:
                trend_list.append((time_seg, 0))
    sort_trend_list = sorted(trend_list, key=lambda x: x[0], reverse=True)
    x_axis = [item[0] for item in sort_trend_list]
    y_axis = [item[1] for item in sort_trend_list]
    return [x_axis, y_axis]
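# Note on the bucket math above (a reading aid, not project code): the activity
# counters appear to be stored in 15-minute segments, so int(time_segment)/16
# groups 16 segments into one 4-hour bucket and 15*60*16 (= 14400 s) is that
# bucket's width in seconds; each day therefore yields the 6 buckets walked in
# the trend_list loop. For example, segment '37' maps to offset
# 37/16 * 14400 = 2 * 14400 = 28800 s, i.e. the 08:00-12:00 bucket of that day.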
def search_sentiment_all(start_date, end_date, time_segment):
    results = {}
    start_ts = datetime2ts(start_date)
    end_ts = datetime2ts(end_date)
    search_date_list = []
    for i in range(start_ts, end_ts + DAY, DAY):
        iter_date = ts2datetime(i)
        search_date_list.append(iter_date)
    sentiment_ts_count_dict = {}
    for sentiment in sentiment_type_list:
        sentiment_ts_count_dict[sentiment] = []
        for date_item in search_date_list:
            iter_r_name = date_item + '_' + sentiment + '_all'
            #get ts_count_dict in one day
            ts_count_result = R_SENTIMENT_ALL.hgetall(iter_r_name)
            #get x and y list by timesegment
            new_ts_count_dict = get_new_ts_count_dict(ts_count_result, time_segment, date_item)
            sort_new_ts_count = sorted(new_ts_count_dict.items(), key=lambda x:x[0])
            sentiment_ts_count_dict[sentiment].extend(sort_new_ts_count)
    return sentiment_ts_count_dict
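# Hedged sketch: get_new_ts_count_dict is used here and in
# search_sentiment_all_portrait but not shown in this section. Assuming the
# redis hash maps timestamp strings to counts and time_segment is a bucket
# width in seconds, a minimal re-bucketing helper could look like this
# (all names are assumptions):
def get_new_ts_count_dict(ts_count_result, time_segment, date_item):
    date_ts = datetime2ts(date_item)
    new_ts_count_dict = {}
    for ts_str in ts_count_result:
        # align each timestamp to the start of its segment within the day
        segment_ts = (int(ts_str) - date_ts) / time_segment * time_segment + date_ts
        try:
            new_ts_count_dict[segment_ts] += int(ts_count_result[ts_str])
        except KeyError:
            new_ts_count_dict[segment_ts] = int(ts_count_result[ts_str])
    return new_ts_count_dict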
def search_detect_task(task_name, submit_date, state, process, detect_type, submit_user):
    results = []
    query = [{'match':{'task_type': 'detect'}}]
    condition_num = 0
    if task_name:
        task_name_list = task_name.split(' ')
        for item in task_name_list:
            query.append({'wildcard':{'task_name': '*'+item+'*'}})
        condition_num += 1
    if submit_date:
        submit_date_ts = datetime2ts(submit_date)
        submit_date_from = submit_date_ts
        submit_date_to = submit_date_ts + DAY
        query.append({'range':{'submit_date':{'gte':submit_date_from, 'lt':submit_date_to}}})
        condition_num += 1
    if state:
        state_list = state.split(' ')
        for item in state_list:
            query.append({'wildcard':{'state': '*'+item+'*'}})
        condition_num += 1
    if process:
        query.append({'range':{'detect_process':{'from': int(process), 'to': MAX_PROCESS}}})
        condition_num += 1
    if detect_type:
        detect_type_list = detect_type.split(',')
        nest_body_list = []
        for type_item in detect_type_list:
            nest_body_list.append({'wildcard':{'detect_type': '*'+type_item+'*'}})
        query.append({'bool':{'should': nest_body_list}})
        condition_num += 1
    if submit_user:
        query.append({'term':{'submit_user': submit_user}})
        condition_num += 1
    try:
        search_result = es_group_result.search(index=group_index_name, doc_type=group_index_type, \
                body={'query':{'bool': {'must': query}}, 'sort':[{'submit_date': {'order': 'desc'}}], 'size':MAX_VALUE})['hits']['hits']
    except:
        search_result = []
    #get group information table
    for group_item in search_result:
        source = group_item['_source']
        task_name = source['task_name']
        submit_date = ts2datetime(int(source['submit_date']))
        submit_user = source['submit_user']
        detect_type = source['detect_type']
        state = source['state']
        process = source['detect_process']
        results.append([task_name, submit_user, submit_date, detect_type, state, process])
    return results
def search_mention(uid):
    now_date_ts = datetime2ts(ts2datetime(time.time()))
    #run type
    if RUN_TYPE == 0:
        now_date_ts = datetime2ts(RUN_TEST_TIME)
    day_result_dict_list = []
    for i in range(7, 0, -1):
        iter_ts = now_date_ts - i * DAY
        try:
            # the original read 'at_' + str(ts), but ts is undefined here; iter_ts is the intended name
            result_string = r_cluster.hget('at_' + str(iter_ts), str(uid))
        except:
            result_string = ''
        if not result_string:
            continue
        day_result_dict = json.loads(result_string)  # the original loaded 'results_string', an undefined name
        day_result_dict_list.append(day_result_dict)
    if day_result_dict_list:
        week_result_dict = union_dict(day_result_dict_list)
    else:
        week_result_dict = {}
    return week_result_dict
def search_detect_task(task_name, submit_date, state, process, detect_type, submit_user):
    results = []
    query = [{'match':{'task_type': 'detect'}}]
    condition_num = 0
    if task_name:
        task_name_list = task_name.split(' ')
        for item in task_name_list:
            query.append({'wildcard':{'task_name': '*'+item+'*'}})
        condition_num += 1
    if submit_date:
        submit_date_ts = datetime2ts(submit_date)
        submit_date_from = submit_date_ts
        submit_date_to = submit_date_ts + DAY
        query.append({'range':{'submit_date':{'gte':submit_date_from, 'lt':submit_date_to}}})
        condition_num += 1
    if state:
        state_list = state.split(' ')
        for item in state_list:
            query.append({'wildcard':{'state': '*'+item+'*'}})
        condition_num += 1
    if process:
        query.append({'range':{'detect_process':{'from': int(process), 'to': MAX_PROCESS}}})
        condition_num += 1
    if detect_type:
        detect_type_list = detect_type.split(',')
        nest_body_list = []
        for type_item in detect_type_list:
            nest_body_list.append({'wildcard':{'detect_type': '*'+type_item+'*'}})
        query.append({'bool':{'should': nest_body_list}})
        condition_num += 1
    if submit_user:
        # variant of the function above that matches submit_user with a wildcard;
        # the original read '******'+submit_user+'*', which looked like redaction
        # debris and is restored here to a plain leading wildcard
        query.append({'wildcard':{'submit_user': '*'+submit_user+'*'}})
        condition_num += 1
    try:
        search_result = es_group_result.search(index=group_index_name, doc_type=group_index_type, \
                body={'query':{'bool': {'must': query}}, 'sort':[{'submit_date': {'order': 'desc'}}], 'size':MAX_VALUE})['hits']['hits']
    except:
        search_result = []
    #get group information table
    for group_item in search_result:
        source = group_item['_source']
        task_name = source['task_name']
        submit_date = ts2datetime(int(source['submit_date']))
        submit_user = source['submit_user']
        detect_type = source['detect_type']
        state = source['state']
        process = source['detect_process']
        results.append([task_name, submit_user, submit_date, detect_type, state, process])
    return results
def get_user_hashtag(uid):
    user_hashtag_result = {}
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    #run_type
    if RUN_TYPE == 1:
        ts = datetime2ts(now_date)
    else:
        ts = datetime2ts(RUN_TEST_TIME)
    for i in range(1, 8):
        ts = ts - 3600*24
        results = r_cluster.hget('hashtag_'+str(ts), uid)
        if results:
            hashtag_dict = json.loads(results)
            for hashtag in hashtag_dict:
                try:
                    user_hashtag_result[hashtag] += hashtag_dict[hashtag]
                except:
                    user_hashtag_result[hashtag] = hashtag_dict[hashtag]
    sort_hashtag_dict = sorted(user_hashtag_result.items(), key=lambda x:x[1], reverse=True)
    return sort_hashtag_dict