def pagerank_rank():
    timestamp = time.time()
    es_num = get_es_num(timestamp)
    if es_num == 0:
        network_es_mappings()
        network_count_es_mappings()
    tmp_file = tempfile.NamedTemporaryFile(delete=False)
    print 'step 1: scan', ts2date(timestamp)
    scan_retweet(tmp_file)
    tmp_file.close()
    if not tmp_file:
        return
    input_tmp_path = tmp_file.name
    print input_tmp_path
    ITER_COUNT = 10
    TOP_N = 50
    print 'step 2: pagerank', ts2date(time.time())
    all_uids_count, dg_sorted_uids, pr_sorted_uids = pagerank(ITER_COUNT, input_tmp_path, TOP_N, 'all')
    print 'step 3: save', ts2date(time.time())
    save_count_results(all_uids_count, es_num)
    save_dg_pr_results(dg_sorted_uids, es_num, 'dg')
    save_dg_pr_results(pr_sorted_uids, es_num, 'pr')
    print 'save done', ts2date(time.time())
def main():
    #step1: get task from redis queue (rpop)
    #step2: get monitor task time record from redis----data: {'monitor_task_time_record':{task_name: compute_start_ts}}
    #step3: identify whether the compute_start_ts can be computed yet
    #step4: get task user from es---group_result
    #step5: compute differently according to the task user count
    #step6: compute task mid-result
    #step7: save the mid-result in mid-result es----timestamp as field
    #step8: identify that the track task is still running, not ended/deleted: group_result es status==1, not 0
    #step9: if track_task is running: update the compute_start_ts
    #step10: if track_task is running: lpush task name to redis queue (keep the task in queue)
    #step11: if track_task is not running: delete the compute_start_ts from redis
    while True:
        task_name = get_task_name()
        if task_name:
            start_ts = r_task.hget('monitor_task_time_record', task_name)
            start_ts = int(start_ts)
            #now_ts = time.time()
            #test
            now_ts = date2ts('2013-09-08 00:15:00')
            if start_ts == now_ts:
                status = add_task_name(task_name)
                if status == 0:
                    print 'add task to redis fail'
                break
            if start_ts + 900 <= now_ts:
                task_user = get_task_user(task_name)
                if len(task_user) == 1:
                    print 'compute %s start_ts %s' % (task_name, ts2date(start_ts))
                    #status = compute_mid_result_one(task_name, task_user, start_ts)
                else:
                    #print 'compute %s start_ts %s' % (task_name, ts2date(start_ts))
                    #status = compute_mid_result_group(task_name, task_user, start_ts)
                    #compute group polarization----compute once a day
                    if datetime2ts(ts2datetime(start_ts)) == start_ts:
                        print 'start compute group inner %s' % ts2date(start_ts)
                        group_status = compute_group_inner(task_name, task_user, start_ts)
                        status = group_status
                #test
                status = 1
                if status == 0:
                    print 'there is a bug about %s task' % task_name
                else:
                    #update the record time
                    start_ts += 900
                    task_doing_status = identify_task_doing(task_name)
                    #print 'task_doing_status:', task_doing_status
                    if task_doing_status == True:
                        r_task.hset('monitor_task_time_record', task_name, start_ts)
                        status = add_task_name(task_name)
                        if status == 0:
                            print 'add task name to redis fail'
                    else:
                        r_task.hdel('monitor_task_time_record', task_name)
def get_maker(topic, new_peaks, new_bottom, ts_list, collection):
    begin_ts = ts_list[new_bottom[0]]
    end_ts = ts_list[new_peaks[0]]
    print 'get_maker news_bottom:', new_bottom[0]
    print 'get_maker news_peak:', new_peaks[0]
    print 'get_maker ts_list:', ts2date(ts_list[0])
    print 'get_maker start_ts:', ts2date(begin_ts)
    print 'get_maker end_ts:', ts2date(end_ts)
    if begin_ts > end_ts:
        begin_ts = ts_list[0]
    begin_ts = begin_ts - Hour
    filter_dict = get_filter_dict()
    query_dict = {'timestamp': {'$gte': begin_ts, '$lte': end_ts}}
    '''
    maker_list = collection.find(query_dict, filter_dict).sort('weight').limit(maker_news_count)
    if not maker_list:
        return []
    else:
        return maker_list
    '''
    input_news_list = collection.find(query_dict, filter_dict)
    # segment all news items inside the first wave
    news_cut_list = cut_news(input_news_list)
    # compute the top-50 keywords
    keywords_list = get_news_keywords(news_cut_list)
    # compute each news item's keyword ratio (weight) within the wave
    weight_list = get_news_weight(news_cut_list, keywords_list)
    # sort and take the top-20 news items by weight
    maker_list = get_top_weight_news(weight_list)
    if not maker_list:
        return []
    else:
        return maker_list
def get_interval_count(topic, start_ts, end_ts):
    results = [0]
    ts_list = []
    #unit = 900
    #during = Day
    during = interval_count_during
    start_ts = datetime2ts(ts2datetime(start_ts))
    ts_list.append(start_ts)
    #end_ts = datetime2ts(ts2datetime(end_ts))
    # handle an end_ts that is not aligned to a whole day
    print 'before deal end_ts:', ts2date(end_ts)
    if end_ts - datetime2ts(ts2datetime(end_ts)) != 0:
        end_ts = datetime2ts(ts2datetime(end_ts)) + 3600 * 24
    print 'get_interval_count start_ts:', ts2date(start_ts)
    print 'get_interval_count end_ts:', ts2date(end_ts)
    windowsize = (end_ts - start_ts) / Day
    interval = (end_ts - start_ts) / During
    for i in range(interval, 0, -1):
        begin_ts = end_ts - during * i
        over_ts = begin_ts + during
        ts_list.append(over_ts)
        items = db.session.query(PropagateCountNews).filter(PropagateCountNews.topic==topic ,\
                                                            PropagateCountNews.end<=over_ts ,\
                                                            PropagateCountNews.end>begin_ts ,\
                                                            PropagateCountNews.range==unit).all()
        if items:
            result = Merge_propagate(items)
        else:
            result = 0
        results.append(float(result))
    return ts_list, results
def create_task_list(given_ts):
    # 1. search from manage_sensing_task
    # 2. push to redis list-----task_work

    # print start info
    current_path = os.getcwd()
    file_path = os.path.join(current_path, 'task_list.py')
    now_ts = datehour2ts(ts2datehour(time.time() - 3600))
    print_log = "&".join([file_path, "start", ts2date(now_ts)])
    print print_log
    #ts = ts - 3600

    query_body = {"query": {"match_all": {}}}
    search_results = es.search(index=index_sensing, doc_type=type_sensing, body=query_body)['hits']['hits']

    count = 0
    if search_results:
        for iter_item in search_results:
            _id = iter_item['_id']
            item = iter_item['_source']
            task = []
            task.append(item['task_name'])                   # task_name
            task.append(json.loads(item['social_sensors']))  # social sensors
            #task.append(now_ts)
            task.append(given_ts)
            r.lpush('task_name', json.dumps(task))
            count += 1

    print count
    print_log = "&".join([file_path, "end", ts2date(time.time())])
    print print_log
def bursty_event_detection(ts, mid_list):
    #print mid_list
    results = es_user_portrait.mget(index=monitor_index_name, doc_type=monitor_index_type, body={"ids": mid_list})["docs"]
    if len(mid_list) >= 3:
        print results
        with open('burst_3.txt', 'a') as f:
            for item in results:
                item = item['_source']
                mid = item['mid']
                f.write(str(ts2date(ts)) + str(ts2date(item['timestamp'])) + str(item['uid']) + item["text"].encode("utf-8", "ignore") + "\n")
                item['detection'] = 1
                es_user_portrait.index(index=monitor_index_name, doc_type=monitor_index_type, id=item['mid'], body=item)
    time_series = dict()
    for item in results:
        for k, v in item['_source'].iteritems():
            if "ts-" in k:
                k = k.split('-')[1]
                if int(k) in time_series:
                    time_series[int(k)] += v
                else:
                    time_series[int(k)] = v
    sorted_result = sorted(time_series.items(), key=lambda x: x[0], reverse=False)
    if len(sorted_result) > 4 and len(mid_list) >= 2:
        timestamp = sorted_result[-1][0]
        retweet_number = sorted_result[-1][1]
        average_list = [item[1] for item in sorted_result[:-1]]
        average = np.mean(average_list)
        std = np.std(average_list)
        former_three = sum(average_list[-4:-1])
        #print average_list, retweet_number
        #if retweet_number > average + 1.96*std:
        if retweet_number > former_three:
            print sorted_result
            print "detect burst event"
            print "timestamp: ", timestamp
            print "weibo list: ", mid_list
            # take the top-2 weibo sorted by time
            text_list = []
            for item in results:
                text_list.append(item['_source'])
            #sorted_by_ts = sorted(text_list, key=operator.itemgetter("timestamp"), reverse=False)
            #print "the two earliest weibo:", sorted_by_ts[:2]
            #sorted_by_retweet = sorted(text_list, key=operator.itemgetter("sum_retweet"), reverse=True)
            #print sorted_by_retweet[:2]
            #mining_results = []
            #mining_results.extend(sorted_by_ts[:2])
            #mining_results.extend(sorted_by_retweet[:2])
            with open("burst_3.txt", "a") as f:
                for item in text_list:
                    mid = item['mid']
                    item['detection'] = 1
                    es_user_portrait.index(index=monitor_index_name, doc_type=monitor_index_type, id=item['mid'], body=item)
                    f.write(str(ts2date(ts)) + str(ts2date(item['timestamp'])) + str(item['uid']) + item["text"].encode('utf-8', 'ignore') + "\n")
        else:
            results = []
    return results
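# A quick numeric check of the burst rule used above: the latest interval's retweet count
# is compared against sum(average_list[-4:-1]), exactly as in the function. The counts and
# the helper name below are hypothetical, purely for illustration.
def _burst_rule_example():
    counts = [2, 1, 2, 2, 9]                  # per-interval retweet counts, oldest first
    retweet_number = counts[-1]               # latest interval
    average_list = counts[:-1]                # history used for the comparison
    former_three = sum(average_list[-4:-1])   # 2 + 1 + 2 = 5
    print retweet_number > former_three       # True -> would be flagged as a burst
# Note: because average_list already drops the latest point, the [-4:-1] slice also skips the
# interval immediately before it; if that is unintended, average_list[-3:] would sum the three
# most recent historical intervals instead.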
def pagerank_rank():
    timestamp = time.time()
    net_dic_pr = get_net_dic_pr()
    '''
    es_num = get_es_num(timestamp)
    if es_num == 0:
        network_es_mappings()
        network_count_es_mappings()
    '''
    tmp_file = tempfile.NamedTemporaryFile(delete=False)
    print 'step 1: write', ts2date(timestamp)
    for key in net_dic_pr:
        write_tmp_file(tmp_file, key, net_dic_pr[key])
    tmp_file.close()
    if not tmp_file:
        return
    input_tmp_path = tmp_file.name
    print input_tmp_path
    print 'step 2: pagerank', ts2date(time.time())
    # ITER_COUNT, TOP_N and es_num are assumed to be module-level values in this variant,
    # since the local definitions used by the other pagerank_rank are absent or commented out
    all_uids_count, dg_sorted_uids, pr_sorted_uids = pagerank(ITER_COUNT, input_tmp_path, TOP_N, 'all')
    print 'pr_sorted_uids:', pr_sorted_uids
    print 'step 3: save', ts2date(time.time())
    save_count_results(all_uids_count, es_num)
    save_dg_pr_results(dg_sorted_uids, es_num, 'dg')
    save_dg_pr_results(pr_sorted_uids, es_num, 'pr')
    print 'save done', ts2date(time.time())
def create_task_list():
    # 1. search from manage_sensing_task
    # 2. push to redis list-----task_work

    # print start info
    current_path = os.getcwd()
    file_path = os.path.join(current_path, 'task_list.py')
    if RUN_TYPE == 0:
        now_ts = 1463241600            # 1378008000
    else:
        i = int(sys.argv[1])
        now_ts = 1463241600 + 3600 * i
    #now_ts = date_hour2ts(ts2date_hour(time.time()))
    print_log = "&".join([file_path, "start", ts2date(now_ts)])
    print print_log
    #ts = ts - 3600

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"term": {"finish": "0"}},
                            {"term": {"processing_status": "1"}}
                        ]
                    }
                }
            }
        }
    }

    search_results = es.search(index=index_name, doc_type=task_doc_type, body=query_body)['hits']['hits']

    count = 0
    if search_results:
        for iter_item in search_results:
            _id = iter_item['_id']
            item = iter_item['_source']
            task = []
            task.append(item['task_name'])   # task_name
            task.append(item['keywords'])    # keywords
            task.append(item['stop_time'])   # stop time
            task.append(item['create_by'])
            task.append(now_ts)
            r.lpush('task_name', json.dumps(task))
            count += 1

    print count
    print_log = "&".join([file_path, "end", ts2date(time.time())])
    print print_log
def get_tsu(new_peaks, new_bottom, ts_list, topic_xapian_id):
    #print 'new_peaks:', new_peaks
    #print 'new_bottom:', new_bottom
    #print 'ts_list:', ts_list
    end_ts = ts_list[new_peaks[0]]
    begin_ts = ts_list[new_bottom[0]]
    if begin_ts > end_ts:
        begin_ts = ts_list[0]
    query_dict = {
        'timestamp': {'$gt': begin_ts, '$lt': end_ts},
        'message_type': 3
    }
    print 'query_dict:', query_dict
    print 'begin_ts:', ts2date(begin_ts)
    print 'end_ts:', ts2date(end_ts)
    xapian_search_weibo = getXapianWeiboByTopic(topic_xapian_id)  # the topic id has to be taken into account here
    count, results = xapian_search_weibo.search(query=query_dict, fields=['retweeted_uid', 'retweeted_mid'])
    print 'count:', count
    ruid_count = {}
    ruid_mid = {}
    for result in results():
        r_uid = result['retweeted_uid']
        if (r_uid == 0) or (not r_uid):
            continue
        ruid_mid[r_uid] = result['retweeted_mid']
        try:
            ruid_count[r_uid] += 1
        except KeyError:
            ruid_count[r_uid] = 1
    sorted_result = sorted(ruid_count.items(), key=lambda d: d[1], reverse=True)
    print 'top_source_user:', sorted_result   # the original print argument was masked in the source
    '''
    count, top_weibo = xapian_search_weibo.search(query={'_id':top_source_mid}, fields=['timestamp'])
    print 'count:', count
    for i in top_weibo():
        timestamp = i['timestamp']
        print 'timestamp:', ts2date(int(timestamp))
    '''
    return sorted_result
def get_theme(theme_name, submit_user):
    if theme_name == '':
        theme_detail = es_special_event.search(index=special_event_name, doc_type=special_event_type,\
                body={'query': {'term': {'user': submit_user}}})['hits']['hits']
    else:
        query_body = {
            'query': {
                'bool': {
                    'must': [
                        {'match': {"user": submit_user}},
                        {'match': {"topic_name": theme_name}},
                    ]
                }
            },
            'size': 100
        }
        theme_detail = es_event.search(index=special_event_name, doc_type=special_event_type,\
                body=query_body)['hits']['hits']
    theme_result = []
    for i in theme_detail:
        topic_id = i['_id']
        theme_name = i['_source']['topic_name']
        contain_event = i['_source']['event_count']
        auto_label = i['_source']['label'].split('&')[:5]
        try:
            work_tag = i['_source']['k_label'].split('&')
            # work_tag = deal_event_tag(work_tag, submit_user)[0]
        except:
            work_tag = []
        submit_ts = ts2date(i['_source']['create_ts'])
        theme_result.append([topic_id, theme_name, contain_event, auto_label, work_tag, submit_ts])
    return theme_result
def add_task(user_name, type="keyword", range="all", pre='flow_text_', during='1',
             start_time='2013-09-07', end_time='2013-09-07', keyword='hello,world',
             sort_norm='bci', sort_scope='in_limit_keyword', time=1, isall=False):
    time_now = TIME.time()
    body_json = {
        'submit_user': user_name,
        'keyword': keyword,
        'submit_time': str(ts2date(time_now)),
        'end_time': end_time,
        'search_type': type,
        'status': 0,
        'range': range,
        'user_ts': user_name + str(time_now),
        'pre': pre,
        'during': during,
        'start_time': start_time,
        'sort_norm': sort_norm,
        'sort_scope': sort_scope,
        'time': time,
        'isall': isall
    }
    try:
        es.index(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE, body=body_json)
        return body_json["user_ts"]
    except Exception, e1:
        print e1
def profile_group_status_count(fieldEnName):
    start_ts = request.args.get('start_ts', None)
    end_ts = request.args.get('end_ts', None)
    if not start_ts or not end_ts:
        start_ts, end_ts = _time_zone(_utf_encode(default_timerange))
    if start_ts:
        start_ts = int(start_ts)
    if end_ts:
        end_ts = int(end_ts)
    interval = (end_ts - start_ts) / (24 * 3600) + 1
    datestr = ts2datetimestr(end_ts)   # '20130907'
    date_list = last_week_to_date(datestr, interval)
    domainid = DOMAIN_LIST.index(fieldEnName)
    time_arr = []
    total_arr = []
    repost_arr = []
    fipost_arr = []
    for datestr in date_list:
        active, important, reposts, original = getDomainCountData(domainid, datestr)
        sumcount = reposts + original
        time_arr.append(ts2date(datetimestr2ts(datestr)).isoformat())
        total_arr.append(sumcount)
        repost_arr.append(reposts)
        fipost_arr.append(original)
    return json.dumps({'time': time_arr, 'count': total_arr, 'repost': repost_arr, 'fipost': fipost_arr})
def compute_group_inner(task_name, task_user, start_ts):
    #step1: get task_user in-monitor task user retweet relation from monitor_inner_r
    #step2: get task_user in-task user retweet relation
    #step3: compute every inner user be-retweeted ratio in the task
    #step4: save top5 to es--monitor_result, doc_type=task_name, _id='inner_'+date, e.g. 'inner_2013-09-01'
    group_status = 0
    time_segment = 3600 * 24
    iter_time_segment = 900
    iter_ts = start_ts - time_segment
    inner_group_dict = {}
    user_count_dict = {}
    print 'group inner task_user:', task_user
    # NOTE: the loop header below is a reconstruction; the original lines were masked in the source
    for root_uid in task_user:
        inner_group_dict[root_uid] = {}
        iter_ts = start_ts - time_segment
        while iter_ts < start_ts:
            '''
            if iter_ts >= start_ts:
                break
            '''
            key = 'inner_' + str(iter_ts)
            print 'iter_ts:', ts2date(iter_ts)
            inner_retweet_string = monitor_inner_r.hget(root_uid, key)
            print 'root_uid, key, inner_retweet_string:', root_uid, key, inner_retweet_string
            if inner_retweet_string:
                print 'yes'
                inner_retweet_dict = json.loads(inner_retweet_string)
            else:
                inner_retweet_dict = None
            if inner_retweet_dict:
                inner_group_dict[root_uid] = merge_dict(inner_group_dict[root_uid], inner_retweet_dict)
            iter_ts += iter_time_segment
        user_inner_retweet_count = sum(inner_group_dict[root_uid].values())
        user_count_dict[root_uid] = user_inner_retweet_count
    all_be_retweet_count = sum(user_count_dict.values())
    if all_be_retweet_count == 0:
        group_status = 1
        return group_status
    sort_user_inner_retweet_count = sorted(user_count_dict.items(), key=lambda x: x[1], reverse=True)
    top5_user = sort_user_inner_retweet_count[:5]
    # timestamp: '2013-09-01'
    date = ts2datetime(start_ts - 24 * 3600)
    index_body = {'date': date}
    for rank in range(1, 6):
        key = 'top' + str(rank)
        index_body[key] = json.dumps(top5_user[rank - 1])
    key = 'inner_' + date
    # save inner-retweet graph as dict {root_uid1:{uid1:count1, uid2:count2}, ...}
    index_body['inner_graph'] = json.dumps(inner_group_dict)
    es.index(index=monitor_index_name, doc_type=task_name, id=key, body=index_body)
    group_status = 1
    return group_status
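# For reference, the document that compute_group_inner indexes at _id 'inner_'+date has roughly
# the shape sketched below; the uids and counts are illustrative, not real data. Each 'topN'
# value is json.dumps of a (uid, be-retweeted count) pair, and 'inner_graph' is the json-dumped
# {root_uid: {retweeting_uid: count}} dict built above.
example_inner_doc = {
    'date': '2013-09-01',
    'top1': '["uid_a", 45]',
    'top2': '["uid_b", 30]',
    # ... 'top3' through 'top5' follow the same pattern ...
    'inner_graph': '{"uid_a": {"uid_b": 12, "uid_c": 8}}'
}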
def search_user_task(user_name):
    c_result = {}
    query = {
        "query": {
            "bool": {
                "must": [
                    {"term": {"submit_user": str(user_name)}}
                ]
            }
        },
        "size": MAX_ITEMS,
        "sort": [{"create_time": {"order": "desc"}}],
        "fields": ["status", "search_type", "keyword", "submit_user", "sort_scope", "sort_norm",
                   "start_time", "user_ts", "end_time", "create_time", 'number']
    }
    #"sort":[{"create_time":{"order":"desc"}}],;;field:"create_time", 'number'
    if 1:
        return_list = []
        result = es.search(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE, body=query)['hits']
        c_result['flag'] = True
        for item in result['hits']:
            result_temp = {}
            result_temp['submit_user'] = item['fields']['submit_user'][0]
            result_temp['search_type'] = item['fields']['search_type'][0]
            #jln
            #result_temp['keyword'] = json.loads(item['fields']['keyword'][0])
            result_temp['keyword'] = json.loads(item['fields']['keyword'][0])
            result_temp['sort_scope'] = item['fields']['sort_scope'][0]
            result_temp['sort_norm'] = item['fields']['sort_norm'][0]
            # result_temp['start_time'] = ts2datetime(item['fields']['start_time'][0])
            # result_temp['end_time'] = ts2datetime(item['fields']['end_time'][0])
            result_temp['start_time'] = item['fields']['start_time'][0]
            result_temp['end_time'] = item['fields']['end_time'][0]
            result_temp['status'] = item['fields']['status'][0]
            result_temp['create_time'] = ts2date(item['fields']['create_time'][0])
            result_temp['search_id'] = item['fields']['user_ts'][0]
            tmp = item['fields'].get('number', 0)
            if tmp:
                result_temp['number'] = int(tmp[0])
            else:
                result_temp['number'] = 100
            return_list.append(result_temp)
        c_result['data'] = return_list
    return c_result
def get_news_trend_pusher(topic, start_ts, end_ts, rank_method, news_skip, news_limit_count):
    results = []
    print 'topic, start_ts, end_ts, rank_method:', topic.encode('utf-8'), ts2date(start_ts), ts2date(end_ts), rank_method
    items = db.session.query(TrendPusherNews).filter(TrendPusherNews.topic==topic ,\
                                                     TrendPusherNews.start_ts==start_ts ,\
                                                     TrendPusherNews.end_ts==end_ts).all()
    if not items or items == []:
        return []
    for item in items:
        row = []
        news_id = item.news_id
        news_id = deal_with(news_id)
        timestamp = item.timestamp
        comments_count = item.comments_count
        news_info = json.loads(item.news_info)
        url = news_info['url']
        summary = news_info['summary']
        datetime = news_info['datetime']
        source_from_name = news_info['source_from_name']
        content168 = news_info['content168']
        title = news_info['title']
        #weight = news_info['weight']
        transmit_name = news_info['transmit_name']
        #if len(transmit_name)==0:
        #    transmit_name = u'未知'
        same_news_num = news_info['same_news_num']
        row = [news_id, url, summary, timestamp, datetime, source_from_name, content168,
               title, same_news_num, transmit_name, comments_count]
        results.append(row)
    if rank_method == 'comments_count':
        sort_results = sorted(results, key=lambda x: x[10], reverse=True)   # descending by comment count
    elif rank_method == 'timestamp':
        sort_results = sorted(results, key=lambda x: x[3])                  # ascending by timestamp
    #elif rank_method == 'weight':
    #    sort_results = sorted(results, key=lambda x: x[10], reverse=True)  # descending by relevance weight
    return sort_results[news_skip:news_limit_count + news_skip]
def scan_network_keywords_task():
    #step1: read task information from the redis queue
    #step2: identify whether the task information exists in es
    #step3: compute the network trend task
    while True:
        #read task information from the redis queue
        network_task_information = get_task_information()
        print network_task_information
        #when the redis queue is empty, break
        if not network_task_information:
            break
        #identify whether the task exists in es
        exist_mark = identify_task_exist(network_task_information)
        print 'exist_mark:', exist_mark
        if exist_mark:
            print 'step 1: compute', ts2date(time.time())
            results = compute_network_task(network_task_information)
            if results:
                tmp_file = tempfile.NamedTemporaryFile(delete=False)
                write_tmp_file(tmp_file, results)
                tmp_file.close()
                if not tmp_file:
                    return
                input_tmp_path = tmp_file.name
                print input_tmp_path
                ITER_COUNT = 10
                TOP_N = 50
                print 'step 2: pagerank', ts2date(time.time())
                all_uids_count, dg_sorted_uids, pr_sorted_uids = pagerank(ITER_COUNT, input_tmp_path, TOP_N, 'keywords')
                #save results
                print 'step 3: save', ts2date(time.time())
                save_mark = save_task_results(dg_sorted_uids, pr_sorted_uids, network_task_information)
                print 'save done', ts2date(time.time())
                #identify save status
                if not save_mark:
                    #on failure: push the task information back onto the redis queue
                    push_mark = push_task_information(network_task_information)
                    if not push_mark:
                        print 'error push task queue'
        else:
            #if it does not exist - pass
            pass
def get_max_k_timestamp(results, p_ts_list):
    # maximum slope, and the increment must exceed the average increment
    length = len(results)
    smooth_results = []
    incre_dict = {}
    k_dict = {}
    # smoothing -- this probably loses information!
    for i in range(length):
        if i > 1:
            smooth = sum(results[i - 2:i + 1]) / 3.0
            smooth_results.append(smooth)
            #print 'smooth_results:', i, results[i-2:i+1], smooth_results
            l = len(smooth_results)
            if l >= 2:
                '''
                if smooth_results[l-2]!=0:
                    k = (smooth_results[l-1] - smooth_results[l-2]) / smooth_results[l-2]
                    k_dict[l-1] = k
                else:
                    k_dict[l-1] = 0
                '''
                k = (smooth_results[l - 1] - smooth_results[l - 2]) / Hour
                k_dict[l - 1] = k
    #print 'smooth_results:', smooth_results
    sort_k_list = sorted(k_dict.items(), key=lambda c: c[1], reverse=True)
    #print 'sort_k_list:', sort_k_list
    smooth_length = len(smooth_results)
    all_average = 0
    for j in range(smooth_length):
        if j > 0:
            incre = float(smooth_results[j] - smooth_results[j - 1])
            all_average += incre
            incre_dict[j - 1] = incre
    try:
        average_incre = all_average / len(incre_dict)
    except:
        average_incre = all_average
    remove_list = []
    #print 'incre_dict:', incre_dict
    # filter out points whose increment is below the average increment
    for k in incre_dict:
        if incre_dict[k] <= average_incre:
            remove_list.append(k)
    after_remove_k_list = []
    for sort_k in sort_k_list:
        if not sort_k[0] in remove_list:
            index = sort_k[0]
            timestamp = p_ts_list[index + 1]
            k_value = sort_k[1]
            after_remove_k_list.append((index + 1, timestamp, k_value))
    max_k_timestamp = after_remove_k_list[0][1]
    #print 'after_remove_k_list:', after_remove_k_list
    print 'max_k_timestamp:', max_k_timestamp
    print 'max_k_timestamp:', ts2date(max_k_timestamp)
    return max_k_timestamp
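# A minimal, standalone illustration of the three-point smoothing and per-second slope computed
# in get_max_k_timestamp, assuming Hour = 3600 as in the propagation code. The counts and the
# helper name are hypothetical.
def _slope_example():
    Hour = 3600
    counts = [0, 3, 6, 15, 30, 45]                                               # hourly propagation counts
    smooth = [sum(counts[i - 2:i + 1]) / 3.0 for i in range(2, len(counts))]     # -> [3.0, 8.0, 17.0, 30.0]
    slopes = [(smooth[j] - smooth[j - 1]) / Hour for j in range(1, len(smooth))]
    print smooth
    print slopes   # the largest slope marks the fastest-growing interval, which the
                   # function then maps back to a timestamp in p_ts_list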
def user_weibo():
    """Weibo list page
    """
    # element
    yaosu = 'moodlens'
    # topic keyword
    topic = request.args.get('query', default_topic)
    # time range: 20130901-20130901
    time_range = request.args.get('time_range', default_timerange)
    # time granularity: 3600
    point_interval = request.args.get('point_interval', None)
    if not point_interval:
        point_interval = default_pointInterval
    else:
        for pi in pointIntervals:
            if pi['en'] == int(point_interval):
                point_interval = pi
                break
    weibos = []
    tar_location = u'地域未知'
    tar_nickname = u'昵称未知'
    tar_profile_image_url = '#'
    tar_followers_count = u'粉丝数未知'
    tar_friends_count = u'关注数未知'
    tar_user_url = '#'
    uid = request.args.get('uid', None)
    if uid:
        count, results = xapian_search_weibo.search(query={'user': int(uid)}, sort_by=['timestamp'], \
                fields=['id', 'user', 'text', 'reposts_count', 'comments_count', 'geo', 'timestamp'])
        for r in results():
            r['weibo_url'] = 'http://weibo.com/'
            r['user_url'] = 'http://weibo.com/u/' + str(uid)
            r['created_at'] = ts2date(r['timestamp'])
            weibos.append(r)
        user_info = acquire_user_by_id(uid)
        if user_info:
            tar_name = user_info['name']
            tar_location = user_info['location']
            tar_profile_image_url = user_info['profile_image_url']
            tar_friends_count = user_info['friends_count']
            tar_followers_count = user_info['followers_count']
            tar_user_url = 'http://weibo.com/u/' + str(uid)
    return render_template('index/weibolist.html', yaosu=yaosu, time_range=time_range, \
                           topic=topic, pointInterval=point_interval, pointIntervals=pointIntervals, \
                           gaishu_yaosus=gaishu_yaosus, deep_yaosus=deep_yaosus, tar_location=tar_location, \
                           tar_profile_image_url=tar_profile_image_url, \
                           statuses=weibos, tar_name=tar_name, tar_friends_count=tar_friends_count, \
                           tar_followers_count=tar_followers_count, tar_user_url=tar_user_url)
def sort_task(user, keyword, status, start_time, end_time, submit_time):
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"term": {"submit_user": user}}
                        ]
                    }
                }
            }
        },
        "size": 10000,
        "sort": {"submit_time": {"order": "desc"}}
    }

    query_list = []
    if keyword:
        keyword_list = keyword.split(',')
        query_list.append({"terms": {"keyword_string": keyword_list}})
    if status != 2:
        query_list.append({"term": {"status": status}})
    if start_time and end_time:
        start_ts = datetime2ts(start_time)
        end_ts = datetime2ts(end_time)
        query_list.append({"range": {"start_time": {"gte": start_ts, "lte": end_ts}}})
        query_list.append({"range": {"end_time": {"gte": start_ts, "lte": end_ts}}})
    if submit_time:
        query_list.append({"term": {"submit_time": submit_time}})

    if query_list:
        query_body["query"]["filtered"]["filter"]["bool"]["must"].extend(query_list)
    #print query_body

    search_results = es.search(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE, body=query_body)["hits"]["hits"]

    results = []
    if search_results:
        for item in search_results:
            iter_item = item['_source']
            tmp = []
            tmp.append(iter_item['search_type'])
            tmp.append(json.loads(iter_item['keyword']))
            tmp.append(ts2datetime(iter_item['start_time']))
            tmp.append(ts2datetime(iter_item['end_time']))
            tmp.append(iter_item['range'])
            tmp.append(ts2date(iter_item['create_time']))
            tmp.append(iter_item['status'])
            tmp.append(iter_item['sort_norm'])
            tmp.append(iter_item['sort_scope'])
            tmp.append(item['_id'])   # task_name
            results.append(tmp)
    return results
def social_sensing_task():
    count = 0
    now_ts = ts2date(time.time())
    while 1:
        temp = r.rpop("task_name")
        if temp:
            print "current_task:", json.loads(temp)[0]
        if not temp:
            print 'the last task:', count
            now_date = ts2date(time.time())
            print 'All tasks Finished:', now_date
            break
        task_detail = json.loads(temp)
        count += 1
        social_sensing(task_detail)
        print json.loads(temp)[0], ':Finished'
def read_uid_weibos(topic, date, windowsize, uid):
    # change
    end_ts = datetime2ts(date)
    start_ts = end_ts - Day * windowsize
    xapian_search_weibo = getXapianWeiboByTopic(topic, start_ts, end_ts)
    query_dict = {'user': uid}
    count, results = xapian_search_weibo.search(query=query_dict, fields=weibo_fields_list)
    if count == 0:
        weibo_list = []
    else:
        weibo_list = []
        for weibo in results():
            wid = weibo['_id']
            uid = weibo['user']
            result = user_search.search_by_id(uid, fields=user_fields_list)
            if result:
                name = result['name']
                location = result['location']
                friends_count = result['friends_count']
                followers_count = result['followers_count']
                created_at = result['created_at']
                statuses_count = result['statuses_count']
                profile_image_url = result['profile_image_url']
            else:
                name = u'未知'
                location = u'未知'
                friends_count = u'未知'
                followers_count = u'未知'
                created_at = u'未知'
                statuses_count = u'未知'
                profile_image_url = u'no'
            text = weibo['text']
            geo = weibo['geo']
            source = weibo['source']
            timestamp = weibo['timestamp']
            date = ts2date(timestamp)
            reposts_count = weibo['reposts_count']
            comments_count = weibo['comments_count']
            weibo_link = weiboinfo2url(uid, wid)
            domain = uid2domain(uid)
            row = [wid, uid, name, location, friends_count, followers_count, created_at,
                   statuses_count, profile_image_url, date, text, geo, source,
                   reposts_count, comments_count, weibo_link]
            weibo_list.append(row)
    sort_weibo_list = sorted(weibo_list, key=lambda x: x[9])
    return sort_weibo_list
def social_sensing_task():
    while 1:
        temp = r.rpop("task_name")
        if not temp:
            now_date = ts2date(time.time())
            print 'All tasks Finished:', now_date
            break
        task_detail = json.loads(temp)
        social_sensing(task_detail)
        print json.loads(temp)[0], ':Finished'
def parseNews(news):
    news_dict = {}
    news = _json_loads(news)
    if not news:
        return {}
    for weibo in news:
        try:
            _id = deal_with(weibo['_id'])
            replies = 1
            weibo['timestamp'] = ts2date(weibo['timestamp'])
            weibo['content168'] = weibo['content168']
            news_dict[_id] = [replies, weibo]
        except:
            continue
    return news_dict
def social_sensing_task():
    # 1. print start info
    count = 0
    current_path = os.getcwd()
    file_path = os.path.join(current_path, 'social_sensing.py')
    now_ts = ts2date(time.time())
    print_log = "&".join([file_path, "start", now_ts])
    # print print_log   # print the start info

    while 1:
        temp = r.rpop("task_name")
        if not temp:
            print count
            now_ts = str(int(time.time()))
            print_log = "&".join([file_path, "end", now_ts])
            break   # all tasks in task_list finished
        task_detail = json.loads(temp)
        count += 1
        social_sensing(task_detail)
def parseWeibos(weibos):
    weibo_dict = {}
    weibos = _json_loads(weibos)
    if not weibos:
        return {}
    for weibo in weibos:
        try:
            _id = weibo['_id']
            username, profileimage = getuserinfo(weibo['user'])   # get username and profile_image_url
            reposts_count = weibo['reposts_count']
            weibo['weibo_link'] = weiboinfo2url(weibo['user'], _id)
            weibo['name'] = username
            weibo['profile_image_url'] = profileimage
            weibo['date'] = ts2date(weibo['timestamp'])
            weibo_dict[_id] = [reposts_count, weibo]
        except:
            continue
    return weibo_dict
def get_propagate_peak_news(topic, start_ts, end_ts):
    lis = []
    ts_lis = []
    total_days = (end_ts - start_ts) / During
    for i in range(total_days + 1):
        ts = start_ts + During * i
        count = 0
        for k, v in mtype_kv.iteritems():
            dcount = ReadPropagateNews(topic, ts, During, v)
            if dcount:
                count += sum(dcount['dcount'].values())
        lis.append(float(count))
        ts_lis.append(ts2date(ts))
    if not lis or not len(lis):
        return {}
    new_zeros = detect_peaks(lis)
    time_lis = {}
    for idx, point_idx in enumerate(new_zeros):
        timestamp = ts_lis[point_idx]
        time_lis[idx] = {'ts': timestamp, 'title': 'E' + str(idx)}
    return {'ts': ts_lis, 'count_list': lis, 'peak': time_lis}
def community_result(community_user_list, topic, date, windowsize):
    #change
    end_ts = datetime2ts(date)
    start_ts = end_ts - windowsize * Day
    xapian_search_weibo = getXapianWeiboByTopic(topic, start_ts, end_ts)
    query_dict = {'$or': []}
    for uid in community_user_list:
        query_dict['$or'].append({'user': int(uid)})
    community_info = []
    count, weibo_results = xapian_search_weibo.search(query=query_dict, fields=weibo_fields_list)
    if count == 0:
        return None, None, None
    sentiment_count = {}
    for weibo in weibo_results():
        uid = weibo['user']
        _id = weibo['_id']
        result = user_search.search_by_id(uid, fields=user_fields_list)
        if result:
            name = result['name']
            location = result['location']
            friends_count = result['friends_count']
            followers_count = result['followers_count']
            created_at = result['created_at']
            statuses_count = result['statuses_count']
            profile_image_url = result['profile_image_url']
        else:
            name = u'未知'
            location = u'未知'
            friends_count = u'未知'
            followers_count = u'未知'
            created_at = u'未知'
            statuses_count = u'未知'
            profile_image_url = u'no'
        text = weibo['text']
        timestamp = weibo['timestamp']
        date = ts2date(timestamp)
        reposts_count = weibo['reposts_count']
        source = weibo['source']
        geo = weibo['geo']
        comments_count = weibo['comments_count']
        sentiment = weibo['sentiment']
        sentiment_name = emotions_kv[sentiment]
        weibo_link = weiboinfo2url(uid, _id)
        domain = uid2domain(uid)
        try:
            sentiment_count[sentiment] += 1
        except KeyError:
            sentiment_count[sentiment] = 1
        community_info.append([_id, name, location, friends_count, followers_count, created_at,
                               statuses_count, profile_image_url, text, date, reposts_count,
                               source, geo, comments_count, sentiment_name, weibo_link, domain])
    sort_community_info = sorted(community_info, key=lambda x: x[10], reverse=True)   # sort by repost count
    mset = xapian_search_weibo.search(query=query_dict, max_offset=50, mset_direct=True)
    top_keyword = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=50)
    sort_top_keyword = sorted(top_keyword, key=lambda x: x[1], reverse=True)
    new_sentiment_list = []
    for sentiment in sentiment_count:
        sentiment_ch = emotions_kv[int(sentiment)]
        num = sentiment_count[sentiment]
        ratio = float(num) / float(count)
        new_sentiment_list.append([sentiment_ch, num, ratio])
    return sort_community_info, sort_top_keyword, new_sentiment_list, query_dict
def key_words_search(task_id, search_type, pre, during, start_time, keyword_list, search_key='',
                     sort_norm='', sort_scope='', time=7, isall=False, number=100):
    number = int(number)
    should = []
    for key in keyword_list:
        if search_type == "hashtag":
            should.append({"prefix": {"text": "#" + key + "#"}})
        else:
            should.append({"wildcard": {"text": "*" + key + "*"}})
    index_list = []
    date = ts2datetime(start_time)
    index_name = pre + date
    while during:
        if es_flow_text.indices.exists(index=index_name):
            index_list.append(index_name)
        start_time = start_time + DAY
        date = ts2datetime(start_time)
        index_name = pre + date
        during -= 1
    print index_list
    uid_set = set()
    text_results = []
    sorted_text_results = []
    query_body = {
        "query": {
            "bool": {
                "must": should
            }
        },
        "sort": {"user_fansnum": {"order": "desc"}},
        "size": 5000
    }
    results = es_flow_text.search(index=index_list, doc_type='text', body=query_body, _source=False,
                                  fields=["uid", "user_fansnum", "text", "message_type", "sentiment",
                                          "timestamp", "geo", "retweeted", "comment"])["hits"]["hits"]
    id_index = 0
    index_list = []
    un_uid_list = []
    for item in results:
        if item['fields']['uid'][0] not in uid_set:
            uid_set.add(item['fields']['uid'][0])
            un_uid_list.append(item['fields']['uid'][0])
            index_list.append(id_index)
        id_index += 1
    #get_all_filed(sort_norm, time)
    uid_list = []
    print "un_uid_list: ", len(un_uid_list)
    portrait_list = []
    count = 0
    in_index = 0
    if not isall and un_uid_list:   # users inside the portrait library
        portrait_results = es_user_portrait.mget(index=USER_INDEX_NAME, doc_type=USER_INDEX_TYPE,
                                                 body={"ids": un_uid_list}, _source=False, fields=['uname'])["docs"]
        for item in portrait_results:
            if item["found"]:
                portrait_list.append(item['_id'])
                nick_name = item['fields']['uname'][0]
                if nick_name == 'unknown':
                    nick_name = item['_id']
                index = index_list[in_index]
                weibo_url = weiboinfo2url(results[index]['fields']['uid'][0], results[index]['_id'])
                # append the full row; the original used extend, which flattened the fields
                text_results.append([results[index]['fields']['uid'][0], results[index]['fields']['user_fansnum'][0],
                                     results[index]['fields']['text'][0], results[index]['fields']['message_type'][0],
                                     results[index]['fields']['sentiment'][0], ts2date(results[index]['fields']['timestamp'][0]),
                                     results[index]['fields']['geo'][0], results[index]['fields']['retweeted'][0],
                                     results[index]['fields']['comment'][0], nick_name, weibo_url])
                count += 1
                if count == number:
                    break
            print "portrait_len, ", len(portrait_list)
            in_index += 1
        if portrait_list:
            uid_list = in_sort_filter(time, sort_norm, sort_scope, None, portrait_list, True, number)   # sort
            for iter_uid in uid_list:
                iter_index = portrait_list.index(iter_uid)
                sorted_text_results.append(text_results[iter_index])
    elif un_uid_list:
        profile_result = es_user_profile.mget(index="weibo_user", doc_type="user",
                                              body={"ids": un_uid_list}, fields=['nick_name'])["docs"]
        for i in range(len(profile_result)):
            index = index_list[i]
            try:
                nick_name = profile_result[i]['fields']['nick_name'][0]
            except:
                nick_name = un_uid_list[i]
            item = results[index]
            weibo_url = weiboinfo2url(item['fields']['uid'][0], results[index]['_id'])
            text_results.append([item['fields']['uid'][0], item['fields']['user_fansnum'][0],
                                 item['fields']['text'][0], item['fields']['message_type'][0],
                                 item['fields']['sentiment'][0], ts2date(item['fields']['timestamp'][0]),
                                 results[index]['fields']['geo'][0], results[index]['fields']['retweeted'][0],
                                 results[index]['fields']['comment'][0], nick_name, weibo_url])
            if i == number:
                break
        uid_list = all_sort_filter(un_uid_list[:number], sort_norm, time, True, number)
        sorted_text_results = []
        f = open("small.txt", "wb")
        for iter_uid in uid_list:
            iter_index = un_uid_list.index(iter_uid)
            f.write(str(iter_uid) + "\n")
            sorted_text_results.append(text_results[iter_index])
        f.close()
    print "filter_uid_list: ", len(uid_list)
    if uid_list:
        results = make_up_user_info(uid_list, isall, time, sort_norm)
    else:
        results = []
    print "results: ", len(results)
    # update the task status
    task_detail = es_user_portrait.get(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE, id=task_id)
    item = task_detail['_source']
    item['status'] = 1
    item['result'] = json.dumps(results)
    item['text_results'] = json.dumps(sorted_text_results)
    item['number'] = len(results)
    es_user_portrait.index(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE, id=task_id, body=item)
    return "1"
def get_pushers(topic, new_peaks, new_bottom, ts_list, topic_xapian_id):
    #unit = 900
    #p_during = Hour
    p_ts_list = []
    results = []
    end_ts = ts_list[new_peaks[0]]
    begin_ts = ts_list[new_bottom[0]]
    print 'pusher_start_ts:', ts2date(begin_ts)
    print 'pusher_end_ts:', ts2date(end_ts)
    if begin_ts > end_ts:
        begin_ts = ts_list[0]
    interval = (end_ts - begin_ts) / p_during
    for i in range(interval, 0, -1):
        begin_ts = end_ts - p_during * i
        over_ts = begin_ts + p_during
        p_ts_list.append(over_ts)
        items = db.session.query(PropagateCount).filter(PropagateCount.topic==topic ,\
                                                        PropagateCount.end<=over_ts ,\
                                                        PropagateCount.end>begin_ts ,\
                                                        PropagateCount.range==unit).all()
        if items:
            result = Merge_propagate(items)
        else:
            result = 0
        results.append(float(result))
    #print 'pusher_line:', results
    max_k_timestamp = get_max_k_timestamp(results, p_ts_list)   # find the time point of fastest growth
    #save max_k_timestamp
    # save_mak_k(max_k_timestamp)
    end = max_k_timestamp
    start = max_k_timestamp - p_during
    xapian_search_weibo = getXapianWeiboByTopic(topic_xapian_id)
    query_dict = {
        'timestamp': {'$gt': end, '$lt': end + 3600}
    }
    '''
    count, results = xapian_search_weibo.search(query=query_dict, fields=['_id', 'user', 'retweeted_uid', 'retweeted_mid', 'timestamp'])
    ruid_count = {}
    ruid_mid = {}
    for result in results():
        r_uid = result['retweeted_uid']
        if (r_uid == 0) or (not r_uid):
            continue
        ruid_mid[r_uid] = result['retweeted_mid']
        try:
            ruid_count[r_uid] += 1
        except KeyError:
            ruid_count[r_uid] = 1
    sorted_pushers = sorted(ruid_count.items(), key=lambda d:d[1], reverse=True)
    print 'top_trend_pusher_uid:', sorted_pushers
    pusher_list = []
    for pusher in sorted_pushers:
        uid = pusher[0]
        mid = ruid_mid[uid]
        value = pusher[1]
    '''
    # The block above located the source users of all retweets in the steepest-growth window,
    # but their profile information could not be found, so the approach below is used instead:
    # find the most-retweeted users among all weibos posted in the steepest-growth window.
    count, results = xapian_search_weibo.search(query=query_dict, sort_by=['reposts_count'], fields=['_id', 'user', 'reposts_count'])
    print 'pusher_search_count:', count
    print 'pusher_query_dict:', query_dict
    pusher_list = []
    count = 0
    for result in results():
        count += 1
        if count > 100:
            break
        wid = result['_id']
        uid = result['user']
        value = result['reposts_count']
        pusher_list.append((uid, wid, value))
    # sort by reposts_count
    # sort_by_rc(pusher_list)
    return pusher_list
        # fragment: body of the rpop loop, presumably inside reducer()
        # (the function header and the loop opening are truncated in this snippet)
        user_set = r_flow.rpop('update_bci_list')
        bulk_action = []
        if user_set:
            items = json.loads(user_set)
            uid_list = []
            for item in items:
                uid_list.append(item['id'])
            if uid_list:
                search_results = es.mget(index=BCIHIS_INDEX_NAME, doc_type=BCIHIS_INDEX_TYPE, body={"ids": uid_list})["docs"]
                cal_num_for_bci_history(uid_list, items, search_results)
                count += len(uid_list)
                if count % 10000 == 0:
                    te = time.time()
                    #print "count: %s, cost time: %s" % (count, te - ts)
                    ts = te
        else:
            print count
            break


if __name__ == "__main__":
    time.sleep(100)
    ts = time.time()
    print "all_bci&start&%s" % ts2date(ts)
    try:
        reducer()
    except Exception, e:
        print e, '&error&', ts2date(time.time())
    print "all_bci&end&%s" % ts2date(time.time())
from parameter import TIME_INTERVAL

def get_queue_index(timestamp):
    time_struc = time.gmtime(float(timestamp))
    hour = time_struc.tm_hour
    minute = time_struc.tm_min
    index = hour * 4 + math.ceil(minute / 15.0)  # every 15 minutes
    return int(index)

if __name__ == "__main__":
    now_ts = time.time()
    date_ts = datetime2ts(ts2datetime(now_ts))
    if now_ts - TIME_INTERVAL < date_ts:
        sys.exit(0)
    tmp_date = ts2date(now_ts)
    print "cron_influence_start_" + tmp_date
    index = get_queue_index(now_ts)  # time slot index for the current timestamp
    influence_ts = "influence_timestamp_" + str(index)
    scan_cursor = 0
    count = 0
    while 1:
        re_scan = r.hscan(influence_ts, scan_cursor, count=1000)
        scan_cursor = re_scan[0]
        detail = re_scan[1]
        if len(detail):
            for k, v in detail.iteritems():
                r.zadd(influence_ts, v, k)
                count += 1
        if int(scan_cursor) == 0:
            break
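# get_queue_index() maps a UTC timestamp to a 15-minute slot index of the form
# hour*4 + ceil(minute/15). A few hand-computed checks (illustrative only, not from
# the project); note that exact quarter-hour boundaries fall into the lower slot,
# while any minute past a boundary rounds up to the next one.
import calendar

def _check_queue_index():
    assert get_queue_index(calendar.timegm((2016, 4, 18, 0, 0, 0, 0, 0, 0))) == 0    # 00:00
    assert get_queue_index(calendar.timegm((2016, 4, 18, 0, 1, 0, 0, 0, 0))) == 1    # 00:01
    assert get_queue_index(calendar.timegm((2016, 4, 18, 10, 30, 0, 0, 0, 0))) == 42  # 10:30
    assert get_queue_index(calendar.timegm((2016, 4, 18, 23, 59, 0, 0, 0, 0))) == 96  # 23:59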
def get_pushers(topic, new_peaks, new_bottom, ts_list): #unit = 900 #p_during = Hour p_ts_list = [] results = [] end_ts = ts_list[new_peaks[0]] begin_ts = ts_list[new_bottom[0]] print 'pusher_start_ts:', ts2date(begin_ts) print 'pusher_end_ts:', ts2date( end_ts) #有两个时间 起点和终点 波峰和波谷 现在搞反了 不知道为什么 if begin_ts > end_ts: begin_ts = ts_list[0] interval = (end_ts - begin_ts) / p_during print end_ts - begin_ts print p_during print interval for i in range(interval, 0, -1): begin_ts = end_ts - p_during * i over_ts = begin_ts + p_during #print '383',begin_ts,over_ts p_ts_list.append(over_ts) items = db.session.query(PropagateCount).filter(PropagateCount.topic==topic ,\ PropagateCount.end<=over_ts ,\ PropagateCount.end>begin_ts ,\ PropagateCount.range==unit).all() if items: result = Merge_propagate(items) else: result = 0 results.append(float(result)) #print 'pusher_line:', results #try: print results print p_ts_list try: max_k_timestamp = get_max_k_timestamp(results, p_ts_list) # 获取增速最快的时间点 except: max_k_timestamp = end_ts #save max_k_timestamp # save_mak_k(max_k_timestamp) end = max_k_timestamp start = max_k_timestamp - p_during query_body = { 'query': { 'bool': { 'must': # {'term':{'name': topic}}, { 'range': { 'timestamp': { 'gte': end, 'lt': end + 3600 } #3600 } } } }, 'size': 1000000, # 返回条数限制 待删 'sort': { "timestamp": { "order": "asc" } } } es_search_weibos = weibo_es.search(index=topic, doc_type=weibo_index_type, body=query_body)['hits']['hits'] #xapian_search_weibo = getXapianWeiboByTopic(topic_xapian_id) #query_dict = { #'timestamp':{'$gt':end, '$lt':end+3600} #} #以下是通过找到斜率最大的时间段内所有微博中转发数最大的用户 #results = xapian_search_weibo.search(query=query_dict, sort_by=['reposts_count'], fields=['_id', 'user', 'reposts_count']) results = es_search_weibos print 'pusher_search_count:', len(results) #print 'pusher_query_dict:', query_dict pusher_list = [] count = 0 for result in results: count += 1 if count > 100: break wid = result['_source']['mid'] uid = result['_source']['uid'] value = result['_source']['retweeted'] pusher_list.append((uid, wid, value)) # sort by reposts_count # sort_by_rc(pusher_list) return pusher_list
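# The ES query above sorts by timestamp ascending and then keeps the first 100 hits, while
# the older xapian version it replaces sorted by reposts_count. If the intent is still
# "most-retweeted weibo in the window", the sort can be pushed into ES. A hedged sketch,
# reusing the 'retweeted' field read in the loop above (whether that field is mapped as
# sortable is an assumption):
def pusher_query_sketch(end):
    return {
        'query': {
            'bool': {
                'must': {
                    'range': {'timestamp': {'gte': end, 'lt': end + 3600}}
                }
            }
        },
        'size': 100,                                 # only the top 100 are used anyway
        'sort': {'retweeted': {'order': 'desc'}}     # most retweeted first
    }
# es_search_weibos = weibo_es.search(index=topic, doc_type=weibo_index_type,
#                                    body=pusher_query_sketch(end))['hits']['hits']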
            one_item['today_bci'] = 0
            one_item['update_time'] = TODAY_TIME
            array.append(one_item)
            count += 1
            if count % 1000 == 0:
                r_flow.lpush('update_bci_list', json.dumps(array))
                array = []
            if count % 100000 == 0:
                print count
        except StopIteration:
            print "all done"
            if array:
                r_flow.lpush('update_bci_list', json.dumps(array))
            break
    print count

if __name__ == '__main__':
    todaydate = ts2datetime(time.time())
    #todaydate = '2016-04-19'
    print todaydate
    print "push_bci_redis&start&%s" % ts2date(time.time())
    ts = datetime2ts(todaydate)
    #print es_user_profile.indices.put_mapping(index="bci_history", doc_type="bci", body={'properties':{"user_friendsnum":{"type":"long"}}})
    try:
        mapper_bci_today(todaydate)
    except Exception, e:
        print e, '&error&', ts2date(time.time())
    print "push_bci_redis&end&%s" % ts2date(time.time())
                one_item['user_fansnum'] = temp['fields']["user_fansnum"][0]
            else:
                one_item['user_fansnum'] = 0
            array.append(one_item)
            count += 1
            if count % 1000 == 0:
                r_flow.lpush('update_bci_list', json.dumps(array))
                array = []
            if count % 100000 == 0:
                print count
        except StopIteration:
            print "all done"
            if array:
                r_flow.lpush('update_bci_list', json.dumps(array))
            break
    print count

if __name__ == '__main__':
    todaydate = ts2datetime(time.time())
    todaydate = "2016-04-18"  # hard-coded test date; overrides the value computed above
    print todaydate
    lenth = r_flow.llen('update_bci_list')
    if not lenth:
        print "cron/scan/history_bci&start&%s" % ts2date(time.time())
        mapper_bci_history(todaydate)
        print "cron/scan/history_bci&end&%s" % ts2date(time.time())
    else:
        time.sleep(60)
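# Both mapper scripts above share the same batching pattern: accumulate scan results in a
# list and lpush a JSON-encoded batch of 1000 items onto 'update_bci_list'. A minimal
# generic sketch of that pattern (the helper name and default batch size are illustrative):
def push_in_batches(r_conn, queue_name, items, batch_size=1000):
    batch = []
    for item in items:
        batch.append(item)
        if len(batch) == batch_size:
            r_conn.lpush(queue_name, json.dumps(batch))
            batch = []
    if batch:                                        # flush the final partial batch
        r_conn.lpush(queue_name, json.dumps(batch))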
    new_mapping_dict = {}
    for uid in mapping_dict:
        user_list = json.loads(mapping_dict[uid])
        user_list[1] = '4'
        new_mapping_dict[uid] = json.dumps(user_list)
    r.hmset(hash_name, new_mapping_dict)

# used to handle the compute-fail situation
def change_status_compute_fail(mapping_dict):
    hash_name = 'compute'
    status = 1
    new_mapping_dict = {}
    for uid in mapping_dict:
        user_list = json.loads(mapping_dict[uid])
        user_list[1] = '1'
        new_mapping_dict[uid] = json.dumps(user_list)
    r.hmset(hash_name, new_mapping_dict)

if __name__ == '__main__':
    log_time_ts = int(time.time())
    print 'cron/text_attribute/scan_compute_redis_imm.py&start&' + str(log_time_ts)
    try:
        scan_compute_redis()
    except Exception, e:
        print e, '&error&', ts2date(time.time())
    log_time_ts = int(time.time())
    print 'cron/text_attribute/scan_compute_redis_imm.py&end&' + str(log_time_ts)
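# In the 'compute' hash each value is a JSON-encoded list whose second element is a status
# code; the two helpers above rewrite it to '4' and '1' respectively (reading '1' as
# "waiting to be computed" is an assumption based on the fail handler). A small usage
# sketch for resetting a single uid, using the same redis client r:
def reset_one_failed_uid(uid):
    raw = r.hget('compute', uid)
    if not raw:
        return
    user_list = json.loads(raw)
    user_list[1] = '1'                      # assumed: back to "waiting for compute"
    r.hset('compute', uid, json.dumps(user_list))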
        network_es_mappings()
        network_count_es_mappings()
    tmp_file = tempfile.NamedTemporaryFile(delete=False)
    print 'step 1: scan', ts2date(timestamp)
    scan_retweet(tmp_file)
    tmp_file.close()
    if not tmp_file:
        return
    input_tmp_path = tmp_file.name
    print input_tmp_path
    ITER_COUNT = 10
    TOP_N = 50
    print 'step 2: pagerank', ts2date(time.time())
    all_uids_count, dg_sorted_uids, pr_sorted_uids = pagerank(ITER_COUNT, input_tmp_path, TOP_N, 'all')
    print 'step 3: save', ts2date(time.time())
    save_count_results(all_uids_count, es_num)
    save_dg_pr_results(dg_sorted_uids, es_num, 'dg')
    save_dg_pr_results(pr_sorted_uids, es_num, 'pr')
    print 'save done', ts2date(time.time())

if __name__ == '__main__':
    try:
        pagerank_rank()
    except Exception, e:
        print e, '&error&', ts2date(time.time())
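# Note that 'if not tmp_file' above can never trigger: a closed NamedTemporaryFile object is
# still truthy. If the intent is to stop when scan_retweet() wrote nothing, checking the file
# size is one way to do it. A hedged sketch (the empty-file interpretation is an assumption):
import os
import tempfile

def scan_to_tmp_file_sketch():
    tmp_file = tempfile.NamedTemporaryFile(delete=False)
    try:
        scan_retweet(tmp_file)
    finally:
        tmp_file.close()
    if os.path.getsize(tmp_file.name) == 0:   # nothing scanned: clean up and stop
        os.unlink(tmp_file.name)
        return None
    return tmp_file.name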
def update_flow_information(user_info):
    # results = {uid: {'activity_geo_dict':'', 'activity_geo':'', 'hashtag_dict':'', 'hashtag':'', 'online_pattern_dict':'', 'online_pattern':''}}
    results = {}
    uid_list = user_info.keys()
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    timestamp = datetime2ts(now_date)
    #test
    timestamp = datetime2ts('2013-09-08')
    user_hashtag_dict = dict()
    user_online_dict = dict()
    ip_user_count_dict = {}
    new_day_ip_dict = dict()
    new_day_geo_list = []        # defaults, since the i == 0 branch below may never run
    activity_geo_string = ''
    for i in range(7, 0, -1):
        ts = timestamp - 24 * 3600 * i
        print 'iter date:', ts2date(ts)
        hashtag_results = r_cluster.hmget('hashtag_' + str(ts), uid_list)  # per-day hashtag values for uid_list
        online_pattern_results = r_cluster.hmget('online_' + str(ts), uid_list)
        if i == 0:  # note: i never reaches 0 in range(7, 0, -1), so this branch does not run as written
            ip_result = r_cluster.hmget('hashtag_' + str(ts), uid_list)
        for j in range(0, len(uid_list)):
            uid = uid_list[j]
            #attr: hashtag
            if hashtag_results[j]:
                hashtag_dict = json.loads(hashtag_results[j])
                for hashtag in hashtag_dict:
                    if uid in user_hashtag_dict:
                        try:
                            user_hashtag_dict[uid][hashtag] += hashtag_dict[hashtag]
                        except:
                            user_hashtag_dict[uid][hashtag] = hashtag_dict[hashtag]
                    else:
                        user_hashtag_dict[uid] = {hashtag: hashtag_dict[hashtag]}
            '''
            #attr: online_pattern
            if online_pattern_results[j]:
                online_pattern_dict = json.loads(online_pattern_results[j])
                for online_pattern in online_pattern_dict:
                    if uid in user_online_dict:
                        try:
                            user_online_dict[uid][online_pattern] += online_pattern_dict[online_pattern]
                        except:
                            user_online_dict[uid][online_pattern] = online_pattern_dict[online_pattern]
                    else:
                        user_online_dict[uid] = {online_pattern: online_pattern_dict[online_pattern]}
            '''
            #attr: activity_geo by ip-timestamp
            if i == 0 and ip_result[j]:
                ip_timestamp_dict = json.loads(ip_result[j])
                old_flow_information = user_info[uid]
                old_day_geo_list = json.loads(old_flow_information['activity_geo_dict'])
                for ip in ip_timestamp_dict:
                    ip_count = len(ip_timestamp_dict[ip].split('&'))
                    new_day_ip_dict.setdefault(uid, {})[ip] = ip_count
                geo_dict = ip2city(new_day_ip_dict[uid])
                if len(old_day_geo_list) >= 30:
                    new_day_geo_list = old_day_geo_list[1:] + [geo_dict]
                else:
                    new_day_geo_list = old_day_geo_list + [geo_dict]
                week_geo_list = []
                week_day_geo_list = new_day_geo_list[-7:]
                for day_geo_dict in week_day_geo_list:
                    week_geo_list.extend(day_geo_dict.keys())
                week_geo_list = list(set(week_geo_list))
                new_week_geo_list = []
                for geo_string in week_geo_list:
                    day_geo_string = '&'.join(geo_string.split('\t'))
                    new_week_geo_list.append(day_geo_string)
                activity_geo_string = '&'.join(new_week_geo_list)
                print 'activity_geo_string:', activity_geo_string
    for uid in uid_list:
        #attr: hashtag
        try:
            hashtag_dict = user_hashtag_dict[uid]
            hashtag_string = json.dumps(hashtag_dict)
            hashtag_list = '&'.join(hashtag_dict.keys())
        except KeyError:
            hashtag_string = ''
            hashtag_list = ''
        '''
        #attr: online_pattern
        try:
            online_dict = user_online_dict[uid]
            online_string = json.dumps(online_dict)
            online_list = '&'.join(online_dict.keys())
        except KeyError:
            online_string = ''
            online_list = ''
        '''
        # the online_pattern computation is commented out above, so store empty values
        online_pattern_string = ''
        online_pattern_list = ''
        results[uid] = {'hashtag_dict': hashtag_string, 'hashtag': hashtag_list, \
                        'activity_geo_dict': json.dumps(new_day_geo_list), 'activity_geo': activity_geo_string, \
                        'online_pattern_dict': online_pattern_string, 'online_pattern': online_pattern_list}
    return results
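# The nested try/except above accumulates per-day hashtag counts into user_hashtag_dict.
# A compact equivalent of that accumulation step using collections.Counter (a sketch of
# the same idea for a single uid, not the project's code):
from collections import Counter

def merge_hashtag_counts(daily_hashtag_dicts):
    # daily_hashtag_dicts: iterable of {hashtag: count} dicts for one uid
    total = Counter()
    for day_dict in daily_hashtag_dicts:
        total.update(day_dict)          # adds counts hashtag by hashtag
    return dict(total)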
def social_sensing(task_detail): # 任务名 传感器 终止时间 之前状态 创建者 时间 task_name = task_detail[0] social_sensors = task_detail[1] stop_time = task_detail[2] create_by = task_detail[3] ts = int(task_detail[4]) print ts2date(ts) # PART 1 #forward_result = get_forward_numerical_info(task_name, ts, create_by) # 之前时间阶段内的原创微博list/retweeted forward_origin_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range) forward_retweeted_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range, 3) # 当前阶段内原创微博list current_mid_list = query_mid_list(ts, social_sensors, time_interval) current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3) all_mid_list = [] all_mid_list.extend(current_mid_list) all_mid_list.extend(current_retweeted_mid_list) all_mid_list.extend(forward_origin_weibo_list) all_mid_list.extend(forward_retweeted_weibo_list) all_origin_list = [] all_origin_list.extend(current_mid_list) all_origin_list.extend(forward_origin_weibo_list) all_retweeted_list = [] all_retweeted_list.extend(current_retweeted_mid_list) all_retweeted_list.extend(forward_retweeted_weibo_list)#被转发微博的mid/root-mid print "all mid list: ", len(all_mid_list) #print "all_origin_list", all_origin_list #print "all_retweeted_list", all_retweeted_list # 查询微博在当前时间内的转发和评论数, 聚合按照message_type statistics_count = query_related_weibo(ts, all_mid_list, time_interval) if all_origin_list: origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval) # 原创微博详情 else: origin_weibo_detail = {} if all_retweeted_list: retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval) # 转发微博详情 else: retweeted_weibo_detail = {} current_total_count = statistics_count['total_count'] # 当前阶段内所有微博总数 current_retweeted_count = statistics_count['retweeted'] current_comment_count = statistics_count['comment'] # 聚合当前时间内重要的人 important_uid_list = [] datetime = ts2datetime(ts-time_interval) index_name = flow_text_index_name_pre + datetime exist_es = es_text.indices.exists(index_name) if exist_es: search_results = get_important_user(ts, all_mid_list, time_interval) important_uid_list = search_results # 根据获得uid_list,从人物库中匹配重要人物 if important_uid_list: important_results = es_user_portrait.mget(index=portrait_index_name,doc_type=portrait_index_type, body={"ids": important_uid_list})['docs'] else: important_results = [] filter_important_list = [] # uid_list if important_results: for item in important_results: if item['found']: #if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD: filter_important_list.append(item['_id']) print "filter_important_list", filter_important_list print "important_results", important_uid_list #判断感知 finish = unfinish_signal # "0" process_status = "1" if int(stop_time) <= ts: # 检查任务是否已经完成 finish = finish_signal process_status = "0" # 感知到的事, all_mid_list sensitive_text_list = [] # 有事件发生时开始 if 1: index_list = [] important_words = [] datetime_1 = ts2datetime(ts) index_name_1 = flow_text_index_name_pre + datetime_1 exist_es = es_text.indices.exists(index=index_name_1) if exist_es: index_list.append(index_name_1) datetime_2 = ts2datetime(ts-DAY) index_name_2 = flow_text_index_name_pre + datetime_2 exist_es = es_text.indices.exists(index=index_name_2) if exist_es: index_list.append(index_name_2) if index_list and all_mid_list: query_body = { "query":{ "filtered":{ "filter":{ "terms":{"mid": all_mid_list} } } }, "size": 5000 } search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits'] tmp_sensitive_warning = "" 
            text_dict = dict()            # text content
            mid_value = dict()            # per-mid topic value
            duplicate_dict = dict()       # duplicate mapping
            portrait_dict = dict()        # user background info
            classify_text_dict = dict()   # text for classification
            classify_uid_list = []
            duplicate_text_list = []
            sensitive_words_dict = dict()
            if search_results:
                for item in search_results:
                    iter_uid = item['_source']['uid']
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text'].encode('utf-8', 'ignore')
                    iter_sensitive = item['_source'].get('sensitive', 0)
                    duplicate_text_list.append({"_id": iter_mid, "title": "", "content": iter_text})
                    if iter_sensitive:
                        tmp_sensitive_warning = signal_sensitive_variation  # weibo that hits sensitive words
                        sensitive_words_dict[iter_mid] = iter_sensitive
                    keywords_dict = json.loads(item['_source']['keywords_dict'])
                    personal_keywords_dict = dict()
                    for k, v in keywords_dict.iteritems():
                        k = k.encode('utf-8', 'ignore')
                        personal_keywords_dict[k] = v
                    classify_text_dict[iter_mid] = personal_keywords_dict
                    classify_uid_list.append(iter_uid)
            # deduplication
            if duplicate_text_list:
                dup_results = duplicate(duplicate_text_list)
                for item in dup_results:
                    if item['duplicate']:
                        duplicate_dict[item['_id']] = item['same_from']
            # topic classification
            if classify_text_dict:
                classify_results = topic_classfiy(classify_uid_list, classify_text_dict)
                mid_value = dict()
                #print "classify_results: ", classify_results
                for k, v in classify_results.iteritems():  # mid:value
                    mid_value[k] = topic_value_dict[v[0]]

    sensitive_weibo_detail = {}
    if sensitive_words_dict:
        sensitive_mid_list = sensitive_words_dict.keys()
        sensitive_weibo_detail = query_hot_weibo(ts, sensitive_mid_list, time_interval)

    results = dict()
    results['mid_topic_value'] = json.dumps(mid_value)
    results['duplicate_dict'] = json.dumps(duplicate_dict)
    results['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
    results['sensitive_weibo_detail'] = json.dumps(sensitive_weibo_detail)
    results['origin_weibo_number'] = len(all_origin_list)
    results['retweeted_weibo_number'] = len(all_retweeted_list)
    results['origin_weibo_detail'] = json.dumps(origin_weibo_detail)
    results['retweeted_weibo_detail'] = json.dumps(retweeted_weibo_detail)
    results['retweeted_weibo_count'] = current_retweeted_count
    results['comment_weibo_count'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['important_users'] = json.dumps(filter_important_list)
    results['unfilter_users'] = json.dumps(important_uid_list)
    results['timestamp'] = ts
    #results['clustering_topic'] = json.dumps(topic_list)

    # store this time slice in es
    doctype = create_by + '-' + task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # update the manage social sensing doc in es
    temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)['_source']
    temporal_result['finish'] = finish
    temporal_result['processing_status'] = process_status
    history_status = json.loads(temporal_result['history_status'])
    history_status.append(ts)
    temporal_result['history_status'] = json.dumps(history_status)
    es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result)
    return "1"
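# The get / modify / index round-trip on index_manage_social_task above can also be expressed
# as a partial update. A hedged sketch using the elasticsearch-py update API (same index,
# doc_type and id names as above; whether a partial update is acceptable for this doc is an
# assumption):
def update_task_status_sketch(doctype, ts, finish, process_status, history_status):
    history_status.append(ts)
    es_user_portrait.update(
        index=index_manage_social_task,
        doc_type=task_doc_type,
        id=doctype,
        body={"doc": {
            "finish": finish,
            "processing_status": process_status,
            "history_status": json.dumps(history_status),
        }}
    )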
    print 'step 2: pagerank', ts2date(time.time())
    all_uids_count, dg_sorted_uids, pr_sorted_uids = pagerank(ITER_COUNT, input_tmp_path, TOP_N, 'keywords')
    #save results
    print 'step 3: save', ts2date(time.time())
    save_mark = save_task_results(dg_sorted_uids, pr_sorted_uids, network_task_information)
    print 'save done', ts2date(time.time())
    #identify save status
    if not save_mark:
        #status fail: push task information to redis queue
        push_mark = push_task_information(network_task_information)
        if not push_mark:
            print 'error push task queue'
    else:
        #if no exist - pass
        pass

if __name__ == '__main__':
    log_time_ts = time.time()
    log_time_date = ts2date(log_time_ts)
    print 'cron/network/cron_network.py&start&' + log_time_date
    try:
        scan_network_keywords_task()
    except Exception, e:
        print e, '&error&', ts2date(time.time())
    log_time_ts = time.time()
    log_time_date = ts2date(log_time_ts)
    print 'cron/network/cron_network.py&end&' + log_time_date
# -*- coding: utf-8 -*-
import os
import time
import sys
reload(sys)
sys.path.append('./../../')
from time_utils import ts2datetime, ts2date
from global_utils import R_SPARK

path = "/home/ubuntu01/txt"
file_list = os.listdir(path)
for each in file_list:
    filename = each.split('.')[0]
    if filename.split('_')[-1] == 'yes3':
        os.remove(path + '/' + each)
R_SPARK.flushdb()
ts = ts2date(time.time())
print "/cron/flow3/del_file_yes.py&end&%s" % ts
print "/cron/flow3/flushdb.py&end&%s" % ts
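# The cleanup above removes every file under /home/ubuntu01/txt whose base name ends with
# '_yes3' and then flushes the R_SPARK redis db. A slightly more defensive sketch of the
# same file sweep (illustrative only; the same naming convention is assumed):
import glob

def remove_done_files(path="/home/ubuntu01/txt", suffix="_yes3"):
    removed = 0
    for full in glob.glob(os.path.join(path, "*%s.*" % suffix)):
        try:
            os.remove(full)
            removed += 1
        except OSError, e:          # file vanished or permission problem: log and continue
            print e
    return removed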
def social_sensing(task_detail): # 任务名 传感器 终止时间 之前状态 创建者 时间 task_name = task_detail[0] social_sensors = task_detail[1] stop_time = task_detail[2] forward_warning_status = task_detail[3] create_by = task_detail[4] ts = int(task_detail[5]) new = int(task_detail[6]) print ts2date(ts) # PART 1 forward_result = get_forward_numerical_info(task_name, ts, create_by) # 之前时间阶段内的原创微博list/retweeted forward_origin_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range) forward_retweeted_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range, 3) # 当前阶段内原创微博list current_mid_list = query_mid_list(ts, social_sensors, time_interval) current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3) all_mid_list = [] all_mid_list.extend(current_mid_list) all_mid_list.extend(current_retweeted_mid_list) all_mid_list.extend(forward_origin_weibo_list) all_mid_list.extend(forward_retweeted_weibo_list) all_origin_list = [] all_origin_list.extend(current_mid_list) all_origin_list.extend(forward_origin_weibo_list) all_retweeted_list = [] all_retweeted_list.extend(current_retweeted_mid_list) all_retweeted_list.extend(forward_retweeted_weibo_list) # 被转发微博的mid/root-mid print "all mid list: ", len(all_mid_list) # print "all_origin_list", all_origin_list # print "all_retweeted_list", all_retweeted_list # 查询微博在当前时间内的转发和评论数, 聚合按照message_type statistics_count = query_related_weibo(ts, all_mid_list, time_interval) if all_origin_list: origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval) # 原创微博详情 else: origin_weibo_detail = {} if all_retweeted_list: retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval) # 转发微博详情 else: retweeted_weibo_detail = {} current_total_count = statistics_count["total_count"] # 当前阶段内所有微博总数 current_retweeted_count = statistics_count["retweeted"] current_comment_count = statistics_count["comment"] # PART 2 # 聚合当前时间内积极、中性、悲伤、愤怒情绪分布 # sentiment_dict = {"0": "neutral", "1":"positive", "2":"sad", "3": "anger"} sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0} search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval) sentiment_count = search_results print "sentiment_count: ", sentiment_count negetive_key = ["2", "3", "4", "5", "6"] negetive_count = 0 for key in negetive_key: negetive_count += sentiment_count[key] # 聚合当前时间内重要的人 important_uid_list = [] datetime = ts2datetime(ts - time_interval) index_name = flow_text_index_name_pre + datetime exist_es = es_text.indices.exists(index_name) if exist_es: search_results = get_important_user(ts, all_mid_list, time_interval) important_uid_list = search_results # 根据获得uid_list,从人物库中匹配重要人物 if important_uid_list: important_results = es_user_portrait.mget( index=portrait_index_name, doc_type=portrait_index_type, body={"ids": important_uid_list} )["docs"] else: important_results = [] filter_important_list = [] # uid_list if important_results: for item in important_results: if item["found"]: # if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD: filter_important_list.append(item["_id"]) # 判断感知 burst_reason = signal_nothing_variation warning_status = signal_nothing finish = unfinish_signal # "0" process_status = "1" if forward_result[0]: # 根据移动平均判断是否有时间发生 mean_count = forward_result[1] std_count = forward_result[2] mean_sentiment = forward_result[3] std_sentiment = forward_result[4] if ( mean_count >= MEAN_COUNT and current_total_count > mean_count + 1.96 * std_count or current_total_count >= len(all_mid_list) * AVERAGE_COUNT 
): # 异常点发生 if forward_warning_status == signal_brust: # 已有事件发生,改为事件追踪 warning_status = signal_track else: warning_status = signal_brust burst_reason += signal_count_varition # 数量异常 if ( negetive_count > mean_sentiment + 1.96 * std_sentiment and mean_sentiment >= MEAN_COUNT or negetive_count >= len(all_mid_list) * AVERAGE_COUNT ): warning_status = signal_brust burst_reason += signal_sentiment_varition # 负面情感异常, "12"表示两者均异常 if forward_warning_status == signal_brust: # 已有事件发生,改为事件追踪 warning_status = signal_track if int(stop_time) <= ts: # 检查任务是否已经完成 finish = finish_signal process_status = "0" # 感知到的事, all_mid_list tmp_burst_reason = burst_reason topic_list = [] sensitive_text_list = [] # 有事件发生时开始 # if warning_status: if 1: index_list = [] important_words = [] datetime_1 = ts2datetime(ts) index_name_1 = flow_text_index_name_pre + datetime_1 exist_es = es_text.indices.exists(index=index_name_1) if exist_es: index_list.append(index_name_1) datetime_2 = ts2datetime(ts - DAY) index_name_2 = flow_text_index_name_pre + datetime_2 exist_es = es_text.indices.exists(index=index_name_2) if exist_es: index_list.append(index_name_2) if index_list and all_mid_list: query_body = {"query": {"filtered": {"filter": {"terms": {"mid": all_mid_list}}}}, "size": 5000} search_results = es_text.search(index=index_list, doc_type="text", body=query_body)["hits"]["hits"] tmp_sensitive_warning = "" text_dict = dict() # 文本信息 mid_value = dict() # 文本赋值 duplicate_dict = dict() # 重合字典 portrait_dict = dict() # 背景信息 classify_text_dict = dict() # 分类文本 classify_uid_list = [] duplicate_text_list = [] sensitive_words_dict = dict() if search_results: for item in search_results: iter_uid = item["_source"]["uid"] iter_mid = item["_source"]["mid"] iter_text = item["_source"]["text"].encode("utf-8", "ignore") iter_sensitive = item["_source"].get("sensitive", 0) duplicate_text_list.append({"_id": iter_mid, "title": "", "content": iter_text}) if iter_sensitive: tmp_sensitive_warning = signal_sensitive_variation # 涉及到敏感词的微博 sensitive_words_dict[iter_mid] = iter_sensitive keywords_dict = json.loads(item["_source"]["keywords_dict"]) personal_keywords_dict = dict() for k, v in keywords_dict.iteritems(): k = k.encode("utf-8", "ignore") personal_keywords_dict[k] = v classify_text_dict[iter_mid] = personal_keywords_dict classify_uid_list.append(iter_uid) # 去重 if duplicate_text_list: dup_results = duplicate(duplicate_text_list) for item in dup_results: if item["duplicate"]: duplicate_dict[item["_id"]] = item["same_from"] # 分类 if classify_text_dict: classify_results = topic_classfiy(classify_uid_list, classify_text_dict) mid_value = dict() # print "classify_results: ", classify_results for k, v in classify_results.iteritems(): # mid:value mid_value[k] = topic_value_dict[v[0]] if tmp_sensitive_warning: warning_status = signal_brust burst_reason += signal_sensitive_variation sensitive_weibo_detail = {} if sensitive_words_dict: sensitive_mid_list = sensitive_words_dict.keys() sensitivie_weibo_detail = query_hot_weibo(ts, sensitive_mid_list, time_interval) results = dict() results["mid_topic_value"] = json.dumps(mid_value) results["duplicate_dict"] = json.dumps(duplicate_dict) results["sensitive_words_dict"] = json.dumps(sensitive_words_dict) results["sensitive_weibo_detail"] = json.dumps(sensitive_weibo_detail) results["origin_weibo_number"] = len(all_origin_list) results["retweeted_weibo_number"] = len(all_retweeted_list) results["origin_weibo_detail"] = json.dumps(origin_weibo_detail) results["retweeted_weibo_detail"] = 
json.dumps(retweeted_weibo_detail) results["retweeted_weibo_count"] = current_retweeted_count results["comment_weibo_count"] = current_comment_count results["weibo_total_number"] = current_total_count results["sentiment_distribution"] = json.dumps(sentiment_count) results["important_users"] = json.dumps(filter_important_list) results["unfilter_users"] = json.dumps(important_uid_list) results["burst_reason"] = tmp_burst_reason results["timestamp"] = ts # results['clustering_topic'] = json.dumps(topic_list) # es存储当前时段的信息 doctype = create_by + "-" + task_name es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results) # 更新manage social sensing的es信息 if not new: temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)[ "_source" ] temporal_result["warning_status"] = warning_status temporal_result["burst_reason"] = tmp_burst_reason temporal_result["finish"] = finish temporal_result["processing_status"] = process_status history_status = json.loads(temporal_result["history_status"]) history_status.append([ts, task_name, warning_status]) temporal_result["history_status"] = json.dumps(history_status) es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result) else: print "test" return "1"
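# The burst check above flags an anomaly when the current weibo count exceeds the moving
# mean by 1.96 standard deviations (or a fixed per-mid average), and likewise for the
# negative-sentiment count. A condensed sketch of that decision rule, reusing the constant
# names MEAN_COUNT and AVERAGE_COUNT from above; it returns the two boolean anomaly flags
# rather than the signal codes:
def burst_flags(current_total_count, negetive_count, n_mids,
                mean_count, std_count, mean_sentiment, std_sentiment):
    count_burst = (mean_count >= MEAN_COUNT and
                   current_total_count > mean_count + 1.96 * std_count) or \
                  current_total_count >= n_mids * AVERAGE_COUNT
    sentiment_burst = (mean_sentiment >= MEAN_COUNT and
                       negetive_count > mean_sentiment + 1.96 * std_sentiment) or \
                      negetive_count >= n_mids * AVERAGE_COUNT
    return count_burst, sentiment_burst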
def save_weibos(excel_name, topic, child_topic_list, w_limit):
    # need to look up the related weibo ids and other fields from the text content here
    data = xlrd.open_workbook(excel_name)
    weibos_dict = {}
    for i in child_topic_list:
        #if i == '0':
        #    continue
        weibos_dict[i] = []
        table_weibos = data.sheet_by_name(str(int(i)))
        n_row_weibos = table_weibos.nrows
        if n_row_weibos <= w_limit:
            n_rows = n_row_weibos
        else:
            n_rows = w_limit  # the rows are already sorted by weight in descending order
        for j in range(n_rows):
            line = table_weibos.row_values(j)
            # still missing: querying the other weibo fields that correspond to this text
            weibo_text = line[1]
            weibo_weight = line[0]
            try:
                weibos_dict[i].append((weibo_text, weibo_weight))  # what should actually be appended is the full weibo record, with username etc. already fetched
            except:
                weibos_dict[i] = [(weibo_text, weibo_weight)]
    #print 'weibos_dict:', weibos_dict
    # fetch concrete weibo data; for testing only
    s = XapianSearch(path='/home/ubuntu3/huxiaoqian/case/20140724/20140804/', name='master_timeline_weibo', schema_version='5')
    begin_ts = 1378050300
    end_ts = 1378051200
    query_dict = {
        'timestamp': {'$gt': begin_ts, '$lt': end_ts},
        'message_type': 2
    }
    weibos_dict_new = {}
    scount, weibo_results = s.search(query=query_dict, fields=fields_list)
    #print 'scount:', scount
    i = 0
    j = 0
    for weibo in weibo_results():
        if i == 11:
            break
        weibo['text'] = weibos_dict[str(i)][j][0]
        # fetch username, profile image and weibo url
        username, profileimage = getuserinfo(weibo['user'])
        weibo['username'] = username
        weibo['profile_image_url'] = profileimage
        weibo['timestamp'] = ts2date(weibo['timestamp'])
        weibo['weibo_link'] = weiboinfo2url(weibo['user'], weibo['_id'])
        # done fetching username, profile image and weibo url
        weight = weibos_dict[str(i)][j][1]
        try:
            weibos_dict_new[i].append((weibo, weight))
        except:
            weibos_dict_new[i] = [(weibo, weight)]
        if j == 4:
            j = 0
            i += 1
        else:
            j += 1
    # ---- dividing line ----
    for i in range(len(child_topic_list)):
        item = OpinionTestWeibos(topic, i, json.dumps(weibos_dict_new[i]))
        item_exist = db.session.query(OpinionTestWeibos).filter(OpinionTestWeibos.topic==topic, \
                                                                OpinionTestWeibos.child_topic==i).first()
        if item_exist:
            db.session.delete(item_exist)
        db.session.add(item)
        db.session.commit()
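# save_weibos() reads each child topic's sheet with xlrd and keeps at most w_limit
# (text, weight) rows. A minimal standalone sketch of that reading step (assuming column 0
# holds the weight and column 1 the text, as in the code above):
import xlrd

def read_topic_rows(excel_name, sheet_name, w_limit):
    book = xlrd.open_workbook(excel_name)
    sheet = book.sheet_by_name(sheet_name)
    n_rows = min(sheet.nrows, w_limit)        # rows are assumed pre-sorted by weight
    rows = []
    for j in range(n_rows):
        line = sheet.row_values(j)
        rows.append((line[1], line[0]))       # (weibo_text, weibo_weight)
    return rows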