def save_trend_maker(topic, date, windowsize, trend_maker, topic_xapian_id):
    xapian_search_weibo = getXapianWeiboByTopic(topic_xapian_id)  # the topic id still needs some preprocessing
    makers = trend_maker
    rank = 0
    user_exist_list = []
    items_exist = db.session.query(TrendMaker).filter(TrendMaker.topic==topic ,\
                                                      TrendMaker.date==date ,\
                                                      TrendMaker.windowsize==windowsize).all()
    for item_exist in items_exist:
        db.session.delete(item_exist)
    db.session.commit()
    for maker in makers:
        uid = maker[0]
        if uid in user_exist_list:
            continue
        user_exist_list.append(uid)
        if rank >= trend_maker_count:
            break
        rank += 1
        wid = maker[1]
        value = maker[2]     # content relevance: number of keyword hits
        key_item = maker[3]  # the keywords that were hit
        user_info = get_user_info(uid)
        weibo_info = xapian_search_weibo.search_by_id(wid, fields=weibo_fields_list)
        #print 'trend_maker weibo_info:', weibo_info
        domain = uid2domain(uid)
        timestamp = int(weibo_info['timestamp'])
        # update the model accordingly
        item = TrendMaker(topic, date, windowsize, uid, timestamp,
                          json.dumps(user_info), json.dumps(weibo_info),
                          domain, rank, value, json.dumps(key_item))
        db.session.add(item)
    db.session.commit()
    print 'save_trend_maker success'
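# Usage sketch (an addition for illustration, not part of the original module):
# save_trend_maker expects trend_maker to be a list of (uid, wid, value, key_item)
# tuples, matching the maker[0]..maker[3] indexing above -- e.g. the output of
# sort_makers() further down. All literal values below are made up.
def _example_save_trend_maker(topic, date, windowsize, topic_xapian_id):
    example_trend_maker = [
        (1234567890, 3679876543211234567, 3, [u'keyword_a', u'keyword_b']),
    ]
    save_trend_maker(topic, date, windowsize, example_trend_maker, topic_xapian_id)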
def save_trend_pusher(topic, date, windowsize, trend_pusher, topic_xapian_id):
    xapian_search_weibo = getXapianWeiboByTopic(topic_xapian_id)  # the topic id still needs some preprocessing
    pushers = trend_pusher
    rank = 0
    user_exist_list = []
    items_exist = db.session.query(TrendPusher).filter(TrendPusher.topic==topic ,\
                                                       TrendPusher.date==date ,\
                                                       TrendPusher.windowsize==windowsize).all()
    for item_exist in items_exist:
        db.session.delete(item_exist)
    db.session.commit()
    for pusher in pushers:
        uid = pusher[0]
        if uid in user_exist_list:
            continue
        user_exist_list.append(uid)
        if rank >= trend_pusher_count:
            break
        rank += 1
        wid = pusher[1]
        user_info = get_user_info(uid)
        weibo_info = xapian_search_weibo.search_by_id(wid, fields=weibo_fields_list)
        domain = uid2domain(uid)
        timestamp = int(weibo_info['timestamp'])
        item = TrendPusher(topic, date, windowsize, uid, timestamp,
                           json.dumps(user_info), json.dumps(weibo_info),
                           domain, rank)
        db.session.add(item)
    db.session.commit()
    print 'save_trend_pusher success'
def main(topic, start_time, end_time):
    start_ts = datetime2ts(start_time)
    end_ts = datetime2ts(end_time) + 24 * 3600
    ## datestrlist = []
    ## for datestr in datestr_list:
    ##     datestr_new = datestr.replace('-', '')
    ##     datestrlist.append(datestr_new)
    query_dict = {
        'timestamp': {'$gt': start_ts, '$lt': end_ts},
    }
    ## t = topic.split(',')
    ## for ctopic in t:
    ##     query_dict['$and'].append({'topics': ctopic})
    start = time.time()
    ## statuses_search = getXapianWeiboByDuration(datestrlist)
    ## count, get_results = statuses_search.search(query=query_dict, fields=fields_list)
    topic_id = getTopicByName(topic)['_id']
    xapian_search_weibo = getXapianWeiboByTopic(topic_id)
    count, get_results = xapian_search_weibo.search(query=query_dict, fields=fields_list)
    end = time.time()
    #print count
    print 'search takes %s s' % (end - start)
    weibo = []
    for r in get_results():
        weibo.append([r['_id'], r['user'], r['text'].encode('utf-8'), r['timestamp'],
                      r['reposts_count'], r['comments_count']])
    ad_main(topic, weibo, '0914', 10)  # start opinion mining on the weibo data
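# Hypothetical driver (illustration only): main() takes a topic name plus
# 'YYYY-mm-dd' date strings, as datetime2ts is used elsewhere in this code base.
# The topic name and dates below are placeholders.
def _example_main():
    main(u'APEC', '2014-11-01', '2014-11-12')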
def get_ds_info(text, userid, topic, timestamp_add, DEFAULT_INTERVAL, topic_xapian_id):
    # timestamp_add is the timestamp of the weibo posted by the final reposting user
    direct_superior_info = {}
    xapian_search_weibo = getXapianWeiboByTopic(topic_xapian_id)
    query_dict = {'user': userid, 'text': text}
    count, result = xapian_search_weibo.search(
        query=query_dict,
        fields=['timestamp', 'comments_count', 'attitude_count', 'reposts_count', 'retweeted_uid'])
    # result is a generator
    if result:
        for rr in result():
            direct_superior_info = rr
    else:
        direct_superior_info['timestamp'] = DEFAULT_INTERVAL + timestamp_add
        direct_superior_info['comments_count'] = u'未知'  # "unknown"
        direct_superior_info['attitude_count'] = u'未知'
        direct_superior_info['reposts_count'] = u'未知'
        direct_superior_info['retweeted_uid'] = None
    return direct_superior_info
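# Usage sketch (illustration only): when the direct-superior weibo cannot be found
# in xapian, get_ds_info falls back to timestamp_add + DEFAULT_INTERVAL and fills
# the count fields with u'未知' (unknown). All literal values below are placeholders.
def _example_get_ds_info(topic, topic_xapian_id):
    one_hour = 3600
    info = get_ds_info(u'some repost text', 1234567890, topic,
                       1425222000, one_hour, topic_xapian_id)
    return info['timestamp'], info['retweeted_uid']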
def sort_makers(keyword_data, begin_ts, end_ts, ts_list, topic_xapian_id):
    '''
    if begin_ts == ts_list[0]:
        start = begin_ts + 2 * Day
        end = begin_ts + 2 * Day + 12 * Hour
    else:
        start = begin_ts + 2 * Day - 6 * Hour
        end = begin_ts + 2 * Day + 12 * Hour
    query_dict = {
        'timestamp': {'$gt': start, '$lt': end}
        }
    print 'sort_maker-query_dict:', query_dict
    xapian_search_weibo = getXapianWeiboByTopic(topic_id='545f4c22cf198b18c57b8014')
    count, search_weibos = xapian_search_weibo.search(query=query_dict, fields=field_list)
    print 'sort_makers:', count
    if count == 0:
        return []
    weibo_term = {}
    for weibo in search_weibos():
        uid = weibo['user']
        wid = weibo['_id']
        terms_list = weibo['terms']
        key_term_count = 0
        for term in terms_list:
            term = term.decode('utf-8')
            #print 'term:', term, type(term)
            #print 'keyword_data:', keyword_data[0], type(keyword_data[0])
            if term in keyword_data:
                key_term_count += 1
        weibo_term[uid] = [wid, key_term_count]
    sort_weibo_term = sorted(weibo_term.items(), key=lambda x: x[1][1], reverse=True)
    '''
    begin_ts = begin_ts - Hour
    query_dict = {'timestamp': {'$gt': begin_ts, '$lt': end_ts}}
    xapian_search_weibo = getXapianWeiboByTopic(topic_id=topic_xapian_id)
    count, search_weibo = xapian_search_weibo.search(query=query_dict,
                                                     sort_by=['-timestamp'],
                                                     fields=field_list)
    num = 0
    if count == 0:
        return []
    weibo_term = {}
    for weibo in search_weibo():
        num += 1
        if num > fu_tr_top_keyword:
            break
        uid = weibo['user']
        wid = weibo['_id']
        terms_list = weibo['terms']
        key_term_count = 0
        key_term = []
        for term in terms_list:
            term = term.decode('utf-8')
            if term in keyword_data:
                key_term_count += 1
                key_term.append(term)
        weibo_term[uid] = [wid, key_term_count, key_term]
    sort_weibo_term = sorted(weibo_term.items(), key=lambda x: x[1][1], reverse=True)
    return sort_weibo_term[:fu_tr_top_keyword]
def get_tsu(new_peaks, new_bottom, ts_list, topic_xapian_id):
    #print 'new_peaks:', new_peaks
    #print 'new_bottom:', new_bottom
    #print 'ts_list:', ts_list
    end_ts = ts_list[new_peaks[0]]
    begin_ts = ts_list[new_bottom[0]]
    if begin_ts > end_ts:
        begin_ts = ts_list[0]
    query_dict = {
        'timestamp': {'$gt': begin_ts, '$lt': end_ts},
        'message_type': 3
    }
    print 'query_dict:', query_dict
    print 'begin_ts:', ts2date(begin_ts)
    print 'end_ts:', ts2date(end_ts)
    xapian_search_weibo = getXapianWeiboByTopic(topic_xapian_id)  # the topic id needs to be taken into account here
    count, results = xapian_search_weibo.search(query=query_dict,
                                                fields=['retweeted_uid', 'retweeted_mid'])
    print 'count:', count
    ruid_count = {}
    ruid_mid = {}
    for result in results():
        r_uid = result['retweeted_uid']
        if (r_uid == 0) or (not r_uid):
            continue
        ruid_mid[r_uid] = result['retweeted_mid']
        try:
            ruid_count[r_uid] += 1
        except KeyError:
            ruid_count[r_uid] = 1
    sorted_result = sorted(ruid_count.items(), key=lambda d: d[1], reverse=True)
    print 'top_source_user:', sorted_result
    '''
    count, top_weibo = xapian_search_weibo.search(query={'_id':top_source_mid}, fields=['timestamp'])
    print 'count:', count
    for i in top_weibo():
        timestamp = i['timestamp']
        print 'timestamp:', ts2date(int(timestamp))
    '''
    return sorted_result
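# Usage sketch (illustration only): new_peaks and new_bottom are lists of indexes
# into ts_list (peak and trough positions of the propagation curve); get_tsu then
# counts how often each retweeted_uid appears among reposts (message_type 3)
# between the first trough and the first peak. The timestamps below are made up.
def _example_get_tsu(topic_xapian_id):
    ts_list = [1425222000 + i * 3600 for i in range(24)]
    return get_tsu([12], [3], ts_list, topic_xapian_id)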
def cal_topic_quotasystem_count_by_date(topic, start, end):
    # determine the time range of the weibo query
    start_date = ts2datetime(start)
    end_date = ts2datetime(end)  # an end timestamp of 2014-09-02 00:00:00 still counts as part of 09-01
    print 'start, end:', start_date, end_date
    windowsize = (end - start) / Day
    print 'windowsize:', windowsize
    datestr_list = []
    for i in range(windowsize):
        time = start + i * Day
        time_date = ts2datetime(time)
        datestr_list.append(time_date.replace('-', ''))
    print 'datestr_list:', datestr_list
    topic_xapian_id = weibo_topic2xapian(topic, start, end)
    print 'topic_xapian_id:', topic_xapian_id
    xapian_search_weibo = getXapianWeiboByTopic(topic_xapian_id)
    '''
    xapian_search_weibo = getXapianWeiboByDuration(datestr_list)  # query by time range
    xapian_search_topic = getXapianWeiboByTopic(topic)  # query the index built directly for the topic
    '''
    if xapian_search_weibo:
        print '******start_compute'
        quota_attention(topic, xapian_search_weibo, start_ts=start, end_ts=end)
        quota_duration(topic, start_ts=start, end_ts=end)
        print 'save duration success'
        quota_sensitivity(topic, start_ts=start, end_ts=end)
        print 'save sensitivity success'
        quota_importance(topic, start_ts=start, end_ts=end)
        print 'save importance success'
        quota_sentiment(topic, xapian_search_weibo, start_ts=start, end_ts=end)
        print 'save sentiment success'
        quota_coverage(topic, xapian_search_weibo, start_ts=start, end_ts=end)  # coverage computation
        print 'save coverage success'
        quota_person_sensitivity(topic, xapian_search_weibo, start_ts=start, end_ts=end)  # participation of sensitive persons
        print 'save person_sensitivity success'
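# Hypothetical driver (illustration only): run the quota computation for one topic
# over a one-week window; Day is assumed to be the 86400-second constant used above.
def _example_quota_run(topic):
    start = datetime2ts('2015-03-02')
    end = start + 7 * Day
    cal_topic_quotasystem_count_by_date(topic, start, end)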
        mset = xapian_search_weibo.search(query=query_dict, sort_by=[sort_field],
                                          max_offset=w_limit, mset_direct=True)
        kcount = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=k_limit)
        top_ws = top_weibos(results, top=w_limit)
        mtype_count[v] = [end_ts, count]
        mtype_kcount[v] = [end_ts, kcount]
        mtype_weibo[v] = [end_ts, top_ws]
    save_pc_results(topic, mtype_count, during)
    save_kc_results(topic, mtype_kcount, during, k_limit)
    save_ws_results(topic, mtype_weibo, during, w_limit)


if __name__ == '__main__':
    topic = sys.argv[1]  # e.g. u'香港自由行' u'张灵甫遗骨疑似被埋羊圈' u'高校思想宣传' u'高校宣传思想工作' u'外滩踩踏' 'APEC' u'全军政治工作会议'
    start_date = sys.argv[2]  # e.g. '2015-02-23'
    end_date = sys.argv[3]  # e.g. '2015-03-02'
    topic = topic.decode('utf-8')
    topic_id = getTopicByName(topic)['_id']
    start_ts = datetime2ts(start_date)
    end_ts = datetime2ts(end_date)
    duration = Fifteenminutes
    xapian_search_weibo = getXapianWeiboByTopic(topic_id)
    print 'topic: ', topic.encode('utf8'), 'from %s to %s' % (start_ts, end_ts)
    propagateCronTopic(topic, xapian_search_weibo, start_ts, end_ts, during=duration)
def make_network(topic, date, window_size, topic_xapian_id, max_size=100000, attribute_add=False):
    topics = topic.strip().split(',')
    end_time = int(datetime2ts(date))
    start_time = int(end_time - window2time(window_size))
    print 'start, end:', start_time, end_time
    #topic_id = '545f4c22cf198b18c57b8014'
    topic_id = topic_xapian_id
    statuses_search = getXapianWeiboByTopic(topic_id)
    '''
    count, test_results = statuses_search.search(query={'timestamp':{'$gt': start_time, '$lt': end_time}})
    for i in test_results():
        print i
        break
    '''
    g = nx.DiGraph()      # directed graph of the source repost network
    gg = nx.Graph()       # undirected graph used for computing the quota
    ds_dg = nx.DiGraph()  # direct_superior_network: directed repost network of direct superiors
    ds_udg = nx.Graph()   # undirected repost network of direct superiors
    query_dict = {'timestamp': {'$gt': start_time, '$lt': end_time}}  # need repost index
    '''
    for ctopic in topics:
        query_dict['topics'].append(ctopic)
    '''
    print 'query_dict:', query_dict
    count, get_statuses_results = statuses_search.search(
        query=query_dict,
        fields=['_id', 'user', 'timestamp', 'retweeted_mid', 'retweeted_uid', 'text',
                'reposts_count', 'comments_count', 'attitude_count'],
        max_offset=max_size)
    print 'count_before_nad:', count
    results_list = []
    # filter out advertisement weibos based on the weibo text
    if count:
        for weibo in get_statuses_results():
            results_list.append([weibo['_id'], weibo['text']])
        scount, data_wid = ad_classifier(results_list)
    else:
        data_wid = []
        scount = 0
    print 'count_after_nad:', scount
    new_attribute_dict = {}     # text, reposts_count, comments_count, attitude_count for nodes of the star-shaped source repost network
    ds_new_attribute_dict = {}  # the same attributes for nodes of the direct-superior repost network
    new_query_dict = {'$or': []}     # used to look up the weibo content of each retweeted_mid
    ds_new_query_dict = {'$or': []}  # used to look up the retweeted_mid weibo content when the direct superior user is the retweeted_uid
    map_dict = {}     # {retweeted_mid: [retweeted_uid, user, timestamp], ...} maps each _id/timestamp to its retweeted_mid
    ds_map_dict = {}  # {retweeted_mid: [retweeted_uid, user, timestamp]} same mapping for the case where the direct superior is also the source superior
    get_statuses_results = [r for r in get_statuses_results() if r['retweeted_uid'] != 0]
    set_repost_name = set()
    for status in get_statuses_results:
        if str(status['_id']) in data_wid:
            # when the weibo is not spam, add it to new_attribute_dict, i.e. for [a b] add node a's weibo info
            nad_uid = status['user']
            nad_id = status['_id']
            r_uid = status['retweeted_uid']
            r_mid = status['retweeted_mid']
            if attribute_add == True:
                text_add = status['text']
                reposts_count_add = status['reposts_count']
                comment_count_add = status['comments_count']
                attitude_count_add = status['attitude_count']
                timestamp_add = status['timestamp']
                try:
                    new_attribute_dict[nad_uid].append([text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, r_uid])
                    ds_new_attribute_dict[nad_uid].append([text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, r_uid])
                except:
                    new_attribute_dict[nad_uid] = [[text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, r_uid]]
                    ds_new_attribute_dict[nad_uid] = [[text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, r_uid]]
            #print 'len(new_attribute_dict):', len(new_attribute_dict)
            # unlike an original weibo, a repost needs its direct reposting superior, e.g. [a b c] -> b
            if status['retweeted_uid'] and status['retweeted_uid'] != 0:
                print 'before get_superior_userid'
                direct_superior_userid = get_superior_userid(status)  # get the direct reposting superior (one hop only)
                '''
                repost_name = get_superior_userid(status)  # test
                set_repost_name.add(repost_name)  # test
                '''
                print 'user_id', direct_superior_userid
                if not direct_superior_userid:
                    # when the direct superior's userid cannot be resolved, treat the source weibo user as the direct superior
                    direct_superior_userid = r_uid
                    ds_dg.add_edge(nad_uid, direct_superior_userid)
                    ds_udg.add_edge(nad_uid, direct_superior_userid)
                    ds_new_query_dict['$or'].append({'_id': r_mid})
                    ds_map_dict[r_mid] = [r_uid, nad_uid, status['timestamp']]
                    # when the direct superior is the source superior, query xapian for weibo_text, timestamp, comments_count, reposts_count, attitude_count
                else:
                    # a direct reposting superior exists
                    ds_dg.add_edge(nad_uid, direct_superior_userid)
                    ds_udg.add_edge(nad_uid, direct_superior_userid)
                    if attribute_add == 'True':  # string comparison kept from the original; the outer check above compares against the boolean True
                        weibo_text = status['text']  # text of the direct superior weibo
                        weibo_test1 = weibo_text.split('//@')  # the '//@' marker is guaranteed here because the direct superior was resolved from a nickname
                        weibo_test2 = weibo_test1[1]
                        m_index = weibo_test2.find(':')
                        direct_superior_weibo = weibo_test2[m_index + 1:]
                        m_all_index = weibo_text.find(':')
                        direct_superior_weibos = weibo_text[m_all_index + 1:]
                        # use the text and r_uid to look up the timestamp
                        # note: the original passes five arguments here, while get_ds_info above also expects a DEFAULT_INTERVAL argument
                        direct_superior_info = get_ds_info(direct_superior_weibos, direct_superior_userid, topic, timestamp_add, topic_xapian_id)  # timestamp_add is the timestamp of the final repost
                        # query the topic xapian with the direct superior's text and user id to get that weibo's timestamp and counts;
                        # if the timestamp is not in xapian, estimate it from the repost chain as
                        # (source user ts - key user ts) / (number of users in the chain - 1);
                        # the source weibo itself still has to be looked up by mid and may also be missing from xapian.
                        # from uid and text we obtain [timestamp, comments_count, attitude_count, reposts_count, r_uid]
                        timestamp = direct_superior_info['timestamp']
                        comment_count = direct_superior_info['comments_count']
                        attitude_count = direct_superior_info['attitude_count']
                        reposts_count = direct_superior_info['reposts_count']
                        retweeted_uid = direct_superior_info['retweeted_uid']
                        try:
                            ds_new_attribute_dict[direct_superior_userid].append([direct_superior_weibo, reposts_count, comment_count, attitude_count, timestamp, retweeted_uid])
                        except:
                            ds_new_attribute_dict[direct_superior_userid] = [[direct_superior_weibo, reposts_count, comment_count, attitude_count, timestamp, retweeted_uid]]
                print 'after get_superior_userid'
            try:
                # build the source repost network
                if status['retweeted_uid'] and status['retweeted_uid'] != 0:
                    repost_uid = status['user']
                    source_uid = status['retweeted_uid']
                    if is_in_trash_list(repost_uid) or is_in_trash_list(source_uid):
                        continue
                    g.add_edge(repost_uid, source_uid)  # use every topic-related uid as a node and link them along the direction of information flow
                    gg.add_edge(repost_uid, source_uid)
                    new_query_dict['$or'].append({'_id': r_mid})  # so that the content of the reposted weibo can be queried
                    map_dict[r_mid] = [r_uid, nad_uid, status['timestamp']]
            except (TypeError, KeyError):
                continue
    print 'step_1:g', len(g)
    print 'step_1:ds_dg', len(ds_dg)
    # the timestamp looked up in xapian by mid may be missing; if so, fix it up with check_attribute
    if attribute_add == True:
        # fetch the text etc. of the retweeted_uid users and store them in new_attribute_dict
        ruid_count, r_results = statuses_search.search(
            query=new_query_dict,
            fields=['_id', 'user', 'timestamp', 'retweeted_mid', 'retweeted_uid', 'text',
                    'reposts_count', 'comments_count', 'attitude_count'])
        for rresult in r_results():
            text = rresult['text']  # strip the '//@..:' part so that only the author's own text remains
            text_spl = text.split('//@')
            try:
                text_add = text_spl[0]
            except:
                text_add = text
            timestamp_add = rresult['timestamp']
            reposts_count_add = rresult['reposts_count']
            comment_count_add = rresult['comments_count']
            attitude_count_add = rresult['attitude_count']
            ruid_add = rresult['retweeted_uid']
            try:
                new_attribute_dict[rresult['user']].append([text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, ruid_add])
            except:
                new_attribute_dict[rresult['user']] = [[text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, ruid_add]]
        #print 'map_dict:', map_dict
        new_attribute_dict = check_attribute(new_attribute_dict, new_query_dict, map_dict)  # fill in the r_mids that the query did not find
        #print 'query_dict:', ds_new_query_dict
        print 'len(ds_new_attribute_dict):', len(ds_new_attribute_dict)
        if ds_new_query_dict != {'$or': []}:  # the original compared query_dict here, which can never equal {'$or': []}; ds_new_query_dict is what is meant
            ds_ruid_count, ds_r_results = statuses_search.search(
                query=ds_new_query_dict,
                fields=['_id', 'user', 'timestamp', 'retweeted_mid', 'retweeted_uid', 'text',
                        'reposts_count', 'comments_count', 'attitude_count'])
            for ds_rresult in ds_r_results():
                uid = ds_rresult['user']
                timestamp_add = ds_rresult['timestamp']
                text = ds_rresult['text']  # strip the '//@..:' part so that only the author's own text remains
                text_spl = text.split('//@')
                try:
                    text_add = text_spl[0]
                except:
                    text_add = text
                reposts_count_add = ds_rresult['reposts_count']
                comment_count_add = ds_rresult['comments_count']
                attitude_count_add = ds_rresult['attitude_count']
                ruid_add = ds_rresult['retweeted_uid']  # the original read rresult here, a leftover from the loop above
                try:
                    ds_new_attribute_dict[uid].append([text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, ruid_add])
                except:
                    ds_new_attribute_dict[uid] = [[text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, ruid_add]]
            ds_new_attribute_dict = check_attribute(ds_new_attribute_dict, ds_new_query_dict, ds_map_dict)
    #print 'new_attribute_dict:', new_attribute_dict
    print 'len(g):', len(g)
    print 'len(ds_dg):', len(ds_dg)
    return g, gg, new_attribute_dict, ds_dg, ds_udg, ds_new_attribute_dict
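# Usage sketch (illustration only): make_network returns the directed source repost
# network g, its undirected counterpart gg, the per-user attribute dict, and the
# same three structures for the direct-superior network. The date and window size
# below are placeholders.
def _example_make_network(topic, topic_xapian_id):
    g, gg, attrs, ds_dg, ds_udg, ds_attrs = make_network(
        topic, '2015-03-09', 7, topic_xapian_id, attribute_add=True)
    print 'source network nodes:', len(g), 'direct-superior network nodes:', len(ds_dg)
    return g, ds_dg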
def get_pushers(topic, new_peaks, new_bottom, ts_list, topic_xapian_id):
    #unit = 900
    #p_during = Hour
    p_ts_list = []
    results = []
    end_ts = ts_list[new_peaks[0]]
    begin_ts = ts_list[new_bottom[0]]
    print 'pusher_start_ts:', ts2date(begin_ts)
    print 'pusher_end_ts:', ts2date(end_ts)
    if begin_ts > end_ts:
        begin_ts = ts_list[0]
    interval = (end_ts - begin_ts) / p_during
    for i in range(interval, 0, -1):
        begin_ts = end_ts - p_during * i
        over_ts = begin_ts + p_during
        p_ts_list.append(over_ts)
        items = db.session.query(PropagateCount).filter(PropagateCount.topic==topic ,\
                                                        PropagateCount.end<=over_ts ,\
                                                        PropagateCount.end>begin_ts ,\
                                                        PropagateCount.range==unit).all()
        if items:
            result = Merge_propagate(items)
        else:
            result = 0
        results.append(float(result))
    #print 'pusher_line:', results
    max_k_timestamp = get_max_k_timestamp(results, p_ts_list)  # find the point with the fastest growth
    #save max_k_timestamp
    # save_mak_k(max_k_timestamp)
    end = max_k_timestamp
    start = max_k_timestamp - p_during
    xapian_search_weibo = getXapianWeiboByTopic(topic_xapian_id)
    query_dict = {'timestamp': {'$gt': end, '$lt': end + 3600}}
    '''
    count, results = xapian_search_weibo.search(query=query_dict, fields=['_id', 'user', 'retweeted_uid', 'retweeted_mid', 'timestamp'])
    ruid_count = {}
    ruid_mid = {}
    for result in results():
        r_uid = result['retweeted_uid']
        if (r_uid == 0) or (not r_uid):
            continue
        ruid_mid[r_uid] = result['retweeted_mid']
        try:
            ruid_count[r_uid] += 1
        except KeyError:
            ruid_count[r_uid] = 1
    sorted_pushers = sorted(ruid_count.items(), key=lambda d:d[1], reverse=True)
    print 'top_trend_pusher_uid:', sorted_pushers
    pusher_list = []
    for pusher in sorted_pushers:
        uid = pusher[0]
        mid = ruid_mid[uid]
        value = pusher[1]
    '''
    # The block above found the source users of all reposts within the steepest interval,
    # but their related information could not be retrieved, so the approach below is used instead:
    # find the users with the highest repost counts among all weibos in that interval.
    count, results = xapian_search_weibo.search(query=query_dict,
                                                sort_by=['reposts_count'],
                                                fields=['_id', 'user', 'reposts_count'])
    print 'pusher_search_count:', count
    print 'pusher_query_dict:', query_dict
    pusher_list = []
    count = 0
    for result in results():
        count += 1
        if count > 100:
            break
        wid = result['_id']
        uid = result['user']
        value = result['reposts_count']
        pusher_list.append((uid, wid, value))
    # sort by reposts_count
    # sort_by_rc(pusher_list)
    return pusher_list
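# get_max_k_timestamp is called above but not defined in this file; the sketch below
# is an assumption about its intended behaviour: return the end timestamp of the
# sub-window whose propagation count grows fastest (steepest slope between
# consecutive windows). It is named _sketch_* to make clear it is not the original.
def _sketch_get_max_k_timestamp(results, p_ts_list):
    max_k = None
    max_ts = p_ts_list[0] if p_ts_list else None
    for i in range(1, len(results)):
        k = results[i] - results[i - 1]  # growth between consecutive windows
        if max_k is None or k > max_k:
            max_k = k
            max_ts = p_ts_list[i]
    return max_ts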
def get_first_node(topic, start_date, date, windowsize, topic_xapian_id):
    '''
    Use the timestamps to pick the top 20 users -- there may well be more than 20 weibos.
    Get the user info for each weibo -- the same user may appear more than once; keep only the earliest occurrence.
    Save the result.
    '''
    print 'first_user_topic_id:', topic_xapian_id
    if topic and topic != '':
        datestr = start_date.replace('-', '')
        xapian_search_weibo = getXapianWeiboByTopic(topic_id=topic_xapian_id)
        begin_ts = datetime2ts(start_date)
        end_ts = datetime2ts(date)
        topics = topic.strip().split(',')
        query_dict = {
            'timestamp': {'$gte': begin_ts, '$lte': end_ts},
            '$or': [{'message_type': 1}, {'message_type': 3}]
        }
        #query_dict = {'$or':[{'message_type':1}, {'message_type':3}]}
        print 'first_user_query:', query_dict  # only original weibos and reposts are considered here
        '''
        for c_topic in topics:
            query_dict['$and'].append({'topics': c_topic})
        '''
        time_top_nodes = xapian_search_weibo.search(query=query_dict, sort_by=['timestamp'], fields=fields_list)
        user_list = []
        if not time_top_nodes:
            print 'search error'
        else:
            #print 'time_top_nodes:', time_top_nodes
            s = 0
            '''
            domain_count_list = {'folk':0, 'media':0, 'opinion_leader':0, 'oversea':0, 'other':0}
            domain_user_list = {'folk':[], 'media':[], 'opinion_leader':[], 'oversea':[], 'other':[]}
            '''
            domain_count_list, domain_user_list = init_domain_list()
            print 'start_node:'
            for node in time_top_nodes[1]():
                #print 'node:', node
                uid = node['user']
                user_domain = uid2domain(uid)
                timestamp = node['timestamp']
                user_info = get_user_info(uid)  # user info of the top_time weibo's author
                if s < first_user_count:
                    if user_info and (not (uid in user_list)):
                        s += 1
                        weibo_info = node
                        user_list.append(uid)
                        save_first_nodes(topic, date, windowsize, uid, timestamp, user_info, weibo_info, user_domain)
                #if domain_count_list == {'folk':first_user_count, 'media':first_user_count, 'opinion_leader':first_user_count, 'oversea':first_user_count, 'other':first_user_count}:
                #    break
                stop_s = 0
                for domain in domain_list:
                    if domain_count_list[domain] == first_user_count:
                        stop_s += 1
                if stop_s == len(domain_list):
                    break
                for domain in domain_list:
                    if domain_count_list[domain] >= first_user_count:
                        continue
                    elif user_domain == domain:
                        if user_info and (not (uid in domain_user_list[domain])):
                            domain_user_list[domain].append(uid)
                            domain_count_list[domain] += 1
                            rank = domain_count_list[domain]
                            save_domain_nodes(topic, date, windowsize, uid, timestamp, user_info, weibo_info, user_domain, rank)
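# init_domain_list is called above but not defined in this file; the sketch below is
# an assumption based on the commented-out literals in get_first_node: one counter
# dict and one uid-list dict per domain. domain_list is assumed to be the
# module-level list of domain names ('folk', 'media', ...) iterated above.
def _sketch_init_domain_list():
    domain_count_list = dict((domain, 0) for domain in domain_list)
    domain_user_list = dict((domain, []) for domain in domain_list)
    return domain_count_list, domain_user_list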
from case.identify import utils as identifyModule
import search as searchModule
from case.time_utils import ts2datetime, ts2date
from xapian_case.xapian_backend import XapianSearch
from xapian_case.utils import cut, load_scws
from case.dynamic_xapian_weibo import getXapianWeiboByTopic
from case.global_config import XAPIAN_USER_DATA_PATH
from case.Database import Event, EventManager
from case.topic_manage import topics_name_start_end
from flask import Blueprint, url_for, render_template, request, abort, flash, session, redirect, make_response

scws = load_scws()

mod = Blueprint('case', __name__, url_prefix='/index')

xapian_search_weibo = getXapianWeiboByTopic()

em = EventManager()


def acquire_user_by_id(uid):
    user_search = XapianSearch(path=XAPIAN_USER_DATA_PATH,
                               name='master_timeline_user',
                               schema_version=1)
    result = user_search.search_by_id(int(uid),
                                      fields=['name', 'location', 'followers_count',
                                              'friends_count', 'profile_image_url'])
    user = {}
        location_dict['mid'] = r['_id']
        location_dict['topic'] = topic
        location_dict['ts'] = r['timestamp']
        location_dict['origin_location'] = origin_location.split('\t')[1]
        location_dict['repost_location'] = None
        return location_dict
    return None


if __name__ == '__main__':
    START_TS = datetime2ts('2015-03-02')
    END_TS = datetime2ts('2015-03-15')
    topic = u'两会2015'
    topic_id = getTopicByName(topic)['_id']
    print 'topic: ', topic.encode('utf8')
    print topic_id, START_TS, END_TS
    xapian_search = getXapianWeiboByTopic(topic_id)
    repost_search(topic, START_TS, END_TS)
    """
    item_exist = db.session.query(CityRepost).filter(CityRepost.topic == topic).all()
    if item_exist:
        for item in item_exist:
            db.session.delete(item)
        db.session.commit()
        print 'commited'
    """