def make_network(topic, date, window_size, max_size=100000): end_time = datetime2ts(date) start_time = end_time - window2time(window_size) statuses_search = getXapianweiboByTs(start_time, end_time) g = nx.DiGraph() #need repost index query_dict = {'text': topic, 'timestamp': {'$gt': start_time, '$lt': end_time}} count, get_statuses_results = statuses_search.search(query=query_dict, field=['user', 'retweeted_uid'], max_offset=max_size) print 'topic statuses count %s' % count for status in get_statuses_results(): try: if status['retweeted_uid'] and status['retweeted_uid'] != 0: repost_uid = status['user'] source_uid = status['retweeted_uid'] if is_in_trash_list(repost_uid) or is_in_trash_list(source_uid): continue g.add_edge(repost_uid, source_uid) except (TypeError, KeyError): continue return g
def make_network(topic, date, window_size, max_size=100000, ts=False): end_time = datetime2ts(date) start_time = end_time - window2time(window_size) g = nx.DiGraph() #need repost index topic = cut(s, topic.encode('utf-8')) statuses_search = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo', schema_version=2) query_dict = {'text': topic, 'timestamp': {'$gt': start_time, '$lt': end_time}} if ts: count, get_statuses_results = statuses_search.search(query=query_dict, field=['text', 'user', 'timestamp', 'retweeted_status'], max_offset=max_size) else: count, get_statuses_results = statuses_search.search(query=query_dict, field=['text', 'user', 'retweeted_status'], max_offset=max_size) print 'topic statuses count %s' % count if ts: uid_ts = {} for status in get_statuses_results(): try: if status['retweeted_status']: repost_uid = status['user'] rt_mid = status['retweeted_status'] repost_ts = int(status['timestamp']) source_status = acquire_status_by_id(rt_mid) source_uid = source_status['user'] source_ts = int(source_status['timestamp']) if is_in_trash_list(repost_uid) or is_in_trash_list(source_uid): continue if repost_uid not in uid_ts: uid_ts[repost_uid] = repost_ts else: if uid_ts[repost_uid] > repost_ts: uid_ts[repost_uid] = repost_ts if source_uid not in uid_ts: uid_ts[source_uid] = source_ts else: if uid_ts[source_uid] > source_ts: uid_ts[source_uid] = source_ts g.add_edge(repost_uid, source_uid) except (TypeError, KeyError): continue return uid_ts, g else: for status in get_statuses_results(): try: if status['retweeted_status']: repost_uid = status['user'] rt_mid = status['retweeted_status'] source_uid = acquire_status_by_id(rt_mid)['user'] if is_in_trash_list(repost_uid) or is_in_trash_list(source_uid): continue g.add_edge(repost_uid, source_uid) except (TypeError, KeyError): continue return g
def important_rank(top_n, date, window_size):
    """Rank users by accumulated daily 'important' score over the window.

    Sums each uid's score across the ``window_size`` daily leveldb buckets
    ending at ``date`` and returns up to ``top_n`` uids, highest score first,
    with trash-listed users filtered out. For pathologically large user sets
    the external ``user_rank`` helper is used instead of an in-memory sort.

    Fixes vs. original: the ``window_size == 1`` special case duplicated the
    general accumulation loop (``range(1)`` covers it), and a per-record
    ``print count`` debug leftover spammed stdout on every row.
    """
    date_time = datetime2ts(date)
    uid_important = {}
    # One leveldb bucket per day in the window (day 0 is `date` itself).
    for day in range(window_size):
        db_name = get_leveldb('important', date_time - day * 24 * 60 * 60)
        daily_user_important_bucket = leveldb.LevelDB(
            os.path.join(LEVELDBPATH, db_name),
            block_cache_size=8 * (2 << 25),
            write_buffer_size=8 * (2 << 25))
        for uid, important in daily_user_important_bucket.RangeIter():
            uid = int(uid)
            uid_important[uid] = uid_important.get(uid, 0) + float(important)

    # Guard against an in-memory sort of an absurdly large user set;
    # threshold kept from the original implementation.
    if len(uid_important) >= 100000000:
        return user_rank(uid_important, 'whole_active', top_n, date, window_size)

    sorted_uid_important = sorted(uid_important.iteritems(),
                                  key=operator.itemgetter(1), reverse=True)
    sorted_uids = []
    for uid, value in sorted_uid_important:
        if is_in_trash_list(uid):
            continue
        if len(sorted_uids) >= top_n:
            break
        sorted_uids.append(uid)
    return sorted_uids
def make_network(topic, date, window_size, topic_xapian_id, max_size=100000, attribute_add=False): topics = topic.strip().split(',') end_time = int(datetime2ts(date)) start_time = int(end_time - window2time(window_size)) print 'start, end:', start_time, end_time #topic_id='545f4c22cf198b18c57b8014' topic_id = topic_xapian_id statuses_search = getXapianWeiboByTopic(topic_id) ''' count, test_results = statuses_search.search(query={'timestamp':{'$gt': start_time, '$lt': end_time}}) for i in test_results(): print i break ''' g = nx.DiGraph() # 初始化一个有向图 gg = nx.Graph() # 为计算quota初始化一个无向图 ds_dg = nx.DiGraph() # direct_superior_network 有向直接上级转发网络 ds_udg = nx.Graph() # undirect_superior_network 无向直接上级转发网络 query_dict = {'timestamp': {'$gt': start_time, '$lt': end_time}} #need repost index ''' for ctopic in topics: query_dict['topics'].append(ctopic) ''' print 'query_dict:', query_dict count, get_statuses_results = statuses_search.search( query=query_dict, fields=[ '_id', 'user', 'timestamp', 'retweeted_mid', 'retweeted_uid', 'text', 'reposts_count', 'comments_count', 'attitude_count' ], max_offset=max_size) print 'count_before_nad:', count results_list = [] ''' 根据微博文本进行广告微博筛选 ''' if count: for weibo in get_statuses_results(): results_list.append([weibo['_id'], weibo['text']]) scount, data_wid = ad_classifier(results_list) else: data_wid = [] scount = 0 print 'count_after_nad:', scount new_attribute_dict = { } # 星形源头转发网络需要添加的节点对应的text、reposts_count、comment_count、 attitude_count ds_new_attribute_dict = { } # 直接上级转发网络需要添加的属性:即节点对应的text、 reposts_count、 comment_count 、attitude_count new_query_dict = {'$or': []} # 用于查询retweeted_mid对应的weibo内容 ds_new_query_dict = { '$or': [] } # 用于查询direct_superior_user为retweeted_uid对应的retweeted_mid的微博内容 map_dict = { } # map_dict = {retweeted_mid:[retweeted_uid, user, timestamp],...} 保存_id timestamp与其对应的retweeted_mid之间的对应关系 ds_map_dict = { } # ds_dict = {retweeted_mid:[retweeted_uid, user, timestamp]} 直接上级转发网络中直接上级就是源头上级时,对应关系 get_statuses_results 
= [ r for r in get_statuses_results() if r['retweeted_uid'] != 0 ] set_repost_name = set() for status in get_statuses_results: if str(status['_id']) in data_wid: ''' 当微博信息非垃圾时,进行new_attribute_dict的添加----即[a b]->添加a节点的微博信息 ''' nad_uid = status['user'] nad_id = status['_id'] r_uid = status['retweeted_uid'] r_mid = status['retweeted_mid'] if attribute_add == True: text_add = status['text'] reposts_count_add = status['reposts_count'] comment_count_add = status['comments_count'] attitude_count_add = status['attitude_count'] timestamp_add = status['timestamp'] try: new_attribute_dict[nad_uid].append([ text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, r_uid ]) ds_new_attribute_dict[nad_uid].append([ text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, r_uid ]) except: new_attribute_dict[nad_uid] = [[ text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, r_uid ]] ds_new_attribute_dict[nad_uid] = [[ text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, r_uid ]] #print 'len(new_attribute_dict):', len(new_attribute_dict) ''' 区别于原创微博 当是转发微博时,获取直接转发上级----例子:[a b c]->b ''' if status['retweeted_uid'] and status['retweeted_uid'] != 0: print 'before get_superior_userid' direct_superior_userid = get_superior_userid( status) # 获取直接转发上级--只获取一跳 ''' repost_name = get_superior_userid(status) # test set_repost_name.add(repost_name) # test ''' print 'user_id', direct_superior_userid if not direct_superior_userid: #当直接转发上级的userid获取不到时,则认为直接转发上级就是源头转发微博 direct_superior_userid = r_uid ds_dg.add_edge(nad_uid, direct_superior_userid) ds_udg.add_edge(nad_uid, direct_superior_userid) ds_new_query_dict['$or'].append({'_id': r_mid}) ds_map_dict[r_mid] = [r_uid, nad_uid, status['timestamp']] # 当直接上级就是源头上级时,通过查询xapian获取weibo_text timestamp comment_counts repost_counts attitude_counts else: #存在直接转发上级 ds_dg.add_edge(nad_uid, direct_superior_userid) ds_udg.add_edge(nad_uid, 
direct_superior_userid) if attribute_add == 'True': weibo_text = status['text'] # 获取直接上级微博文本内容 weibo_test1 = weibo_text.split( '//@' ) # 此处不会出现文本中没有//@结构,因为这里的直接转发上级是通过昵称转化而得到的,所以一定有//@结构 weibo_test2 = weibo_test1[1] m_index = weibo_test2.find(':') direct_superior_weibo = weibo_test2[m_index + 1:] m_all_index = weibo_text.find(':') direct_superior_weibos = weibo_text[m_all_index + 1:] #需要根据文本内容和r_uid获取timestamp direct_superior_info = get_ds_info( direct_superior_weibos, direct_superior_userid, topic, timestamp_add, topic_xapian_id) # timestamp_add是最终转发微博额时间戳 # 通过直接上级微博文本内容和用户id,查询topic对应的Xapian获取该条微博的时间戳等信息 # 如果在xapian中查不到这条信息的timestamp,则根据其转发链条关系,以(源头用户ts-重点用户ts)/(链条中人数-1)----源头微博还是需要根据mid查询,还是有可能不在xapian里面 # 根据uid和text获取[timestamp, comment_counts, attitude_counts, reposts_counts, r_uid] timestamp = direct_superior_info['timestamp'] comment_count = direct_superior_info['comments_count'] attitude_count = direct_superior_info['attitude_count'] reposts_count = direct_superior_info['reposts_count'] retweeted_uid = direct_superior_info['retweeted_uid'] try: ds_new_attribute_dict[ direct_superior_userid].append([ direct_superior_weibo, reposts_count, comment_count, attitude_count, timestamp, retweeted_uid ]) except: ds_new_attribute_dict[direct_superior_userid] = [[ direct_superior_weibo, reposts_count, comment_count, attitude_count, timestamp, retweeted_uid ]] print 'after get_superior_userid' try: #源头转发网络构建 if status['retweeted_uid'] and status['retweeted_uid'] != 0: repost_uid = status['user'] source_uid = status['retweeted_uid'] if is_in_trash_list(repost_uid) or is_in_trash_list( source_uid): continue g.add_edge( repost_uid, source_uid) # 将所有topic相关的uid作为node,并将它们按照信息传递方向形成有向图 gg.add_edge(repost_uid, source_uid) new_query_dict['$or'].append({'_id': r_mid}) # 为了查询转发微博的内容 map_dict[r_mid] = [r_uid, nad_uid, status['timestamp']] except (TypeError, KeyError): continue print 'step_1:g', len(g) print 'step_1:ds_dg', len(ds_dg) # 
这里还需要处理,由于根据mid查询xapian获取微博的时间戳,但是有可能会没有获取到。若没有获取到,使用check_attribude进行处理 if attribute_add == True: # 将retweeted_uid用户的text等取出来,存在new_attribute_dict中 ruid_count, r_results = statuses_search.search( query=new_query_dict, fields=[ '_id', 'user', 'timestamp', 'retweeted_mid', 'retweeted_uid', 'text', 'reposts_count', 'comments_count', 'attitude_count' ]) for rresult in r_results(): text = rresult[ 'text'] # 这里的text需要再做一次处理----剔除掉’//@..:‘的内容,只获取作者自己的微博文本 text_spl = text.split('//@') try: text_add = text_spl[0] except: text_add = text timestamp_add = rresult['timestamp'] reposts_count_add = rresult['reposts_count'] comment_count_add = rresult['comments_count'] attitude_count_add = rresult['attitude_count'] ruid_add = rresult['retweeted_uid'] try: new_attribute_dict[rresult['user']].append([ text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, ruid_add ]) except: new_attribute_dict[rresult['user']] = [[ text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, ruid_add ]] #print 'map_dict:', map_dict new_attribute_dict = check_attribute( new_attribute_dict, new_query_dict, map_dict) # 对query_dict中没有查询到的r_mid,在new_attribute_dict中进行补全处理 #print 'quer_dict:', ds_new_query_dict print 'len(ds_new_attribute_dict):', len(ds_new_attribute_dict) if query_dict != {'$or': []}: ds_ruid_count, ds_r_results = statuses_search.search( query=ds_new_query_dict, fields=[ '_id', 'user', 'timestamp', 'retweeted_mid', 'retweeted_uid', 'text', 'reposts_count', 'comments_count', 'attitude_count' ]) for ds_rresult in ds_r_results(): uid = ds_rresult['user'] timestamp_add = ds_rresult['timestamp'] text = ds_rresult[ 'text'] # 这里的text需要再做一次处理----剔除掉’//@..:‘的内容,只获取作者自己的微博文本 text_spl = text.split('//@') try: text_add = text_spl[0] except: text_add = text reposts_count_add = ds_rresult['reposts_count'] comment_count_add = ds_rresult['comments_count'] attitude_count_add = ds_rresult['attitude_count'] ruid_add = rresult['retweeted_uid'] try: 
ds_new_attribute_dict[uid].append([ text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, ruid_add ]) except: ds_new_attribute_dict[uid] = [[ text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, ruid_add ]] ds_new_attribute_dict = check_attribute(ds_new_attribute_dict, ds_new_query_dict, ds_map_dict) #print 'new_attribute_dict:', new_attribute_dict print 'len(g):', len(g) print 'len(ds_dg):', len(ds_dg) return g, gg, new_attribute_dict, ds_dg, ds_udg, ds_new_attribute_dict '''
def make_network(topic, date, window_size, max_size=100000, attribute_add=False): topics = topic.strip().split(',') end_time = int(datetime2ts(date)) start_time = int(end_time - window2time(window_size)) print 'start, end:', start_time, end_time query_body = { 'query': { 'bool': { 'should': [ #{'term': { #'message_type': 1 #} #}, { 'term': { 'message_type': 3 } } ], 'must': # {'term':{'name': topic}}, { 'range': { 'timestamp': { 'gte': start_time, 'lt': end_time } } } } }, 'size': max_size, # 返回条数限制 待删 #'sort': {"timestamp": {"order": "asc"}} } es_search_weibos = weibo_es.search(index=topic, doc_type=weibo_index_type, body=query_body)['hits']['hits'] get_statuses_results = es_search_weibos g = nx.DiGraph() # 初始化一个有向图 gg = nx.Graph() # 为计算quota初始化一个无向图 results_list = [] ''' 根据微博文本进行广告微博筛选 ''' if len(es_search_weibos) > 1: for weibo in get_statuses_results: results_list.append( [weibo['_source']['mid'], weibo['_source']['text']]) scount, data_wid = ad_classifier(results_list) #print data_wid else: data_wid = [] scount = 0 print 'count_after_nad:', scount new_attribute_dict = { } # 星形源头转发网络需要添加的节点对应的text、reposts_count、comment_count、 attitude_count map_dict = { } # map_dict = {retweeted_mid:[retweeted_uid, user, timestamp],...} 保存_id timestamp与其对应的retweeted_mid之间的对应关系 ds_map_dict = { } # ds_dict = {retweeted_mid:[retweeted_uid, user, timestamp]} 直接上级转发网络中直接上级就是源头上级时,对应关系 get_statuses_results = [ r for r in get_statuses_results if r['_source']['uid'] != 0 ] #print get_statuses_results print len(get_statuses_results) set_repost_name = set() for status in get_statuses_results: if str(status['_source']['mid']) in data_wid: #print status['_source'] ''' 当微博信息非垃圾时,进行new_attribute_dict的添加----即[a b]->添加a节点的微博信息 ''' nad_uid = status['_source']['uid'] nad_id = status['_source']['mid'] #r_uid = status['_source']['root_uid'] #r_mid = status['_source']['root_mid'] try: r_uid = status['_source']['root_uid'] r_mid = status['_source']['root_mid'] except: r_uid = 0 r_mid = 0 #print 
'hahahahahahahahaha' if attribute_add == True: text_add = status['_source']['text'] try: reposts_count_add = status['_source']['retweeted'] except: reposts_count_add = 0 try: comment_count_add = status['_source']['comment'] except: comment_count_add = 0 #attitude_count_add = status['_source']['attitude_count'] timestamp_add = status['_source']['timestamp'] try: new_attribute_dict[nad_uid].append([ text_add, reposts_count_add, comment_count_add, timestamp_add, r_uid ]) #ds_new_attribute_dict[nad_uid].append([text_add, reposts_count_add, comment_count_add, timestamp_add, r_uid]) except: new_attribute_dict[nad_uid] = [[ text_add, reposts_count_add, comment_count_add, timestamp_add, r_uid ]] #ds_new_attribute_dict[nad_uid] = [[text_add, reposts_count_add, comment_count_add, timestamp_add, r_uid]] try: #源头转发网络构建 if status['_source'][ 'root_uid'] and status['_source']['root_uid'] != 0: repost_uid = status['_source']['uid'] source_uid = status['_source']['root_uid'] # print '405',repost_uid,source_uid if is_in_trash_list(repost_uid) or is_in_trash_list( source_uid): continue g.add_edge( repost_uid, source_uid) # 将所有topic相关的uid作为node,并将它们按照信息传递方向形成有向图 gg.add_edge(repost_uid, source_uid) #new_query_dict['$or'].append({'_id':r_mid}) # 为了查询转发微博的内容 map_dict[r_mid] = [r_uid, nad_uid, status['timestamp']] except (TypeError, KeyError): continue print g return g, gg, new_attribute_dict print 'step_1:g', len(g)
def make_network(topic, date, window_size, topic_xapian_id, max_size=100000, attribute_add = False):
    # NOTE(review): this definition appears to be a legacy duplicate of the
    # topic_xapian_id variant of make_network defined earlier in this file,
    # apparently retained inside a module-level triple-quoted string (i.e.
    # commented out). Code kept byte-for-byte; only annotations added.
    topics = topic.strip().split(',')
    end_time = int(datetime2ts(date))
    start_time = int(end_time - window2time(window_size))
    print 'start, end:', start_time, end_time
    #topic_id='545f4c22cf198b18c57b8014'
    topic_id = topic_xapian_id
    statuses_search = getXapianWeiboByTopic(topic_id)
    '''
    count, test_results = statuses_search.search(query={'timestamp':{'$gt': start_time, '$lt': end_time}})
    for i in test_results():
        print i
        break
    '''
    g = nx.DiGraph()  # directed star (source) repost network
    gg = nx.Graph()  # undirected graph, initialized for quota computation
    ds_dg = nx.DiGraph()  # direct_superior_network: directed direct-superior repost network
    ds_udg = nx.Graph()  # undirect_superior_network: undirected direct-superior repost network
    query_dict = {'timestamp':{'$gt': start_time, '$lt': end_time}} #need repost index
    '''
    for ctopic in topics:
        query_dict['topics'].append(ctopic)
    '''
    print 'query_dict:', query_dict
    count, get_statuses_results = statuses_search.search(query=query_dict, fields=['_id', 'user', 'timestamp', 'retweeted_mid','retweeted_uid', 'text', 'reposts_count', 'comments_count', 'attitude_count'], max_offset=max_size)
    print 'count_before_nad:', count
    results_list = []
    '''
    根据微博文本进行广告微博筛选
    '''
    if count:
        for weibo in get_statuses_results():
            results_list.append([weibo['_id'],weibo['text']])
        scount, data_wid = ad_classifier(results_list)
    else:
        data_wid = []
        scount = 0
    print 'count_after_nad:', scount
    new_attribute_dict = {}  # star network: per-node text/reposts_count/comment_count/attitude_count
    ds_new_attribute_dict = {}  # direct-superior network: same per-node attributes
    new_query_dict = { '$or':[] }  # query for the weibo content of each retweeted_mid
    ds_new_query_dict ={ '$or':[] }  # query for retweeted_mid content when the direct superior is the retweeted_uid
    map_dict = {}  # map_dict = {retweeted_mid:[retweeted_uid, user, timestamp],...}
    ds_map_dict = {}  # ds_dict = {retweeted_mid:[retweeted_uid, user, timestamp]} when direct superior == source author
    get_statuses_results = [r for r in get_statuses_results() if r['retweeted_uid'] != 0]
    set_repost_name = set()
    for status in get_statuses_results:
        if str(status['_id']) in data_wid:
            '''
            当微博信息非垃圾时,进行new_attribute_dict的添加----即[a b]->添加a节点的微博信息
            '''
            nad_uid = status['user']
            nad_id = status['_id']
            r_uid = status['retweeted_uid']
            r_mid = status['retweeted_mid']
            if attribute_add == True:
                text_add = status['text']
                reposts_count_add = status['reposts_count']
                comment_count_add = status['comments_count']
                attitude_count_add = status['attitude_count']
                timestamp_add = status['timestamp']
                try:
                    new_attribute_dict[nad_uid].append([text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, r_uid])
                    ds_new_attribute_dict[nad_uid].append([text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, r_uid])
                except:
                    new_attribute_dict[nad_uid] = [[text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, r_uid]]
                    ds_new_attribute_dict[nad_uid] = [[text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, r_uid]]
            #print 'len(new_attribute_dict):', len(new_attribute_dict)
            '''
            区别于原创微博 当是转发微博时,获取直接转发上级----例子:[a b c]->b
            '''
            if status['retweeted_uid'] and status['retweeted_uid']!=0:
                print 'before get_superior_userid'
                direct_superior_userid = get_superior_userid(status)  # resolve the direct repost superior -- one hop only
                '''
                repost_name = get_superior_userid(status) # test
                set_repost_name.add(repost_name) # test
                '''
                print 'user_id', direct_superior_userid
                if not direct_superior_userid:  # no resolvable superior: treat the source author as the direct superior
                    direct_superior_userid = r_uid
                    ds_dg.add_edge(nad_uid, direct_superior_userid)
                    ds_udg.add_edge(nad_uid, direct_superior_userid)
                    ds_new_query_dict['$or'].append({'_id':r_mid})
                    ds_map_dict[r_mid] = [r_uid, nad_uid, status['timestamp']]
                    # superior == source author: weibo_text/timestamp/counters fetched from xapian later
                else:  # a distinct direct superior exists
                    ds_dg.add_edge(nad_uid, direct_superior_userid)
                    ds_udg.add_edge(nad_uid, direct_superior_userid)
                    # NOTE(review): compares against the *string* 'True' while the
                    # parameter default is boolean False -- branch likely dead, and
                    # timestamp_add may be unbound here; confirm intent.
                    if attribute_add == 'True':
                        weibo_text = status['text']  # direct superior's weibo text
                        weibo_test1 = weibo_text.split('//@')  # '//@' always present: superior was resolved from a nickname in the text
                        weibo_test2 = weibo_test1[1]
                        m_index = weibo_test2.find(':')
                        direct_superior_weibo = weibo_test2[m_index+1:]
                        m_all_index = weibo_text.find(':')
                        direct_superior_weibos = weibo_text[m_all_index+1:]
                        # look up timestamp etc. by text content and uid
                        direct_superior_info = get_ds_info(direct_superior_weibos, direct_superior_userid, topic, timestamp_add, topic_xapian_id)  # timestamp_add is the final repost's timestamp
                        # queries the topic's xapian index for this weibo's timestamp etc.;
                        # if absent, interpolate along the repost chain:
                        # (source ts - focus ts) / (chain length - 1)
                        timestamp = direct_superior_info['timestamp']
                        comment_count = direct_superior_info['comments_count']
                        attitude_count = direct_superior_info['attitude_count']
                        reposts_count = direct_superior_info['reposts_count']
                        retweeted_uid = direct_superior_info['retweeted_uid']
                        try:
                            ds_new_attribute_dict[direct_superior_userid].append([direct_superior_weibo, reposts_count, comment_count, attitude_count, timestamp, retweeted_uid])
                        except:
                            ds_new_attribute_dict[direct_superior_userid] = [[direct_superior_weibo, reposts_count, comment_count, attitude_count, timestamp, retweeted_uid]]
                print 'after get_superior_userid'
            try:  # build the star (source) repost network
                if status['retweeted_uid'] and status['retweeted_uid'] != 0:
                    repost_uid = status['user']
                    source_uid = status['retweeted_uid']
                    if is_in_trash_list(repost_uid) or is_in_trash_list(source_uid):
                        continue
                    g.add_edge(repost_uid, source_uid)  # topic uids as nodes; edges follow information flow
                    gg.add_edge(repost_uid, source_uid)
                    new_query_dict['$or'].append({'_id':r_mid})  # to fetch the reposted weibo's content
                    map_dict[r_mid] = [r_uid, nad_uid, status['timestamp']]
            except (TypeError, KeyError):
                continue
    print 'step_1:g', len(g)
    print 'step_1:ds_dg', len(ds_dg)
    # timestamps fetched by mid from xapian may be missing; check_attribute fills the gaps
    if attribute_add == True:  # fetch retweeted_uid users' text etc. into new_attribute_dict
        ruid_count, r_results = statuses_search.search(query=new_query_dict, fields=['_id', 'user', 'timestamp', 'retweeted_mid','retweeted_uid', 'text', 'reposts_count', 'comments_count', 'attitude_count'])
        for rresult in r_results():
            text = rresult['text']  # strip the '//@..:' tail: keep only the author's own text
            text_spl = text.split('//@')
            try:
                text_add = text_spl[0]
            except:
                text_add = text
            timestamp_add = rresult['timestamp']
            reposts_count_add = rresult['reposts_count']
            comment_count_add = rresult['comments_count']
            attitude_count_add = rresult['attitude_count']
            ruid_add = rresult['retweeted_uid']
            try:
                new_attribute_dict[rresult['user']].append([text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, ruid_add])
            except:
                new_attribute_dict[rresult['user']] = [[text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, ruid_add]]
        #print 'map_dict:', map_dict
        new_attribute_dict = check_attribute(new_attribute_dict, new_query_dict, map_dict)  # backfill r_mids the query could not resolve
        #print 'quer_dict:', ds_new_query_dict
        print 'len(ds_new_attribute_dict):', len(ds_new_attribute_dict)
        # NOTE(review): guard tests query_dict (the timestamp query), which can
        # never equal {'$or': []}; ds_new_query_dict was probably intended.
        if query_dict!={'$or':[]}:
            ds_ruid_count, ds_r_results = statuses_search.search(query=ds_new_query_dict, fields=['_id', 'user', 'timestamp', 'retweeted_mid','retweeted_uid', 'text', 'reposts_count', 'comments_count', 'attitude_count'])
            for ds_rresult in ds_r_results():
                uid = ds_rresult['user']
                timestamp_add = ds_rresult['timestamp']
                text = ds_rresult['text']  # strip the '//@..:' tail as above
                text_spl = text.split('//@')
                try:
                    text_add = text_spl[0]
                except:
                    text_add = text
                reposts_count_add = ds_rresult['reposts_count']
                comment_count_add = ds_rresult['comments_count']
                attitude_count_add = ds_rresult['attitude_count']
                ruid_add = rresult['retweeted_uid']  # NOTE(review): stale `rresult` from the previous loop; ds_rresult was likely intended
                try:
                    ds_new_attribute_dict[uid].append([text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, ruid_add])
                except:
                    ds_new_attribute_dict[uid] = [[text_add, reposts_count_add, comment_count_add, attitude_count_add, timestamp_add, ruid_add]]
            ds_new_attribute_dict = check_attribute(ds_new_attribute_dict, ds_new_query_dict, ds_map_dict)
    #print 'new_attribute_dict:', new_attribute_dict
    print 'len(g):', len(g)
    print 'len(ds_dg):', len(ds_dg)
    return g , gg, new_attribute_dict, ds_dg, ds_udg, ds_new_attribute_dict
'''
def realtime_burst_user(top_n, current_time):
    """Detect bursty users so far today via a per-hour chi-square test.

    Pass 1 accumulates, per uid, the total activity across all finished
    hourly leveldb buckets of the current day. Pass 2 computes each uid's
    chi-square statistic per hour (A/B/C/D contingency table of this-uid vs.
    others, this-hour vs. other-hours) and sums it into a burst score.
    Returns rank results for up to ``top_n`` non-trash uids, highest first.

    Fix vs. original: a zero marginal (e.g. a single populated bucket, or an
    all-zero count) made the chi-square denominator zero and raised
    ZeroDivisionError; such degenerate cells are now skipped.
    """
    current_hour = datetime.fromtimestamp(current_time).hour

    def _hour_buckets():
        # Yield (group_size, bucket) for each finished hour with usable data.
        for h in range(current_hour):
            db_name = get_leveldb(current_time, h)
            bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, db_name),
                                     block_cache_size=8 * (2 << 25),
                                     write_buffer_size=8 * (2 << 25))
            try:
                size = bucket.Get('size')
            except KeyError:
                continue  # bucket never finalized: skip
            if not size:
                continue
            size = int(size)
            if size > 0:
                yield size, bucket

    # Pass 1: day-wide totals.
    total_size = 0
    total_uid_count = {}
    for group_size, bucket in _hour_buckets():
        total_size += group_size
        for uid, value in bucket.RangeIter():
            if uid == 'size':
                continue  # metadata key, not a uid
            uid = int(uid)
            total_uid_count[uid] = total_uid_count.get(uid, 0) + float(value)

    # Pass 2: per-hour chi-square contributions.
    uid_burst = {}
    for group_size, bucket in _hour_buckets():
        for uid, value in bucket.RangeIter():
            if uid == 'size':
                continue
            uid = int(uid)
            A = float(value)                             # this uid, this hour
            B = total_uid_count[uid] - A                 # this uid, other hours
            C = group_size - A                           # other uids, this hour
            D = total_size - total_uid_count[uid] - C    # other uids, other hours
            denom = (A + B) * (C + D) * (A + C) * (B + D)
            if denom == 0:
                # Degenerate margin: chi-square undefined, contributes nothing.
                continue
            uid_burst[uid] = uid_burst.get(uid, 0) + \
                (A + B + C + D) * ((A * D - B * C) ** 2) * 1.0 / denom

    sorted_uid_burst = sorted(uid_burst.iteritems(),
                              key=operator.itemgetter(1), reverse=True)
    sorted_uids = []
    for uid, value in sorted_uid_burst:
        if is_in_trash_list(uid):
            continue
        if len(sorted_uids) >= top_n:
            break
        sorted_uids.append(uid)
    return generate_rank_results(sorted_uids)