def read_trend_pusher(topic, date, windowsize, rank_method):
    """Return the stored TrendPusher rows of a topic window, ordered by rank_method.

    Each row is the list
    [rank, uid, uname, location, domain_name, timestamp, text,
     profile_image_url, friends_count, followers_count, statuses_count,
     created_at, geo, source, weibo_link, _id, reposts_count]
    where timestamp is the ts2date() value of the stored timestamp.

    rank_method:
        'reposts_count'  -- query order is kept, then deal_rank is applied
        'timestamp'      -- ascending on column 5, then deal_rank
        'friends_count'  -- descending on column 8, then deal_rank
        'statuses_count' -- descending on column 10, then deal_rank
        anything else    -- rows are returned in query order without deal_rank
    """
    unknown = u'未知'
    records = db.session.query(TrendPusher).filter(
        TrendPusher.topic == topic,
        TrendPusher.date == date,
        TrendPusher.windowsize == windowsize).all()

    rows = []
    for record in records:
        author_id = record.uid
        posted_at = ts2date(record.timestamp)

        user = json.loads(record.user_info)
        uname = user['name']
        location = user['location']
        avatar = user['profile_image_url']
        if avatar == unknown:
            avatar = 'no'
        # Unknown counters become -1 so the columns stay sortable.
        friends = user['friends_count']
        if friends == unknown:
            friends = -1
        followers = user['followers_count']
        statuses = user['statuses_count']
        if statuses == unknown:
            statuses = -1
        created_at = user['created_at']

        weibo = json.loads(record.weibo_info)
        text = weibo['text']
        geo = weibo['geo']
        source = weibo['source']
        weibo_id = weibo['_id']
        reposts = weibo['reposts_count']
        link = weiboinfo2url(author_id, weibo_id)

        domain_name = domain_dict[record.domain]

        rows.append([
            record.rank, author_id, uname, location, domain_name, posted_at,
            text, avatar, friends, followers, statuses, created_at, geo,
            source, link, weibo_id, reposts,
        ])

    if rank_method == 'reposts_count':
        return deal_rank(rows)

    # rank_method -> (row column, descending?)
    sort_spec = {
        'timestamp': (5, False),
        'friends_count': (8, True),
        'statuses_count': (10, True),
    }
    if rank_method not in sort_spec:
        return rows
    col, descending = sort_spec[rank_method]
    ordered = sorted(rows, key=lambda row: row[col], reverse=descending)
    return deal_rank(ordered)
def read_trend_maker(topic, date, windowsize, rank_method):
    """Return the stored TrendMaker rows of a topic window, ordered by rank_method.

    Each row is the list
    [rank, uid, uname, location, domain_name, timestamp, text,
     profile_image_url, friends_count, followers_count, statuses_count,
     created_at, geo, source, weibo_link, _id, reposts_count, value, key_item]
    where timestamp is the ts2date() value of the stored timestamp.

    rank_method:
        'content'        -- query order is kept, then deal_rank is applied
        'timestamp'      -- ascending on column 5, then deal_rank
        'friends_count'  -- descending on column 8, then deal_rank
        'statuses_count' -- descending on column 10, then deal_rank
        'reposts_count'  -- descending on column 16, then deal_rank
        anything else    -- rows are returned in query order without deal_rank
    """
    unknown = u'未知'
    records = db.session.query(TrendMaker).filter(
        TrendMaker.topic == topic,
        TrendMaker.date == date,
        TrendMaker.windowsize == windowsize).all()

    rows = []
    for record in records:
        author_id = record.uid
        posted_at = ts2date(record.timestamp)

        user = json.loads(record.user_info)
        uname = user['name']
        location = user['location']
        avatar = user['profile_image_url']
        if avatar == unknown:
            avatar = 'no'
        # Unknown counters become -1 so the columns stay sortable.
        friends = user['friends_count']
        if friends == unknown:
            friends = -1
        followers = user['followers_count']
        statuses = user['statuses_count']
        if statuses == unknown:
            statuses = -1
        created_at = user['created_at']

        weibo = json.loads(record.weibo_info)
        text = weibo['text']
        geo = weibo['geo']
        source = weibo['source']
        weibo_id = weibo['_id']
        reposts = weibo['reposts_count']
        link = weiboinfo2url(author_id, weibo_id)

        domain_name = domain_dict[record.domain]
        rank = record.rank
        value = record.value  # number of keyword hits
        key_item = json.loads(record.key_item)  # the keywords that were hit

        rows.append([
            rank, author_id, uname, location, domain_name, posted_at, text,
            avatar, friends, followers, statuses, created_at, geo, source,
            link, weibo_id, reposts, value, key_item,
        ])

    if rank_method == 'content':
        return deal_rank(rows)

    # rank_method -> (row column, descending?)
    sort_spec = {
        'timestamp': (5, False),
        'friends_count': (8, True),
        'statuses_count': (10, True),
        'reposts_count': (16, True),
    }
    if rank_method not in sort_spec:
        return rows
    col, descending = sort_spec[rank_method]
    ordered = sorted(rows, key=lambda row: row[col], reverse=descending)
    return deal_rank(ordered)
def read_uid_weibos(topic, date, windowsize, uid):
    """Return one user's weibos for a topic window, sorted by date.

    The window ends at datetime2ts(date) and spans ``windowsize`` days back.
    Each row is the list
    [wid, uid, name, location, friends_count, followers_count, created_at,
     statuses_count, profile_image_url, date, text, geo, source,
     reposts_count, comments_count, weibo_link]
    where date is the ts2date() value of the weibo timestamp.  When the user
    lookup returns nothing, the profile columns are u'未知' and the image
    column is u'no'.  Rows are sorted ascending on the date column.
    """
    window_end = datetime2ts(date)
    window_start = window_end - Day * windowsize
    searcher = getXapianWeiboByTopic(topic, window_start, window_end)

    count, results = searcher.search(query={'user': uid},
                                     fields=weibo_fields_list)

    profile_keys = ('name', 'location', 'friends_count', 'followers_count',
                    'created_at', 'statuses_count', 'profile_image_url')
    weibo_list = []
    if count != 0:
        for weibo in results():
            wid = weibo['_id']
            author_id = weibo['user']
            profile = user_search.search_by_id(author_id,
                                               fields=user_fields_list)
            if profile:
                profile_cols = [profile[key] for key in profile_keys]
            else:
                profile_cols = [u'未知'] * 6 + [u'no']

            text = weibo['text']
            geo = weibo['geo']
            source = weibo['source']
            posted_at = ts2date(weibo['timestamp'])
            reposts_count = weibo['reposts_count']
            comments_count = weibo['comments_count']
            weibo_link = weiboinfo2url(author_id, wid)
            # The original computed the domain and never used it; the call is
            # kept so behaviour is unchanged.
            uid2domain(author_id)

            weibo_list.append(
                [wid, author_id] + profile_cols +
                [posted_at, text, geo, source, reposts_count, comments_count,
                 weibo_link])

    weibo_list.sort(key=lambda row: row[9])
    return weibo_list
def parseWeibos(weibos):
    """Index decoded weibos by their ``_id``.

    ``weibos`` is decoded with ``_json_loads``; a falsy result yields ``{}``.
    Each weibo dict gets a ``weibo_link`` entry built by ``weiboinfo2url`` and
    is stored as ``[reposts_count, weibo]`` under its ``_id``.  Entries that
    cannot be processed (for example a missing key) are skipped.
    """
    parsed = _json_loads(weibos)
    if not parsed:
        return {}

    weibo_dict = {}
    for weibo in parsed:
        try:
            _id = weibo['_id']
            reposts_count = weibo['reposts_count']
            weibo['weibo_link'] = weiboinfo2url(weibo['user'], _id)
            weibo_dict[_id] = [reposts_count, weibo]
        except Exception:
            # Still best effort, but unlike the former bare ``except:`` this
            # no longer swallows KeyboardInterrupt / SystemExit.
            continue
    return weibo_dict
def parseWeibos(weibos):
    """Index decoded weibos by their ``_id``, enriched with user details.

    ``weibos`` is decoded with ``_json_loads``; a falsy result yields ``{}``.
    Each weibo dict gets ``weibo_link``, ``name``, ``profile_image_url`` and
    ``date`` (``ts2date`` of its timestamp) entries and is stored as
    ``[reposts_count, weibo]`` under its ``_id``.  Entries that cannot be
    processed (for example a missing key) are skipped.
    """
    parsed = _json_loads(weibos)
    if not parsed:
        return {}

    weibo_dict = {}
    for weibo in parsed:
        try:
            _id = weibo['_id']
            # getuserinfo supplies the user name and profile image URL
            username, profileimage = getuserinfo(weibo['user'])
            reposts_count = weibo['reposts_count']
            weibo['weibo_link'] = weiboinfo2url(weibo['user'], _id)
            weibo['name'] = username
            weibo['profile_image_url'] = profileimage
            weibo['date'] = ts2date(weibo['timestamp'])
            weibo_dict[_id] = [reposts_count, weibo]
        except Exception:
            # Still best effort, but unlike the former bare ``except:`` this
            # no longer swallows KeyboardInterrupt / SystemExit.
            continue
    return weibo_dict
def parseWeibos(weibos):
    """Index decoded weibos by their ``_id``, enriched with user details.

    ``weibos`` is decoded with ``_json_loads``; a falsy result yields ``{}``.
    Each weibo dict gets ``weibo_link``, ``name``, ``profile_image_url`` and
    ``date`` (``ts2date`` of its timestamp) entries and is stored as
    ``[reposts_count, weibo]`` under its ``_id``.  Entries that cannot be
    processed (for example a missing key) are skipped.
    """
    parsed = _json_loads(weibos)
    if not parsed:
        return {}

    weibo_dict = {}
    for weibo in parsed:
        try:
            _id = weibo['_id']
            # getuserinfo supplies the user name and profile image URL
            username, profileimage = getuserinfo(weibo['user'])
            reposts_count = weibo['reposts_count']
            weibo['weibo_link'] = weiboinfo2url(weibo['user'], _id)
            weibo['name'] = username
            weibo['profile_image_url'] = profileimage
            weibo['date'] = ts2date(weibo['timestamp'])
            weibo_dict[_id] = [reposts_count, weibo]
        except Exception:
            # Still best effort, but unlike the former bare ``except:`` this
            # no longer swallows KeyboardInterrupt / SystemExit.
            continue
    return weibo_dict
def parseWeibos(weibos):
    """Index decoded weibos by their ``_id``, enriched with user details.

    ``weibos`` is decoded with ``_json_loads``; a falsy result yields ``{}``.
    Each weibo dict gets ``weibo_link``, ``username`` and
    ``profile_image_url`` entries, has its ``timestamp`` replaced by the
    ``ts2date`` value, and is stored as ``[reposts_count, weibo]`` under its
    ``_id``.  Entries that cannot be processed (for example a missing key)
    are skipped.
    """
    parsed = _json_loads(weibos)
    if not parsed:
        return {}

    weibo_dict = {}
    for weibo in parsed:
        try:
            _id = weibo["_id"]
            username, profileimage = getuserinfo(weibo["user"])
            reposts_count = weibo["reposts_count"]
            weibo["weibo_link"] = weiboinfo2url(weibo["user"], _id)
            weibo["username"] = username
            weibo["profile_image_url"] = profileimage
            weibo["timestamp"] = ts2date(weibo["timestamp"])
            weibo_dict[_id] = [reposts_count, weibo]
        except Exception:
            # Still best effort, but unlike the former bare ``except:`` this
            # no longer swallows KeyboardInterrupt / SystemExit.
            continue
    return weibo_dict
def get_city_weibo(topic, start_ts, end_ts, unit=MinInterval, limit=TOP_WEIBOS_LIMIT):
    """Group the most-reposted stored weibos of a topic by province.

    CityWeibos rows are selected by their ``end`` column: when the window is
    shorter than ``unit`` only the row ending at ``end_ts`` rounded up to a
    multiple of ``unit`` is read; otherwise every row whose ``end`` lies in
    (start_ts rounded down, end_ts rounded up] is read.

    The weibos of those rows are sorted by ``reposts_count`` (descending) and
    the first 1000 are annotated in place with ``username``, ``time`` and
    ``weibo_link``.  A ``geo`` value with four dot-separated parts is resolved
    with ``IP2city`` (and overwritten by its result), any other value with
    ``geo2city``; the city is the second tab-separated field.

    Returns a dict mapping each city found in ``province_list`` to the list
    of its weibo dicts.
    """
    max_weibos = 1000
    upbound = int(math.ceil(end_ts / (unit * 1.0)) * unit)

    if end_ts - start_ts < unit:
        item = (
            db.session.query(CityWeibos)
            .filter(
                CityWeibos.end == upbound,
                CityWeibos.topic == topic,
                CityWeibos.range == unit,
                CityWeibos.limit == limit,
            )
            .first()
        )
        items = [item] if item else []
    else:
        # Explicit floor division: identical to the former ``/`` for Python 2
        # integers, and keeps the lower bound aligned to ``unit``.
        lowbound = (start_ts // unit) * unit
        items = (
            db.session.query(CityWeibos)
            .filter(
                CityWeibos.end > lowbound,
                CityWeibos.end <= upbound,
                CityWeibos.topic == topic,
                CityWeibos.range == unit,
                CityWeibos.limit == limit,
            )
            .all()
        )

    # Both window sizes now skip entries without a usable reposts_count;
    # previously only the long-window branch did.
    weibos = []
    for item in items:
        for weibo_item in _json_loads(item.weibos):
            try:
                weibos.append((weibo_item["reposts_count"], weibo_item))
            except (KeyError, TypeError):
                continue

    sorted_weibos = sorted(weibos, key=lambda pair: pair[0], reverse=True)

    city_dict = {}
    for _reposts_count, result in sorted_weibos[:max_weibos]:
        user_info = acquire_user_by_id(result["user"])
        if user_info:
            result["username"] = user_info["name"]
        else:
            result["username"] = "******"
        result["time"] = ts2date(result["timestamp"])

        try:
            geo = result["geo"]
            if len(geo.split(".")) == 4:
                full_area = IP2city(geo)
                result["geo"] = full_area
                city = full_area.split("\t")[1]
            else:
                city = geo2city(geo).split("\t")[1]
        except Exception:
            # Unresolvable location: keep the weibo out of every city bucket.
            city = ""

        result["weibo_link"] = weiboinfo2url(result["user"], result["_id"])
        if city in province_list:
            city_dict.setdefault(city, []).append(result)
    return city_dict
def get_city_weibo(topic, start_ts, end_ts, unit=MinInterval, limit=TOP_WEIBOS_LIMIT):
    """Group the most-reposted stored weibos of a topic by province.

    CityWeibos rows are selected by their ``end`` column: when the window is
    shorter than ``unit`` only the row ending at ``end_ts`` rounded up to a
    multiple of ``unit`` is read; otherwise every row whose ``end`` lies in
    (start_ts rounded down, end_ts rounded up] is read.

    The weibos of those rows are sorted by ``reposts_count`` (descending) and
    the first 1000 are annotated in place with ``username``, ``time`` and
    ``weibo_link``.  A ``geo`` value with four dot-separated parts is resolved
    with ``IP2city`` (and overwritten by its result), any other value with
    ``geo2city``; the city is the second tab-separated field.

    Returns a dict mapping each city found in ``province_list`` to the list
    of its weibo dicts.
    """
    max_weibos = 1000
    upbound = int(math.ceil(end_ts / (unit * 1.0)) * unit)

    if end_ts - start_ts < unit:
        item = db.session.query(CityWeibos).filter(
            CityWeibos.end == upbound,
            CityWeibos.topic == topic,
            CityWeibos.range == unit,
            CityWeibos.limit == limit).first()
        items = [item] if item else []
    else:
        # Explicit floor division: identical to the former ``/`` for Python 2
        # integers, and keeps the lower bound aligned to ``unit``.
        lowbound = (start_ts // unit) * unit
        items = db.session.query(CityWeibos).filter(
            CityWeibos.end > lowbound,
            CityWeibos.end <= upbound,
            CityWeibos.topic == topic,
            CityWeibos.range == unit,
            CityWeibos.limit == limit).all()

    # Both window sizes now skip entries without a usable reposts_count;
    # previously only the long-window branch did.
    weibos = []
    for item in items:
        for weibo_item in _json_loads(item.weibos):
            try:
                weibos.append((weibo_item['reposts_count'], weibo_item))
            except (KeyError, TypeError):
                continue

    sorted_weibos = sorted(weibos, key=lambda pair: pair[0], reverse=True)

    city_dict = {}
    for _reposts_count, result in sorted_weibos[:max_weibos]:
        user_info = acquire_user_by_id(result['user'])
        if user_info:
            result['username'] = user_info['name']
        else:
            result['username'] = '******'
        result['time'] = ts2date(result['timestamp'])

        try:
            geo = result['geo']
            if len(geo.split('.')) == 4:
                full_area = IP2city(geo)
                result['geo'] = full_area
                city = full_area.split('\t')[1]
            else:
                city = geo2city(geo).split('\t')[1]
        except Exception:
            # Unresolvable location: keep the weibo out of every city bucket.
            city = ''

        result['weibo_link'] = weiboinfo2url(result['user'], result['_id'])
        if city in province_list:
            city_dict.setdefault(city, []).append(result)
    return city_dict
def community_result(community_user_list, topic, date, windowsize):
    """Collect weibos, top keywords and sentiment shares for a user community.

    The search window ends at datetime2ts(date) and spans ``windowsize`` days
    back; the query matches any user id in ``community_user_list``.

    Returns a 4-tuple
    (sort_community_info, sort_top_keyword, new_sentiment_list, query_dict):
      * sort_community_info -- rows [_id, name, location, friends_count,
        followers_count, created_at, statuses_count, profile_image_url, text,
        date, reposts_count, source, geo, comments_count, sentiment_name,
        weibo_link, domain], sorted by reposts_count descending;
      * sort_top_keyword -- top_keywords() output sorted by its second field,
        descending;
      * new_sentiment_list -- [sentiment_name, count, count / total] triples;
      * query_dict -- the query that was executed.
    When nothing matches, the first three items are None.  The tuple keeps
    four items in that case too (it used to have three), so callers can
    unpack both outcomes the same way.
    """
    end_ts = datetime2ts(date)
    start_ts = end_ts - windowsize * Day
    xapian_search_weibo = getXapianWeiboByTopic(topic, start_ts, end_ts)

    query_dict = {'$or': [{'user': int(uid)} for uid in community_user_list]}
    count, weibo_results = xapian_search_weibo.search(query=query_dict, fields=weibo_fields_list)
    if count == 0:
        return None, None, None, query_dict

    community_info = []
    sentiment_count = {}
    for weibo in weibo_results():
        uid = weibo['user']
        _id = weibo['_id']
        result = user_search.search_by_id(uid, fields=user_fields_list)
        if result:
            name = result['name']
            location = result['location']
            friends_count = result['friends_count']
            followers_count = result['followers_count']
            created_at = result['created_at']
            statuses_count = result['statuses_count']
            profile_image_url = result['profile_image_url']
        else:
            # No profile found: fill the user columns with placeholders.
            name = u'未知'
            location = u'未知'
            friends_count = u'未知'
            followers_count = u'未知'
            created_at = u'未知'
            statuses_count = u'未知'
            profile_image_url = u'no'
        text = weibo['text']
        weibo_date = ts2date(weibo['timestamp'])
        reposts_count = weibo['reposts_count']
        source = weibo['source']
        geo = weibo['geo']
        comments_count = weibo['comments_count']
        sentiment = weibo['sentiment']
        sentiment_name = emotions_kv[sentiment]
        weibo_link = weiboinfo2url(uid, _id)
        domain = uid2domain(uid)

        sentiment_count[sentiment] = sentiment_count.get(sentiment, 0) + 1
        community_info.append([
            _id, name, location, friends_count, followers_count, created_at,
            statuses_count, profile_image_url, text, weibo_date,
            reposts_count, source, geo, comments_count, sentiment_name,
            weibo_link, domain
        ])

    # sort by repost count
    sort_community_info = sorted(community_info, key=lambda x: x[10], reverse=True)

    mset = xapian_search_weibo.search(query=query_dict, max_offset=50, mset_direct=True)
    top_keyword = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=50)
    sort_top_keyword = sorted(top_keyword, key=lambda x: x[1], reverse=True)

    new_sentiment_list = []
    for sentiment in sentiment_count:
        sentiment_ch = emotions_kv[int(sentiment)]
        num = sentiment_count[sentiment]
        ratio = float(num) / float(count)
        new_sentiment_list.append([sentiment_ch, num, ratio])

    return sort_community_info, sort_top_keyword, new_sentiment_list, query_dict
def read_uid_weibos(topic, date, windowsize, uid):
    """Return one user's weibos for a topic window, sorted by date.

    The window ends at datetime2ts(date) and spans ``windowsize`` days back.
    Each row is the list
    [wid, uid, name, location, friends_count, followers_count, created_at,
     statuses_count, profile_image_url, date, text, geo, source,
     reposts_count, comments_count, weibo_link]
    where date is the ts2date() value of the weibo timestamp.  When the user
    lookup returns nothing, the profile columns are u"未知" and the image
    column is u"no".  Rows are sorted ascending on the date column.
    """
    window_end = datetime2ts(date)
    window_start = window_end - Day * windowsize
    searcher = getXapianWeiboByTopic(topic, window_start, window_end)

    count, results = searcher.search(query={"user": uid},
                                     fields=weibo_fields_list)

    profile_keys = ("name", "location", "friends_count", "followers_count",
                    "created_at", "statuses_count", "profile_image_url")
    weibo_list = []
    if count != 0:
        for weibo in results():
            wid = weibo["_id"]
            author_id = weibo["user"]
            profile = user_search.search_by_id(author_id,
                                               fields=user_fields_list)
            if profile:
                profile_cols = [profile[key] for key in profile_keys]
            else:
                profile_cols = [u"未知"] * 6 + [u"no"]

            text = weibo["text"]
            geo = weibo["geo"]
            source = weibo["source"]
            posted_at = ts2date(weibo["timestamp"])
            reposts_count = weibo["reposts_count"]
            comments_count = weibo["comments_count"]
            weibo_link = weiboinfo2url(author_id, wid)
            # The original computed the domain and never used it; the call is
            # kept so behaviour is unchanged.
            uid2domain(author_id)

            weibo_list.append(
                [wid, author_id] + profile_cols +
                [posted_at, text, geo, source, reposts_count, comments_count,
                 weibo_link])

    weibo_list.sort(key=lambda row: row[9])
    return weibo_list
def c_weibo_by_ts(topic, date, windowsize, uid, network_type, cid, rank_method):
    """Return the weibos posted by the community that node ``uid`` belongs to.

    The community members come from get_community_user() applied to the GEXF
    graph stored for the topic; ``network_type`` selects the file
    ('source_graph' -> '_gg_graph.gexf',
    'direct_superior_graph' -> '_ds_udg_graph.gexf').

    Rows are [_id, name, location, friends_count, followers_count,
    created_at, statuses_count, profile_image_url, text, date, reposts_count,
    source, geo, comments_count, sentiment_name, weibo_link, uid, timestamp].
    With rank_method 'reposts_count' they are sorted by repost count,
    descending; otherwise by raw timestamp, ascending.

    Returns None when the topic is unknown or no weibo matches.  The
    unknown-topic case used to return the truthy tuple (None, None, None),
    unlike every other path of this function.

    Raises ValueError for an unsupported ``network_type`` (previously an
    unbound-variable error).
    """
    real_topic_id = acquire_real_topic_id(topic, date, windowsize)
    if not real_topic_id:
        return None

    # The topic exists: continue with the computation.
    key_pre = str(real_topic_id) + '_' + str(date) + '_' + str(windowsize)
    # Pick the directed graph used to compute the community information.
    if network_type == 'source_graph':
        key = str(GRAPH_PATH) + key_pre + '_gg_graph.gexf'
    elif network_type == 'direct_superior_graph':
        key = str(GRAPH_PATH) + key_pre + '_ds_udg_graph.gexf'
    else:
        raise ValueError('unsupported network_type: %r' % (network_type,))
    g = nx.read_gexf(key)

    # List of the nodes in the community that contains node uid.
    # NOTE(review): the original comment asks where the node's community
    # attribute is stored -- confirm against get_community_user.
    community_user_list = get_community_user(g, uid, cid)

    end_ts = datetime2ts(date)
    start_ts = end_ts - Day * windowsize
    xapian_search_weibo = getXapianWeiboByTopic(topic, start_ts, end_ts)
    query_dict = {
        '$or': [{'user': int(member)} for member in community_user_list]
    }
    count, weibo_results = xapian_search_weibo.search(query=query_dict, fields=weibo_fields_list)
    if count == 0:
        return None

    community_info = []
    for weibo in weibo_results():
        author_id = weibo['user']
        _id = weibo['_id']
        result = user_search.search_by_id(author_id, fields=user_fields_list)
        if result:
            name = result['name']
            location = result['location']
            friends_count = result['friends_count']
            followers_count = result['followers_count']
            created_at = result['created_at']
            statuses_count = result['statuses_count']
            profile_image_url = result['profile_image_url']
        else:
            # No profile found: fill the user columns with placeholders.
            name = u'未知'
            location = u'未知'
            friends_count = u'未知'
            followers_count = u'未知'
            created_at = u'未知'
            statuses_count = u'未知'
            profile_image_url = u'no'
        text = weibo['text']
        timestamp = weibo['timestamp']
        weibo_date = ts2date(timestamp)
        reposts_count = weibo['reposts_count']
        source = weibo['source']
        geo = weibo['geo']
        comments_count = weibo['comments_count']
        sentiment = weibo['sentiment']
        sentiment_name = emotions_kv[sentiment]
        weibo_link = weiboinfo2url(author_id, _id)

        community_info.append([
            _id, name, location, friends_count, followers_count, created_at,
            statuses_count, profile_image_url, text, weibo_date,
            reposts_count, source, geo, comments_count, sentiment_name,
            weibo_link, author_id, timestamp
        ])

    if rank_method == 'reposts_count':
        # sort by repost count
        sort_community_info = sorted(community_info, key=lambda x: x[10], reverse=True)
    else:
        # sort by timestamp
        sort_community_info = sorted(community_info, key=lambda x: x[17])
    return sort_community_info
def save_weibos(excel_name, topic, child_topic_list, w_limit):
    """Store test weibos for each child topic as OpinionTestWeibos rows.

    For every entry ``i`` of ``child_topic_list`` the sheet named
    ``str(int(i))`` of the workbook ``excel_name`` is read; at most
    ``w_limit`` rows are kept as (text, weight) pairs taken from columns 1
    and 0.  Weibos fetched from a fixed Xapian index and time range are then
    given those texts, five per topic for topics 0..10, and enriched with
    username, profile image, formatted timestamp and link.  Finally, for each
    topic index, any existing OpinionTestWeibos row is replaced by a new one
    holding the JSON-encoded (weibo, weight) list.

    NOTE(review): the Xapian path, time range and the 11 x 5 layout are
    hard-coded; the original comments mark this part as test-only.
    """
    # TODO (from the original): look up the related weibo ids etc. by text.
    data = xlrd.open_workbook(excel_name)
    weibos_dict = {}
    for i in child_topic_list:
        table_weibos = data.sheet_by_name(str(int(i)))
        # Fixed: the original assigned the undefined name ``n_row_weibo``
        # whenever the sheet had no more than w_limit rows.
        n_rows = min(table_weibos.nrows, w_limit)
        # The sheet rows are already ordered by weight, descending.
        topic_weibos = []
        for j in range(n_rows):
            line = table_weibos.row_values(j)
            weibo_text = line[1]
            weibo_weight = line[0]
            # TODO (from the original): this should hold the full weibo,
            # including username etc., looked up by its text.
            topic_weibos.append((weibo_text, weibo_weight))
        weibos_dict[i] = topic_weibos

    # Fetch concrete weibo data -- for testing only.
    s = XapianSearch(path='/home/ubuntu3/huxiaoqian/case/20140724/20140804/', name='master_timeline_weibo', schema_version='5')
    begin_ts = 1378050300
    end_ts = 1378051200
    query_dict = {
        'timestamp': {'$gt': begin_ts, '$lt': end_ts},
        'message_type': 2
    }
    weibos_dict_new = {}
    scount, weibo_results = s.search(query=query_dict, fields=fields_list)
    i = 0  # child topic index
    j = 0  # position within the topic, 0..4
    for weibo in weibo_results():
        if i == 11:
            break
        weibo['text'] = weibos_dict[str(i)][j][0]
        # attach username, profile image and weibo URL
        username, profileimage = getuserinfo(weibo['user'])
        weibo['username'] = username
        weibo['profile_image_url'] = profileimage
        weibo['timestamp'] = ts2date(weibo['timestamp'])
        weibo['weibo_link'] = weiboinfo2url(weibo['user'], weibo['_id'])
        weight = weibos_dict[str(i)][j][1]
        weibos_dict_new.setdefault(i, []).append((weibo, weight))
        if j == 4:
            j = 0
            i += 1
        else:
            j += 1

    for i in range(len(child_topic_list)):
        item = OpinionTestWeibos(topic, i, json.dumps(weibos_dict_new[i]))
        item_exist = db.session.query(OpinionTestWeibos).filter(
            OpinionTestWeibos.topic == topic,
            OpinionTestWeibos.child_topic == i).first()
        if item_exist:
            db.session.delete(item_exist)
        db.session.add(item)
        db.session.commit()
def get_info(neighbor_list, topic, date, windowsize):
    """Collect weibos, top keywords and sentiment shares for a set of users.

    The search window ends at datetime2ts(date) and spans ``windowsize`` days
    back; the query matches any user id in ``neighbor_list``.

    Returns a 4-tuple
    (sort_neighbor_info, sort_top_keyword, new_sentiment_list, query_dict):
      * sort_neighbor_info -- rows [_id, name, location, friends_count,
        followers_count, created_at, statuses_count, profile_image_url, text,
        date, reposts_count, source, geo, comments_count, sentiment_name,
        weibo_link, uid], sorted by reposts_count descending;
      * sort_top_keyword -- top_keywords() output sorted by its second field,
        descending;
      * new_sentiment_list -- [sentiment_name, count, count / total] triples;
      * query_dict -- the query that was executed.
    When nothing matches, the first three items are None.  The tuple keeps
    four items in that case too (it used to have three), so callers can
    unpack both outcomes the same way.
    """
    end_ts = datetime2ts(date)
    start_ts = end_ts - windowsize * Day
    xapian_search_weibo = getXapianWeiboByTopic(topic, start_ts, end_ts)

    query_dict = {'$or': [{'user': int(uid)} for uid in neighbor_list]}
    count, weibo_results = xapian_search_weibo.search(query=query_dict, fields=weibo_fields_list)
    if count == 0:
        return None, None, None, query_dict

    neighbor_info = []
    sentiment_count = {}
    for weibo in weibo_results():
        uid = weibo['user']
        _id = weibo['_id']
        result = user_search.search_by_id(uid, fields=user_fields_list)
        if result:
            name = result['name']
            location = result['location']
            friends_count = result['friends_count']
            followers_count = result['followers_count']
            created_at = result['created_at']
            statuses_count = result['statuses_count']
            profile_image_url = result['profile_image_url']
        else:
            # No profile found: fill the user columns with placeholders.
            name = u'未知'
            location = u'未知'
            friends_count = u'未知'
            followers_count = u'未知'
            created_at = u'未知'
            statuses_count = u'未知'
            profile_image_url = u'no'
        text = weibo['text']
        weibo_date = ts2date(weibo['timestamp'])
        reposts_count = weibo['reposts_count']
        source = weibo['source']
        geo = weibo['geo']
        comments_count = weibo['comments_count']
        sentiment = weibo['sentiment']
        sentiment_name = emotions_kv[sentiment]
        weibo_link = weiboinfo2url(uid, _id)

        sentiment_count[sentiment] = sentiment_count.get(sentiment, 0) + 1
        neighbor_info.append([
            _id, name, location, friends_count, followers_count, created_at,
            statuses_count, profile_image_url, text, weibo_date,
            reposts_count, source, geo, comments_count, sentiment_name,
            weibo_link, uid
        ])

    # sort by repost count
    sort_neighbor_info = sorted(neighbor_info, key=lambda x: x[10], reverse=True)

    mset = xapian_search_weibo.search(query=query_dict, max_offset=50, mset_direct=True)
    top_keyword = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=50)
    sort_top_keyword = sorted(top_keyword, key=lambda x: x[1], reverse=True)

    new_sentiment_list = []
    for sentiment in sentiment_count:
        sentiment_ch = emotions_kv[int(sentiment)]
        num = sentiment_count[sentiment]
        ratio = float(num) / float(count)
        new_sentiment_list.append([sentiment_ch, num, ratio])

    return sort_neighbor_info, sort_top_keyword, new_sentiment_list, query_dict
def time_domain_top_user(topic, date, windowsize, domain, rank_method): #results = {'folk':[], 'media':[], 'opinion_leader':[], 'oversea':[], 'other':[]} print 'topic, date, windowsize:', topic.encode('utf-8'), date, windowsize #domain_list = ['folk', 'media', 'opinion_leader', 'oversea', 'other'] items = db.session.query(FirstDomainUser).filter(FirstDomainUser.topic==topic ,\ FirstDomainUser.date==date ,\ FirstDomainUser.windowsize==windowsize ,\ FirstDomainUser.user_domain==domain).all() results = [] for item in items: domain = item.user_domain domain_name = domain_dict[domain] timestamp = item.timestamp timestamp = ts2date(timestamp) uid = item.uid weibo_info = json.loads(item.weibo_info) text = weibo_info['text'] geo = weibo_info['geo'] source = weibo_info['source'] _id = weibo_info['_id'] weibo_link = weiboinfo2url(uid, _id) user_info = json.loads(item.user_info) uname = user_info['name'] location = user_info['location'] profile_image_url = user_info['profile_image_url'] if profile_image_url == u'未知': profile_image_url = '' friends_count = user_info['friends_count'] followers_count = user_info['followers_count'] statuses_count = user_info['statuses_count'] if friends_count == u'未知': friends_count = -1 if statuses_count == u'未知': statuses_count = -1 created_at = user_info['created_at'] #rank = item.rank row = [ uid, uname, location, domain_name, timestamp, text, profile_image_url, friends_count, followers_count, statuses_count, created_at, geo, source, weibo_link, _id ] results.append(row) sorted_results = [] if rank_method == 'timestamp': sorted_results = sorted(results, key=lambda x: x[4]) elif rank_method == 'friends_count': sorted_results = sorted(results, key=lambda x: x[7], reverse=True) elif rank_method == 'statuses_count': sorted_results = sorted(results, key=lambda x: x[9], reverse=True) new_results = [] for i in range(len(items)): new_row = [i + 1] for j in range(len(sorted_results[0])): if j == 7 and sorted_results[i][j] == -1: new_row.append(u'未知') 
elif j == 9 and sorted_results[i][j] == -1: new_row.append(u'未知') else: new_row.append(sorted_results[i][j]) new_results.append(new_row) return new_results
def time_top_user(topic, date, windowsize, rank_method): results = [] print 'topic, date, windowsize:', topic.encode('utf-8'), date, windowsize items = db.session.query(FirstUser).filter(FirstUser.topic==topic ,\ FirstUser.date==date ,\ FirstUser.windowsize==windowsize).all() #print 'len(items):', len(items) if items: for item in items: uid = item.uid timestamp = item.timestamp timestamp = ts2date(timestamp) user_info = json.loads(item.user_info) uname = user_info['name'] location = user_info['location'] profile_image_url = user_info['profile_image_url'] if profile_image_url == u'未知': profile_image_url = '' friends_count = user_info['friends_count'] followers_count = user_info['followers_count'] statuses_count = user_info['statuses_count'] if friends_count == u'未知': friends_count = -1 if statuses_count == u'未知': statuses_count = -1 created_at = user_info['created_at'] weibo_info = json.loads(item.weibo_info) text = weibo_info['text'] geo = weibo_info['geo'] source = weibo_info['source'] _id = weibo_info['_id'] weibo_link = weiboinfo2url(uid, _id) user_domain = item.user_domain domain_name = domain_dict[user_domain] row = [ uid, uname, location, domain_name, timestamp, text, profile_image_url, friends_count, followers_count, statuses_count, created_at, geo, source, weibo_link, _id ] results.append(row) #print 'results:', results sorted_results = [] #print 'rank_method:', rank_method if rank_method == 'timestamp': sorted_results = sorted(results, key=lambda x: x[4]) elif rank_method == 'friends_count': sorted_results = sorted(results, key=lambda x: x[7], reverse=True) elif rank_method == 'statuses_count': sorted_results = sorted(results, key=lambda x: x[9], reverse=True) #print 'sorted_results',sorted_results #print 'sorted_results[0]:', sorted_results[0] new_results = [] for i in range(len(items)): new_row = [i + 1] for j in range(len(sorted_results[0])): if j == 7 and sorted_results[i][j] == -1: new_row.append(u'未知') elif j == 9 and sorted_results[i][j] == -1: 
new_row.append(u'未知') else: new_row.append(sorted_results[i][j]) new_results.append(new_row) return new_results
def community_result(community_user_list, topic, date, windowsize):
    """Collect weibos, top keywords and sentiment shares for a user community.

    The search window ends at datetime2ts(date) and spans ``windowsize`` days
    back; the query matches any user id in ``community_user_list``.

    Returns a 4-tuple
    (sort_community_info, sort_top_keyword, new_sentiment_list, query_dict):
      * sort_community_info -- rows [_id, name, location, friends_count,
        followers_count, created_at, statuses_count, profile_image_url, text,
        date, reposts_count, source, geo, comments_count, sentiment_name,
        weibo_link, domain], sorted by reposts_count descending;
      * sort_top_keyword -- top_keywords() output sorted by its second field,
        descending;
      * new_sentiment_list -- [sentiment_name, count, count / total] triples;
      * query_dict -- the query that was executed.
    When nothing matches, the first three items are None.  The tuple keeps
    four items in that case too (it used to have three), so callers can
    unpack both outcomes the same way.
    """
    end_ts = datetime2ts(date)
    start_ts = end_ts - windowsize * Day
    xapian_search_weibo = getXapianWeiboByTopic(topic, start_ts, end_ts)

    query_dict = {"$or": [{"user": int(uid)} for uid in community_user_list]}
    count, weibo_results = xapian_search_weibo.search(query=query_dict, fields=weibo_fields_list)
    if count == 0:
        return None, None, None, query_dict

    community_info = []
    sentiment_count = {}
    for weibo in weibo_results():
        uid = weibo["user"]
        _id = weibo["_id"]
        result = user_search.search_by_id(uid, fields=user_fields_list)
        if result:
            name = result["name"]
            location = result["location"]
            friends_count = result["friends_count"]
            followers_count = result["followers_count"]
            created_at = result["created_at"]
            statuses_count = result["statuses_count"]
            profile_image_url = result["profile_image_url"]
        else:
            # No profile found: fill the user columns with placeholders.
            name = u"未知"
            location = u"未知"
            friends_count = u"未知"
            followers_count = u"未知"
            created_at = u"未知"
            statuses_count = u"未知"
            profile_image_url = u"no"
        text = weibo["text"]
        weibo_date = ts2date(weibo["timestamp"])
        reposts_count = weibo["reposts_count"]
        source = weibo["source"]
        geo = weibo["geo"]
        comments_count = weibo["comments_count"]
        sentiment = weibo["sentiment"]
        sentiment_name = emotions_kv[sentiment]
        weibo_link = weiboinfo2url(uid, _id)
        domain = uid2domain(uid)

        sentiment_count[sentiment] = sentiment_count.get(sentiment, 0) + 1
        community_info.append(
            [
                _id,
                name,
                location,
                friends_count,
                followers_count,
                created_at,
                statuses_count,
                profile_image_url,
                text,
                weibo_date,
                reposts_count,
                source,
                geo,
                comments_count,
                sentiment_name,
                weibo_link,
                domain,
            ]
        )

    # sort by repost count
    sort_community_info = sorted(community_info, key=lambda x: x[10], reverse=True)

    mset = xapian_search_weibo.search(query=query_dict, max_offset=50, mset_direct=True)
    top_keyword = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=["terms"]), top=50)
    sort_top_keyword = sorted(top_keyword, key=lambda x: x[1], reverse=True)

    new_sentiment_list = []
    for sentiment in sentiment_count:
        sentiment_ch = emotions_kv[int(sentiment)]
        num = sentiment_count[sentiment]
        ratio = float(num) / float(count)
        new_sentiment_list.append([sentiment_ch, num, ratio])

    return sort_community_info, sort_top_keyword, new_sentiment_list, query_dict
def save_weibos(excel_name, topic, child_topic_list, w_limit):
    """Store test weibos for each child topic as OpinionTestWeibos rows.

    For every entry ``i`` of ``child_topic_list`` the sheet named
    ``str(int(i))`` of the workbook ``excel_name`` is read; at most
    ``w_limit`` rows are kept as (text, weight) pairs taken from columns 1
    and 0.  Weibos fetched from a fixed Xapian index and time range are then
    given those texts, five per topic for topics 0..10, and enriched with
    username, profile image, formatted timestamp and link.  Finally, for each
    topic index, any existing OpinionTestWeibos row is replaced by a new one
    holding the JSON-encoded (weibo, weight) list.

    NOTE(review): the Xapian path, time range and the 11 x 5 layout are
    hard-coded; the original comments mark this part as test-only.
    """
    # TODO (from the original): look up the related weibo ids etc. by text.
    data = xlrd.open_workbook(excel_name)
    weibos_dict = {}
    for i in child_topic_list:
        table_weibos = data.sheet_by_name(str(int(i)))
        # Fixed: the original assigned the undefined name ``n_row_weibo``
        # whenever the sheet had no more than w_limit rows.
        n_rows = min(table_weibos.nrows, w_limit)
        # The sheet rows are already ordered by weight, descending.
        topic_weibos = []
        for j in range(n_rows):
            line = table_weibos.row_values(j)
            weibo_text = line[1]
            weibo_weight = line[0]
            # TODO (from the original): this should hold the full weibo,
            # including username etc., looked up by its text.
            topic_weibos.append((weibo_text, weibo_weight))
        weibos_dict[i] = topic_weibos

    # Fetch concrete weibo data -- for testing only.
    s = XapianSearch(path='/home/ubuntu3/huxiaoqian/case/20140724/20140804/',
                     name='master_timeline_weibo',
                     schema_version='5')
    begin_ts = 1378050300
    end_ts = 1378051200
    query_dict = {
        'timestamp': {
            '$gt': begin_ts,
            '$lt': end_ts
        },
        'message_type': 2
    }
    weibos_dict_new = {}
    scount, weibo_results = s.search(query=query_dict, fields=fields_list)
    i = 0  # child topic index
    j = 0  # position within the topic, 0..4
    for weibo in weibo_results():
        if i == 11:
            break
        weibo['text'] = weibos_dict[str(i)][j][0]
        # attach username, profile image and weibo URL
        username, profileimage = getuserinfo(weibo['user'])
        weibo['username'] = username
        weibo['profile_image_url'] = profileimage
        weibo['timestamp'] = ts2date(weibo['timestamp'])
        weibo['weibo_link'] = weiboinfo2url(weibo['user'], weibo['_id'])
        weight = weibos_dict[str(i)][j][1]
        weibos_dict_new.setdefault(i, []).append((weibo, weight))
        if j == 4:
            j = 0
            i += 1
        else:
            j += 1

    for i in range(len(child_topic_list)):
        item = OpinionTestWeibos(topic, i, json.dumps(weibos_dict_new[i]))
        item_exist = db.session.query(OpinionTestWeibos).filter(
            OpinionTestWeibos.topic == topic,
            OpinionTestWeibos.child_topic == i).first()
        if item_exist:
            db.session.delete(item_exist)
        db.session.add(item)
        db.session.commit()
def time_domain_top_user(topic, date, windowsize, domain, rank_method): # results = {'folk':[], 'media':[], 'opinion_leader':[], 'oversea':[], 'other':[]} print "topic, date, windowsize:", topic.encode("utf-8"), date, windowsize # domain_list = ['folk', 'media', 'opinion_leader', 'oversea', 'other'] items = ( db.session.query(FirstDomainUser) .filter( FirstDomainUser.topic == topic, FirstDomainUser.date == date, FirstDomainUser.windowsize == windowsize, FirstDomainUser.user_domain == domain, ) .all() ) results = [] for item in items: domain = item.user_domain domain_name = domain_dict[domain] timestamp = item.timestamp timestamp = ts2date(timestamp) uid = item.uid weibo_info = json.loads(item.weibo_info) text = weibo_info["text"] geo = weibo_info["geo"] source = weibo_info["source"] _id = weibo_info["_id"] weibo_link = weiboinfo2url(uid, _id) user_info = json.loads(item.user_info) uname = user_info["name"] location = user_info["location"] profile_image_url = user_info["profile_image_url"] if profile_image_url == u"未知": profile_image_url = "" friends_count = user_info["friends_count"] followers_count = user_info["followers_count"] statuses_count = user_info["statuses_count"] if friends_count == u"未知": friends_count = -1 if statuses_count == u"未知": statuses_count = -1 created_at = user_info["created_at"] # rank = item.rank row = [ uid, uname, location, domain_name, timestamp, text, profile_image_url, friends_count, followers_count, statuses_count, created_at, geo, source, weibo_link, _id, ] results.append(row) sorted_results = [] if rank_method == "timestamp": sorted_results = sorted(results, key=lambda x: x[4]) elif rank_method == "friends_count": sorted_results = sorted(results, key=lambda x: x[7], reverse=True) elif rank_method == "statuses_count": sorted_results = sorted(results, key=lambda x: x[9], reverse=True) new_results = [] for i in range(len(items)): new_row = [i + 1] for j in range(len(sorted_results[0])): if j == 7 and sorted_results[i][j] == -1: 
new_row.append(u"未知") elif j == 9 and sorted_results[i][j] == -1: new_row.append(u"未知") else: new_row.append(sorted_results[i][j]) new_results.append(new_row) return new_results
def time_top_user(topic, date, windowsize, rank_method): results = [] print "topic, date, windowsize:", topic.encode("utf-8"), date, windowsize items = ( db.session.query(FirstUser) .filter(FirstUser.topic == topic, FirstUser.date == date, FirstUser.windowsize == windowsize) .all() ) # print 'len(items):', len(items) if items: for item in items: uid = item.uid timestamp = item.timestamp timestamp = ts2date(timestamp) user_info = json.loads(item.user_info) uname = user_info["name"] location = user_info["location"] profile_image_url = user_info["profile_image_url"] if profile_image_url == u"未知": profile_image_url = "" friends_count = user_info["friends_count"] followers_count = user_info["followers_count"] statuses_count = user_info["statuses_count"] if friends_count == u"未知": friends_count = -1 if statuses_count == u"未知": statuses_count = -1 created_at = user_info["created_at"] weibo_info = json.loads(item.weibo_info) text = weibo_info["text"] geo = weibo_info["geo"] source = weibo_info["source"] _id = weibo_info["_id"] weibo_link = weiboinfo2url(uid, _id) user_domain = item.user_domain domain_name = domain_dict[user_domain] row = [ uid, uname, location, domain_name, timestamp, text, profile_image_url, friends_count, followers_count, statuses_count, created_at, geo, source, weibo_link, _id, ] results.append(row) # print 'results:', results sorted_results = [] # print 'rank_method:', rank_method if rank_method == "timestamp": sorted_results = sorted(results, key=lambda x: x[4]) elif rank_method == "friends_count": sorted_results = sorted(results, key=lambda x: x[7], reverse=True) elif rank_method == "statuses_count": sorted_results = sorted(results, key=lambda x: x[9], reverse=True) # print 'sorted_results',sorted_results # print 'sorted_results[0]:', sorted_results[0] new_results = [] for i in range(len(items)): new_row = [i + 1] for j in range(len(sorted_results[0])): if j == 7 and sorted_results[i][j] == -1: new_row.append(u"未知") elif j == 9 and 
sorted_results[i][j] == -1: new_row.append(u"未知") else: new_row.append(sorted_results[i][j]) new_results.append(new_row) return new_results