def sentimentCronTopic(topic, xapian_search_weibo, start_ts, over_ts, sort_field=SORT_FIELD, save_fields=RESP_ITER_KEYS, \ during=Fifteenminutes, w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT): if topic and topic != '': start_ts = int(start_ts) over_ts = int(over_ts) over_ts = ts2HourlyTime(over_ts, during) interval = (over_ts - start_ts) / during for i in range(interval, 0, -1): emotions_kcount = {} emotions_count = {} emotions_weibo = {} begin_ts = over_ts - during * i end_ts = begin_ts + during print begin_ts, end_ts, 'topic %s starts calculate' % topic.encode( 'utf-8') query_dict = { 'timestamp': { '$gt': begin_ts, '$lt': end_ts }, '$and': [ { '$or': [{ 'message_type': 1 }, { 'message_type': 3 }] }, ] } for k, v in emotions_kv.iteritems(): query_dict['sentiment'] = v count, results = xapian_search_weibo.search(query=query_dict, fields=save_fields) mset = xapian_search_weibo.search(query=query_dict, sort_by=[sort_field], \ max_offset=w_limit, mset_direct=True) kcount = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=k_limit) top_ws = top_weibos(results, top=w_limit) emotions_count[v] = [end_ts, count] emotions_kcount[v] = [end_ts, kcount] emotions_weibo[v] = [end_ts, top_ws] save_rt_results('count', topic, emotions_count, during) save_rt_results('kcount', topic, emotions_kcount, during, k_limit, w_limit) save_rt_results('weibos', topic, emotions_weibo, during, k_limit, w_limit)
def propagateCronTopic(topic, xapian_search_weibo, start_ts, over_ts, sort_field=SORT_FIELD, \ save_fields=RESP_ITER_KEYS, during=Fifteenminutes, w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT): if topic and topic != '': start_ts = int(start_ts) over_ts = int(over_ts) over_ts = ts2HourlyTime(over_ts, during) interval = (over_ts - start_ts) / during for i in range(interval, 0, -1): begin_ts = over_ts - during * i end_ts = begin_ts + during print begin_ts, end_ts, 'topic %s starts calculate' % topic.encode( 'utf-8') mtype_count = {} mtype_kcount = {} # mtype_kcount={mtype:[terms]} mtype_weibo = {} # mtype_weibo={mtype:weibo} query_dict = {'timestamp': {'$gt': begin_ts, '$lt': end_ts}} for k, v in mtype_kv.iteritems(): query_dict['message_type'] = v count, results = xapian_search_weibo.search(query=query_dict, fields=fields_list) mset = xapian_search_weibo.search(query=query_dict, sort_by=[sort_field], \ max_offset=w_limit, mset_direct=True) kcount = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=k_limit) top_ws = top_weibos(results, top=w_limit) mtype_count[v] = [end_ts, count] mtype_kcount[v] = [end_ts, kcount] mtype_weibo[v] = [end_ts, top_ws] save_pc_results(topic, mtype_count, during) save_kc_results(topic, mtype_kcount, during, k_limit) save_ws_results(topic, mtype_weibo, during, w_limit)
def sentimentCronTopic(topic, xapian_search_weibo, start_ts, over_ts, sort_field=SORT_FIELD, save_fields=RESP_ITER_KEYS, \ during=Fifteenminutes, w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT): if topic and topic != '': start_ts = int(start_ts) over_ts = int(over_ts) over_ts = ts2HourlyTime(over_ts, during) interval = (over_ts - start_ts) / during for i in range(interval, 0, -1): emotions_kcount = {} emotions_count = {} emotions_weibo = {} begin_ts = over_ts - during * i end_ts = begin_ts + during print begin_ts, end_ts, 'topic %s starts calculate' % topic.encode('utf-8') query_dict = { 'timestamp': {'$gt': begin_ts, '$lt': end_ts}, '$and': [ {'$or': [{'message_type': 1}, {'message_type': 3}]}, ] } for k, v in emotions_kv.iteritems(): query_dict['sentiment'] = v count, results = xapian_search_weibo.search(query=query_dict, fields=save_fields) mset = xapian_search_weibo.search(query=query_dict, sort_by=[sort_field], \ max_offset=w_limit, mset_direct=True) kcount = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=k_limit) top_ws = top_weibos(results, top=w_limit) emotions_count[v] = [end_ts, count] emotions_kcount[v] = [end_ts, kcount] emotions_weibo[v] = [end_ts, top_ws] save_rt_results('count', topic, emotions_count, during) save_rt_results('kcount', topic, emotions_kcount, during, k_limit, w_limit) save_rt_results('weibos', topic, emotions_weibo, during, k_limit, w_limit)
def propagateCronTopic(topic, xapian_search_weibo, start_ts, over_ts, sort_field=SORT_FIELD, \ save_fields=RESP_ITER_KEYS, during=Fifteenminutes, w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT): if topic and topic != '': start_ts = int(start_ts) over_ts = int(over_ts) over_ts = ts2HourlyTime(over_ts, during) interval = (over_ts - start_ts) / during for i in range(interval, 0, -1): begin_ts = over_ts - during * i end_ts = begin_ts + during print begin_ts, end_ts, 'topic %s starts calculate' % topic.encode('utf-8') mtype_count = {} mtype_kcount = {} # mtype_kcount={mtype:[terms]} mtype_weibo = {} # mtype_weibo={mtype:weibo} query_dict = { 'timestamp': {'$gt': begin_ts, '$lt': end_ts} } for k, v in mtype_kv.iteritems(): query_dict['message_type'] = v count, results = xapian_search_weibo.search(query=query_dict, fields=fields_list) mset = xapian_search_weibo.search(query=query_dict, sort_by=[sort_field], \ max_offset=w_limit, mset_direct=True) kcount = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=k_limit) top_ws = top_weibos(results, top=w_limit) mtype_count[v] = [end_ts, count] mtype_kcount[v] = [end_ts, kcount] mtype_weibo[v] = [end_ts, top_ws] save_pc_results(topic, mtype_count, during) save_kc_results(topic, mtype_kcount, during, k_limit) save_ws_results(topic, mtype_weibo, during, w_limit)
def cron_index_topic(topic, begin_ts = BEGIN_TS, end_ts = END_TS):
    '''Index one topic's weibos in [begin_ts, end_ts]: collect participants,
    time span, per-city counts, top keywords, and the selections by time and
    by media, then persist everything with save_rt_results.

    NOTE(review): time_list[0]/[-1] below raise IndexError when the search
    returns no results — confirm callers guarantee a non-empty topic window.
    '''
    if topic and topic != '':
        topic = topic.strip()
        query_dict = {
            'timestamp':{'$gt':begin_ts,'$lt':end_ts},
            'topics':topic
        }
        # Two identical searches: `results` is consumed by the loop below,
        # `results2` is kept for top_keywords — presumably because the result
        # iterator cannot be re-run; TODO confirm with the search API.
        count,results = s.search(query=query_dict, sort_by=[SORT_FIELD], fields=RESP_ITER_KEYS)
        count2,results2 = s.search(query=query_dict, sort_by=[SORT_FIELD], fields=RESP_ITER_KEYS)
        user_raw_list = []
        time_list = []
        sublist_by_time=[]
        city_dict = {}
        # smoke test for city_dict (kept from the original author):
        # city = geo2city('219.224.135.46')
        # print city,city_dict.setdefault(city,0)
        # city_dict[city] += 1
        # print 'city_dict[city]',city_dict[city]
        print 'count',count
        for r in results():
            user_raw_list.append(r['user'])  # raw (duplicated) participant list
            time_list.append(r['timestamp'])  # timestamps in result order
            # sub-list of the fields needed by select_by_time/select_by_media
            sublist_by_time.append((r['timestamp'],r['reposts_count'],r['user'],r['source'],r['text'],r['comments_count'],r['geo']))
            city = geo2city(r['geo'])
            # print city
            city_dict.setdefault(city,0)
            city_dict[city] += 1  # bump this city's counter
            # print 'city_dict[city]',city_dict[city]
            # print '_id',r['_id']
            # print 'user',r['user']
            # print 'timestamp',ts2datetime_full(r['timestamp'])
            # print 'reposts_count',r['reposts_count']
        user_count = len(list(set(user_raw_list)))  # deduplicated participant count
        print 'user_count',user_count
        topic_begin_ts = ts2datetime_full(time_list[0])
        topic_end_ts = ts2datetime_full(time_list[-1])
        print 'topic_begin_ts',topic_begin_ts
        print 'topic_end_ts',topic_end_ts
        final_list_by_time = select_by_time(sublist_by_time,O_LIMIT)
        # NOTE(review): items below are read as dicts ('timestamp', 'username',
        # 'profile_image_url', ...) although sublist_by_time holds tuples, so
        # select_by_time/select_by_media apparently return enriched dict
        # records — confirm against their implementations.
        final_list_by_media =select_by_media(sublist_by_time, M_LIMIT)
        print 'final_list_by_media'
        for item in final_list_by_media:
            print 'timestamp', ts2datetime_full(item['timestamp'])
            print 'reposts_count',item['reposts_count']
            print 'user',item['user']
            print 'domain',uid2domain(item['user'])
            print 'source',item['source']
            print 'text',item['text']
            print 'comments_count', item['comments_count']
            print 'geo', item['geo']
            print 'username', item['username']
            print 'profile_image_url', item['profile_image_url']
        print 'final_list_by_time'
        for item in final_list_by_time:
            print 'timestamp', ts2datetime_full(item['timestamp'])
            print 'reposts_count',item['reposts_count']
            print 'user',item['user']
            print 'source',item['source']
            print 'text',item['text']
            print 'comments_count', item['comments_count']
            print 'geo', item['geo']
            print 'username', item['username']
            print 'profile_image_url', item['profile_image_url']
        top_city_list = top_city(city_dict)
        '''
        for city in top_city_list:
            print 'top_city_list',city
        '''
        top_keywords_list = top_keywords(results2, top = K_LIMIT)
        '''
        print 'top_keywords'
        for keyword in top_keywords_list:
            print 'keyword',keyword[0].decode('utf-8').encode('utf-8'),keyword[1]
        '''
        save_rt_results(topic, count, user_count, time_list,\
            top_city_list, top_keywords_list, final_list_by_time, final_list_by_media)
def community_result(community_user_list, topic, date, windowsize):
    '''Collect weibo details, top keywords and the sentiment distribution for
    a community of users within a topic's time window.

    Parameters:
        community_user_list: iterable of user ids (anything int() accepts).
        topic: topic name used to select the xapian index.
        date: end date in the format datetime2ts expects.
        windowsize: number of days before `date` to include.

    Returns:
        (sort_community_info, sort_top_keyword, new_sentiment_list, query_dict),
        or four Nones when no weibo matched.
    '''
    end_ts = datetime2ts(date)
    start_ts = end_ts - windowsize * Day
    xapian_search_weibo = getXapianWeiboByTopic(topic, start_ts, end_ts)
    # Match any weibo authored by a community member.
    query_dict = {'$or': [{'user': int(uid)} for uid in community_user_list]}
    count, weibo_results = xapian_search_weibo.search(query=query_dict, fields=weibo_fields_list)
    if count == 0:
        # BUG FIX: the success path returns a 4-tuple; returning only three
        # Nones here broke callers that unpack four values.
        return None, None, None, None
    community_info = []
    sentiment_count = {}
    for weibo in weibo_results():
        uid = weibo['user']
        _id = weibo['_id']
        result = user_search.search_by_id(uid, fields=user_fields_list)
        if result:
            name = result['name']
            location = result['location']
            friends_count = result['friends_count']
            followers_count = result['followers_count']
            created_at = result['created_at']
            statuses_count = result['statuses_count']
            profile_image_url = result['profile_image_url']
        else:
            # Unknown user: placeholder values (u'未知' means "unknown").
            name = u'未知'
            location = u'未知'
            friends_count = u'未知'
            followers_count = u'未知'
            created_at = u'未知'
            statuses_count = u'未知'
            profile_image_url = u'no'
        text = weibo['text']
        timestamp = weibo['timestamp']
        # Renamed from `date` to stop shadowing the function parameter.
        weibo_date = ts2date(timestamp)
        reposts_count = weibo['reposts_count']
        source = weibo['source']
        geo = weibo['geo']
        comments_count = weibo['comments_count']
        sentiment = weibo['sentiment']
        sentiment_name = emotions_kv[sentiment]
        weibo_link = weiboinfo2url(uid, _id)
        domain = uid2domain(uid)
        try:
            sentiment_count[sentiment] += 1
        except KeyError:
            sentiment_count[sentiment] = 1
        community_info.append([
            _id, name, location, friends_count, followers_count, created_at,
            statuses_count, profile_image_url, text, weibo_date, reposts_count,
            source, geo, comments_count, sentiment_name, weibo_link, domain
        ])
    # Sort by reposts count (index 10).
    sort_community_info = sorted(community_info, key=lambda x: x[10], reverse=True)
    mset = xapian_search_weibo.search(query=query_dict, max_offset=50, mset_direct=True)
    top_keyword = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=50)
    sort_top_keyword = sorted(top_keyword, key=lambda x: x[1], reverse=True)
    new_sentiment_list = []
    for sentiment in sentiment_count:
        sentiment_ch = emotions_kv[int(sentiment)]
        num = sentiment_count[sentiment]
        ratio = float(num) / float(count)
        new_sentiment_list.append([sentiment_ch, num, ratio])
    return sort_community_info, sort_top_keyword, new_sentiment_list, query_dict
def get_info(neighbor_list, topic, date, windowsize):
    '''Collect weibo details, top keywords and the sentiment distribution for
    a list of neighbor users within a topic's time window.

    Parameters:
        neighbor_list: iterable of user ids (anything int() accepts).
        topic: topic name used to select the xapian index.
        date: end date in the format datetime2ts expects.
        windowsize: number of days before `date` to include.

    Returns:
        (sort_neighbor_info, sort_top_keyword, new_sentiment_list, query_dict),
        or four Nones when no weibo matched.
    '''
    end_ts = datetime2ts(date)
    start_ts = end_ts - windowsize * Day
    xapian_search_weibo = getXapianWeiboByTopic(topic, start_ts, end_ts)
    # Match any weibo authored by a neighbor.
    query_dict = {'$or': [{'user': int(uid)} for uid in neighbor_list]}
    count, weibo_results = xapian_search_weibo.search(query=query_dict, fields=weibo_fields_list)
    if count == 0:
        # BUG FIX: the success path returns a 4-tuple; returning only three
        # Nones here broke callers that unpack four values.
        return None, None, None, None
    neighbor_info = []
    sentiment_count = {}
    for weibo in weibo_results():
        uid = weibo['user']
        _id = weibo['_id']
        result = user_search.search_by_id(uid, fields=user_fields_list)
        if result:
            name = result['name']
            location = result['location']
            friends_count = result['friends_count']
            followers_count = result['followers_count']
            created_at = result['created_at']
            statuses_count = result['statuses_count']
            profile_image_url = result['profile_image_url']
        else:
            # Unknown user: placeholder values (u'未知' means "unknown").
            name = u'未知'
            location = u'未知'
            friends_count = u'未知'
            followers_count = u'未知'
            created_at = u'未知'
            statuses_count = u'未知'
            profile_image_url = u'no'
        text = weibo['text']
        timestamp = weibo['timestamp']
        # Renamed from `date` to stop shadowing the function parameter.
        weibo_date = ts2date(timestamp)
        reposts_count = weibo['reposts_count']
        source = weibo['source']
        geo = weibo['geo']
        comments_count = weibo['comments_count']
        sentiment = weibo['sentiment']
        sentiment_name = emotions_kv[sentiment]
        weibo_link = weiboinfo2url(uid, _id)
        try:
            sentiment_count[sentiment] += 1
        except KeyError:
            sentiment_count[sentiment] = 1
        neighbor_info.append([
            _id, name, location, friends_count, followers_count, created_at,
            statuses_count, profile_image_url, text, weibo_date, reposts_count,
            source, geo, comments_count, sentiment_name, weibo_link, uid
        ])
    # Sort by reposts count (index 10).
    sort_neighbor_info = sorted(neighbor_info, key=lambda x: x[10], reverse=True)
    mset = xapian_search_weibo.search(query=query_dict, max_offset=50, mset_direct=True)
    top_keyword = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=50)
    sort_top_keyword = sorted(top_keyword, key=lambda x: x[1], reverse=True)
    new_sentiment_list = []
    for sentiment in sentiment_count:
        sentiment_ch = emotions_kv[int(sentiment)]
        num = sentiment_count[sentiment]
        ratio = float(num) / float(count)
        new_sentiment_list.append([sentiment_ch, num, ratio])
    return sort_neighbor_info, sort_top_keyword, new_sentiment_list, query_dict
def community_result(community_user_list, topic, date, windowsize):
    '''Collect weibo details, top keywords and the sentiment distribution for
    a community of users within a topic's time window.

    Parameters:
        community_user_list: iterable of user ids (anything int() accepts).
        topic: topic name used to select the xapian index.
        date: end date in the format datetime2ts expects.
        windowsize: number of days before `date` to include.

    Returns:
        (sort_community_info, sort_top_keyword, new_sentiment_list, query_dict),
        or four Nones when no weibo matched.
    '''
    end_ts = datetime2ts(date)
    start_ts = end_ts - windowsize * Day
    xapian_search_weibo = getXapianWeiboByTopic(topic, start_ts, end_ts)
    # Match any weibo authored by a community member.
    query_dict = {"$or": [{"user": int(uid)} for uid in community_user_list]}
    count, weibo_results = xapian_search_weibo.search(query=query_dict, fields=weibo_fields_list)
    if count == 0:
        # BUG FIX: the success path returns a 4-tuple; returning only three
        # Nones here broke callers that unpack four values.
        return None, None, None, None
    community_info = []
    sentiment_count = {}
    for weibo in weibo_results():
        uid = weibo["user"]
        _id = weibo["_id"]
        result = user_search.search_by_id(uid, fields=user_fields_list)
        if result:
            name = result["name"]
            location = result["location"]
            friends_count = result["friends_count"]
            followers_count = result["followers_count"]
            created_at = result["created_at"]
            statuses_count = result["statuses_count"]
            profile_image_url = result["profile_image_url"]
        else:
            # Unknown user: placeholder values (u"未知" means "unknown").
            name = u"未知"
            location = u"未知"
            friends_count = u"未知"
            followers_count = u"未知"
            created_at = u"未知"
            statuses_count = u"未知"
            profile_image_url = u"no"
        text = weibo["text"]
        timestamp = weibo["timestamp"]
        # Renamed from `date` to stop shadowing the function parameter.
        weibo_date = ts2date(timestamp)
        reposts_count = weibo["reposts_count"]
        source = weibo["source"]
        geo = weibo["geo"]
        comments_count = weibo["comments_count"]
        sentiment = weibo["sentiment"]
        sentiment_name = emotions_kv[sentiment]
        weibo_link = weiboinfo2url(uid, _id)
        domain = uid2domain(uid)
        try:
            sentiment_count[sentiment] += 1
        except KeyError:
            sentiment_count[sentiment] = 1
        community_info.append(
            [
                _id, name, location, friends_count, followers_count, created_at,
                statuses_count, profile_image_url, text, weibo_date, reposts_count,
                source, geo, comments_count, sentiment_name, weibo_link, domain,
            ]
        )
    # Sort by reposts count (index 10).
    sort_community_info = sorted(community_info, key=lambda x: x[10], reverse=True)
    mset = xapian_search_weibo.search(query=query_dict, max_offset=50, mset_direct=True)
    top_keyword = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=["terms"]), top=50)
    sort_top_keyword = sorted(top_keyword, key=lambda x: x[1], reverse=True)
    new_sentiment_list = []
    for sentiment in sentiment_count:
        sentiment_ch = emotions_kv[int(sentiment)]
        num = sentiment_count[sentiment]
        ratio = float(num) / float(count)
        new_sentiment_list.append([sentiment_ch, num, ratio])
    return sort_community_info, sort_top_keyword, new_sentiment_list, query_dict