def cal_topic_sentiment_by_date(topic, datestr, duration):
    """Run the daily (one-Day window) sentiment cron for `topic` over the
    weibo xapian index of the day given by `datestr` ('YYYY-MM-DD')."""
    day_begin = datetime2ts(datestr)
    day_end = day_begin + Day
    # xapian index names use the compact 'YYYYMMDD' form
    searcher = getXapianWeiboByDate(datestr.replace('-', ''))
    if not searcher:
        return
    sentimentCronTopic(topic, searcher,
                       start_ts=day_begin, over_ts=day_end, during=duration)
def read_xapian(date): # init leveldb print 'init leveldb' dailycount_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, 'yuanshi_daily_count_%s' % date), block_cache_size=8 * (2 << 25), write_buffer_size=8 * (2 << 25)) print 'init xapian weibo' # init xapian weibo datestr = date.replace('-', '') xapian_search_weibo = getXapianWeiboByDate(datestr) if not xapian_search_weibo: return 'wrong' # iter weibos = xapian_search_weibo.iter_all_docs(fields=['user', 'reposts_count', 'comments_count', 'attitudes_count']) # cal count = 0 te = ts = time.time() for weibo in weibos: if count % 10000 == 0: te = time.time() print count, '%s sec' % (te - ts), 'identify weibo calc to leveldb ', date ts = te count += 1 try: reposts_count = int(weibo['reposts_count']) except: reposts_count = 0 try: comments_count = int(weibo['comments_count']) except: comments_count = 0 try: attitudes_count = int(weibo['attitudes_count']) except: attitudes_count = 0 uid = weibo['user'] try: active, important, follower, domain = dailycount_bucket.Get(str(uid)).split('_') active = int(active) important = int(important) active += 1 important += reposts_count + comments_count + attitudes_count except KeyError: active = 1 important = reposts_count + comments_count + attitudes_count domain = user2domainFromLeveldb(uid) follower = user2FollowersCount(uid) dailycount_bucket.Put(str(uid), str(active) + '_' + str(important) + '_' + str(follower) + '_' + str(domain)) return 'Done'
def cal_topic_sentiment_by_date(topic, datestr, duration):
    """Run the sentiment cron for `topic` over a single fifteen-minute
    window starting at midnight of `datestr` ('YYYY-MM-DD')."""
    window_start = datetime2ts(datestr)
    window_end = window_start + Fifteenminutes
    compact_date = datestr.replace('-', '')  # 'YYYYMMDD' index name
    searcher = getXapianWeiboByDate(compact_date)
    if searcher:
        sentimentCronTopic(topic, searcher,
                           start_ts=window_start,
                           over_ts=window_end,
                           during=duration)
def topic_not_calc(): topics = _topic_not_calc() for topic in topics: query = topic.topic end_ts = topic.end during = topic.range start_ts = end_ts - during start_datestr = ts2datetime(start_ts) end_datestr = ts2datetime(end_ts) ts_1 = datetime2ts(start_datestr) ts_2 = datetime2ts(end_datestr) days = (ts_2 - ts_1) / Day date_list = [] for i in range(0, days): datestr = datetime.date.fromtimestamp(ts_1 + i * Day).isoformat() date_list.append(datestr) datestr = datestr.replace('-', '') xapian_search_weibo = getXapianWeiboByDate(datestr) print xapian_search_weibo if xapian_search_weibo: sentimentRealTimeTopic(xapian_search_weibo, query, ts_1 + i * Day, ts_1 + (i + 1) * Day)
now_datestr = sys.argv[1] # datestr as '20130921' daily_identify_aifd_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, 'yuanshi_daily_count_%s' % now_datestr), block_cache_size=8 * (2 << 25), write_buffer_size=8 * (2 << 25)) # check if xapian data is ready xapian_stub_file = '%s%s' % (DYNAMIC_XAPIAN_WEIBO_STUB_PATH, now_datestr) while 1: if os.path.isfile(xapian_stub_file): print '%s xapian data stub file is prepared' % now_datestr break else: print '%s xapian data stub file is not prepared' % now_datestr time.sleep(60) # init xapian weibo xapian_search_weibo = getXapianWeiboByDate(now_datestr) # init leveldb try: shutil.rmtree(os.path.join(LEVELDBPATH, 'yuanshi_daily_count_%s' % now_datestr)) except: pass daily_identify_aifd_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, 'yuanshi_daily_count_%s' % now_datestr), block_cache_size=8 * (2 << 25), write_buffer_size=8 * (2 << 25)) try: os.mkdir(os.path.join(LEVELDBPATH, 'linhao_user2followers_identify_r_%s' % now_datestr)) except: pass copytree(os.path.join(LEVELDBPATH, 'yuanshi_daily_user_followers'), \ os.path.join(LEVELDBPATH, 'linhao_user2followers_identify_r_%s' % now_datestr))
print r['text'].encode('utf-8') print r['timestamp'] print r['terms'] print 'hits: %s' % count else: print 'no results' ''' #测试topics字段 datestr_list = ['20130902', '20130903', '20130904', \ '20130905', '20130906', '20130907'] # datestr_list = ['20130907'] k = 0 for datestr in datestr_list: s = getXapianWeiboByDate(datestr) count, results = s.search(query={'topics': [u'东盟', u'博览会']}, fields=['text'])#fields=fields_list) ''' f = open(datestr+'.txt', 'wb') for result in results(): save_line = result['text'].encode('utf-8') f.write(save_line+'\n') ''' print 'count:', count k = k+count print 'all_count:', k ''' stopic=u'中国' query_dict = {
def cal_field_sentiment_by_date(domainid, datestr, duration):
    """Calculate one full day of sentiment statistics for the user domain
    `domainid`, reading the weibo xapian index of `datestr` ('YYYY-MM-DD')."""
    begin = datetime2ts(datestr)
    searcher = getXapianWeiboByDate(datestr.replace('-', ''))
    sentiment_field(domainid, searcher,
                    start_ts=begin, over_ts=begin + Day, during=duration)
def cal_sentiment_kcount_by_date(datestr, duration):
    """Calculate one full day of per-sentiment keyword counts from the
    weibo xapian index of `datestr` ('YYYY-MM-DD')."""
    begin = datetime2ts(datestr)
    searcher = getXapianWeiboByDate(datestr.replace('-', ''))
    sentiment_keywords(searcher,
                       start_ts=begin, over_ts=begin + Day, during=duration)
def sentimentRealTimeTopic(query, start_ts, end_ts, save_fields=RESP_ITER_KEYS, during=Fifteenminutes, calc='all', w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT, sort_field=SORT_FIELD):
    """Realtime per-sentiment statistics for a topic over [start_ts, end_ts).

    Splits the range into windows of `during` seconds and, for each window
    and each sentiment value in `emotions_kv`, computes some combination of:
      - 'count'  : number of matching weibos
      - 'kcount' : top `k_limit` keywords of the matching weibos
      - 'weibos' : top `w_limit` weibos (sorted by `sort_field`)
    as selected by `calc` ('all', 'count', 'kcount' or 'weibos'), and
    persists each through save_rt_results().

    `query` is a comma-separated string of search terms; an empty/falsy
    query is a no-op. Returns early (None) if a window's xapian index is
    missing.
    """
    if query and query != '':
        start_ts = int(start_ts)
        over_ts = int(end_ts)
        # align the end of the range to a `during` boundary
        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during
        # iterate windows from the earliest (i = interval) to the latest (i = 1)
        for i in range(interval, 0, -1):
            emotions_count = {}
            emotions_kcount = {}
            emotions_weibo = {}
            begin_ts = over_ts - during * i
            # each window reads the xapian index of the day it starts in
            xapian_datestr = datetime.date.fromtimestamp(begin_ts).isoformat()
            xapian_search_weibo = getXapianWeiboByDate(xapian_datestr.replace('-', ''))
            if not xapian_search_weibo:
                # NOTE(review): a missing day index aborts ALL remaining
                # windows, not just this one — presumably intentional; confirm.
                return
            end_ts = begin_ts + during
            print begin_ts, end_ts, 'topic realtime %s starts calculate' % query.encode('utf-8')
            # match any of the comma-separated terms within this time window
            query_dict = {
                'timestamp': {'$gt': begin_ts, '$lt': end_ts},
                '$or': []
            }
            for term in query.strip().split(','):
                if term:
                    query_dict['$or'].append({'text': [term]})
            if calc == 'all':
                # one pass per sentiment value: count + top keywords + top weibos
                for k, v in emotions_kv.iteritems():
                    query_dict['sentiment'] = v
                    scount = xapian_search_weibo.search(query=query_dict, count_only=True)
                    # mset_direct lets the same match set feed both keyword
                    # and weibo extraction below
                    mset = xapian_search_weibo.search(query=query_dict, sort_by=[sort_field], \
                                                      max_offset=w_limit, mset_direct=True)
                    kcount = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=k_limit)
                    top_ws = top_weibos(gen_mset_iter(xapian_search_weibo, mset, fields=save_fields), top=w_limit)
                    emotions_count[v] = [end_ts, scount]
                    emotions_kcount[v] = [end_ts, kcount]
                    emotions_weibo[v] = [end_ts, top_ws]
                    print k, v, ', emotions count: ', emotions_count, ', keywords length: ', len(kcount), ', weibos count: ', len(top_ws)
                print 'save emotions count, keywords and weibo'
                save_rt_results('count', query, emotions_count, during)
                save_rt_results('kcount', query, emotions_kcount, during, klimit=k_limit)
                save_rt_results('weibos', query, emotions_weibo, during, wlimit=w_limit)
            elif calc == 'count':
                # counts only — no keyword/weibo extraction
                for k, v in emotions_kv.iteritems():
                    query_dict['sentiment'] = v
                    scount = xapian_search_weibo.search(query=query_dict, count_only=True)
                    emotions_count[v] = [end_ts, scount]
                save_rt_results('count', query, emotions_count, during)
            else:
                if calc == 'kcount':
                    for k, v in emotions_kv.iteritems():
                        query_dict['sentiment'] = v
                        count, get_results = xapian_search_weibo.search(query=query_dict, fields=RESP_ITER_KEYS, \
                                                                        sort_by=[SORT_FIELD], max_offset=w_limit)
                        kcount = top_keywords(get_results, top=k_limit)
                        emotions_kcount[v] = [end_ts, kcount]
                    # NOTE(review): limit is passed positionally here but as
                    # klimit=/wlimit= keywords in the 'all' branch — verify
                    # save_rt_results' 5th positional parameter matches.
                    save_rt_results('kcount', query, emotions_kcount, during, TOP_KEYWORDS_LIMIT)
                if calc == 'weibos':
                    for k, v in emotions_kv.iteritems():
                        query_dict['sentiment'] = v
                        count, get_results = xapian_search_weibo.search(query=query_dict, fields=RESP_ITER_KEYS, \
                                                                        sort_by=[sort_field], max_offset=w_limit)
                        top_ws = top_weibos(get_results, top=w_limit)
                        emotions_weibo[v] = [end_ts, top_ws]
                    save_rt_results('weibos', query, emotions_weibo, during, TOP_WEIBOS_LIMIT)
terms = cut(scws, _utf_encode(text), f='n') for term in terms: try: kcount = int(daily_profile_domain_keywords_db.Get(str(term))) daily_profile_domain_keywords_db.Put(str(term), str(kcount + 1)) except KeyError: daily_profile_domain_keywords_db.Put(str(term), str(1)) count += 1 if __name__ == '__main__': # init xapian weibo import sys batch_date_1 = sys.argv[1] # '20130905' xapian_search_weibo = getXapianWeiboByDate(batch_date_1) # seed_set = get_official_seed_set() scws = load_scws() # update person basics once a week # sharding = False # if sharding: # # mysqldb连接数据库 # try: # cobar_conn = MySQLdb.connect(host=COBAR_HOST, user=COBAR_USER, db='cobar_db_weibo', port=COBAR_PORT, charset='utf8') # print 'connection success' # except Exception, e: # print e # sys.exit()