def cal_forever(receiver, controller, poller, sender=None, fill_field_funcs=[]): count = 0 ts = time.time() tb = ts receive_kill = False # prepare item = receiver.recv_json() item_timestamp = item["timestamp"] now_db_no = get_now_db_no(item_timestamp) print "redis db no now", now_db_no global_profile_r = _default_redis(host=PROFILE_REDIS_HOST, port=PROFILE_REDIS_PORT, db=now_db_no) set_now_accepted_tsrange(item_timestamp) while 1: evts = poller.poll(XAPIAN_ZMQ_POLL_TIMEOUT) if evts: socks = dict(poller.poll(XAPIAN_ZMQ_POLL_TIMEOUT)) elif receive_kill and time.time() - tb > XAPIAN_ZMQ_WORK_KILL_INTERVAL: """ 定期kill,可以记录work开启的时间 然后收到kill的时候判断一下当前时间减去work开启的时间 是否超过某个阈值,是则执行kill操作 配套的prod模式下,应该在每隔XAPIAN_ZMQ_WORK_KILL_INTERVAL新开work """ print 'receive "KILL", worker stop, cost: %ss' % (time.time() - tb) break else: socks = None if socks and socks.get(receiver) == zmq.POLLIN: item = receiver.recv_json() if fill_field_funcs: for func in fill_field_funcs: item = func(item) item_timestamp = item["timestamp"] now_a_start_ts, now_a_end_ts = get_now_accepted_tsrange() if int(item_timestamp) < now_a_start_ts or int(item_timestamp) >= now_a_end_ts: # 超出接受范围,抛弃该条微博 continue new_db_no = get_now_db_no(item_timestamp) if new_db_no != now_db_no: now_db_no = new_db_no print "redis db no now", now_db_no global_profile_r = _default_redis(db=now_db_no) set_now_accepted_tsrange(item_timestamp) realtime_profile_keywords_cal(item, global_profile_r) count += 1 if count % XAPIAN_FLUSH_DB_SIZE == 0: te = time.time() cost = te - ts ts = te print "[%s] total profile calc: %s, %s sec/per %s" % ( datetime.now().strftime("%Y-%m-%d %H:%M:%S"), count, cost, XAPIAN_FLUSH_DB_SIZE, ) # Any waiting controller command acts as 'KILL' if socks and socks.get(controller) == zmq.POLLIN: controller.recv() receive_kill = True
# Advance the "last completed 15-minute unit" watermark and run the batch
# calculations for the unit it now points at.  (Fragment: the enclosing
# def and the tail of the elif branch are outside this chunk.)
last_complete_start_ts = global_r0.get(LAST_COMPLETE_START_TS)
if last_complete_start_ts:
    last_complete_start_ts = int(last_complete_start_ts)
    print 'last_complete_start_ts', last_complete_start_ts
    # Normal case: last_complete_start_ts is the start of the now_db-3 (or
    # an even earlier) unit, so fetch the data of the time window right
    # after last_complete_start_ts, moving forward one unit per pass.
    if last_complete_start_ts <= now_db_start_ts - 60 * 15 * 3:
        # advance last_complete_start_ts by one 15-minute unit
        last_complete_start_ts += 60 * 15
        global_r0.set(LAST_COMPLETE_START_TS, last_complete_start_ts)
        # run the calculations for the newly completed unit
        end_ts = last_complete_start_ts + 60 * 15
        now_db_no = get_now_db_no(last_complete_start_ts)
        r = _default_redis(db=now_db_no)
        calc_sentiment()
        calc_profile()
        clear_current_redis()
    # When last_complete_start_ts reaches 23:15, process the remaining two
    # 15-minute units of the day as well.
    elif last_complete_start_ts < now_db_start_ts and (datetime.datetime.fromtimestamp(last_complete_start_ts).strftime("%H:%M:%S") == '23:15:00' or datetime.datetime.fromtimestamp(last_complete_start_ts).strftime("%H:%M:%S") == '23:30:00'):
        # advance last_complete_start_ts by one 15-minute unit
        last_complete_start_ts += 60 * 15
        global_r0.set(LAST_COMPLETE_START_TS, last_complete_start_ts)
        # run the calculations (continues past this chunk boundary)
        end_ts = last_complete_start_ts + 60 * 15
        now_db_no = get_now_db_no(last_complete_start_ts)
        r = _default_redis(db=now_db_no)
# print uid keywords_with_count = r.zrange(USER_KEYWORDS % uid, 0, -1, withscores=True) daily_profile_keywords_bucket.Put(str(uid), zlib.compress(pickle.dumps(keywords_with_count, pickle.HIGHEST_PROTOCOL), zlib.Z_BEST_COMPRESSION)) cursor, members = r.sscan(USER_SET, cursor=cursor, count=10000) def get_now_leveldb_no(): local_ts = time.time() - time.timezone return int(local_ts) % (24 * 60 * 60) / (15 * 60) + 1 def get_now_datestr(): return datetime.datetime.now().strftime("%Y%m%d") if __name__ == '__main__': # init redis now_db_no = get_now_db_no() print "redis db no now", now_db_no r = _default_redis(db=now_db_no) # init leveldb now_datestr = get_now_datestr() now_leveldb_no = get_now_leveldb_no() print "leveldb no now", now_leveldb_no, now_datestr daily_profile_keywords_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, './keywords/linhao_profile_keywords_%s_%s' % (now_datestr, now_leveldb_no)), block_cache_size=8 * (2 << 25), write_buffer_size=8 * (2 << 25)) profile_keywords_redis2leveldb()