class IPLoaderFromMG(IPLoader):
    """Load IP addresses that still need locating into ``IPContainer``.

    A background thread polls the ``UserIP`` collection of the ``jh``
    database and pushes the ``_id`` of every matching document into
    ``self.IPContainer`` (a ``StackSet``).  ``iter`` hands the queued
    values out one at a time.
    """

    # Polling pauses while at least this many IPs are waiting to be consumed.
    MAX_PENDING = 10000
    # Only documents whose "timestamp" lies within this many seconds are read.
    LOOKBACK_SECONDS = 5 * 60
    # Seconds ``iter`` waits before retrying when the container is empty.
    EMPTY_WAIT_SECONDS = 10

    def __init__(self):
        self.client = PyMongoClient()
        self.IPContainer = StackSet()

    def load(self):
        """Start ``load_thread`` in a daemon thread and return immediately."""
        worker = threading.Thread(
            target=self.load_thread, name="Thread_LoadIPFromMG")
        # Attribute form of the older setDaemon() call.
        worker.daemon = True
        worker.start()

    def load_thread(self, once_sleep=30):
        """Poll Mongo forever, topping up ``IPContainer``.

        :param once_sleep: seconds to sleep between polls, and also while
            the container already holds ``MAX_PENDING`` or more entries.
        """
        logger.info("IP Loader Starting.......")
        total_read = 0
        while True:
            if self.IPContainer.size() >= self.MAX_PENDING:
                time.sleep(once_sleep)
                continue
            size_before = self.IPContainer.size()
            try:
                cursor = self.client.find(
                    "jh", "UserIP", {
                        "timestamp": {
                            "$gte": time.time() - self.LOOKBACK_SECONDS
                        },
                        # Documents without a "city" field -- presumably the
                        # ones not located yet (TODO confirm).
                        "city": {
                            "$exists": False
                        }
                    })
                for item in cursor:
                    self.IPContainer.push(item["_id"])
                    total_read += 1
            except Exception:
                # An uncaught error here used to end the daemon thread for
                # good, leaving iter() waiting on an empty container forever.
                logger.exception("IP Loader poll failed; will retry")
                time.sleep(once_sleep)
                continue
            size_after = self.IPContainer.size()
            logger.info(
                "Total Read IP: %s, IPContainer has ip: %d, load ip: %d",
                total_read, size_after, size_after - size_before)
            time.sleep(once_sleep)

    def iter(self):
        """Yield queued IPs endlessly, waiting whenever the container is empty.

        An ``IndexError`` raised by ``IPContainer.pop()`` is treated as
        "empty": a warning is logged and the pop is retried after
        ``EMPTY_WAIT_SECONDS``.
        """
        while True:
            try:
                item = self.IPContainer.pop()
            except IndexError:
                logger.warning("IPContainer is empty!")
                time.sleep(self.EMPTY_WAIT_SECONDS)
                continue
            yield item
def usetimeDistribute(num, appkey="BIQU_ANDROID", delta=120):
    """Estimate per-user active time from the ``uvfile`` of one day.

    For every ``uvfile`` document whose ``tm`` equals the date ``num`` days
    before now, the ``opatm`` time strings ("%H:%M:%S") recorded under the
    "action", "page", "in" and "end" entries of ``item_count`` are
    de-duplicated and sorted.  The gaps between consecutive times are then
    computed, and the gaps not larger than ``delta`` seconds are summed.

    Diagnostic tuples are printed for each user, followed by the number of
    users whose ``item_add["end"]`` is >= 600 and the number below it.

    :param num: how many days back the inspected day lies (0 = today).
    :param appkey: database name passed to ``PyMongoClient.find``.
    :param delta: largest gap, in seconds, still counted as active time.
    :returns: dict mapping ``jhd_userkey`` to its summed active seconds;
        users whose sum is 0 are omitted.  (The mapping used to be built
        and then discarded; returning it is the only interface change.)
    """
    # Today's date is only an anchor for turning "%H:%M:%S" into epoch
    # seconds; only differences between the resulting stamps are used.
    curday = datetime.datetime.today().strftime("%Y%m%d")
    dayStr = time.strftime("%Y-%m-%d",
                           time.localtime(time.time() - 86400 * num))
    client = PyMongoClient()
    opas = ("action", "page", "in", "end")
    result = {}
    long_end, short_end = 0, 0
    for item in client.find(appkey, "uvfile", {"tm": dayStr}):
        uid = item["jhd_userkey"]
        end_sum = item["item_add"].get("end", 0)
        counts = item["item_count"]
        # Unique operation times across all four operation kinds, ascending.
        opatms = sorted(set(
            opatm
            for opa in opas
            for opatm in counts.get(opa, {}).get("opatm", [])))
        stamps = [
            int(time.mktime(
                time.strptime(curday + opatm, "%Y%m%d%H:%M:%S")))
            for opatm in opatms
        ]
        # Leading 0 keeps the list non-empty, as the original did.
        tmp = [0] + [
            later - earlier for earlier, later in zip(stamps, stamps[1:])
        ]
        total_opatm = sum(gap for gap in tmp if gap <= delta)

        print(uid, end_sum, total_opatm, tmp)
        print(uid, end_sum, total_opatm, opatms)
        if end_sum >= 600:
            long_end += 1
            # The original printed the leaked comprehension variable ``i``,
            # which on Python 2 held the last element of ``tmp``.
            print(tmp[-1], end_sum)
        else:
            short_end += 1

        if total_opatm != 0:
            result.setdefault(uid, total_opatm)
    print(long_end, short_end)
    return result
class UserActiveWriter(ModeWriter):
    """Write one ``UserActive`` record per ``UserProfile`` document per day.

    Each record carries an ``active`` list to which a 1 or 0 is appended
    for the processed day, continuing the list found in the previous
    day's record for the same user.
    """

    def __init__(self):
        self.client = PyMongoClient()
        self.modename = "UserActive"

    def setClient(self, client):
        """Replace the Mongo client used by this writer."""
        self.client = client

    def remove(self, appkey, modename, tm):
        """Remove the ``UserActive`` records of one partition date.

        :param appkey: database name.
        :param modename: ignored; ``self.modename`` is always used.
        :param tm: day as "YYYY-MM-DD" or "YYYYMMDD".
        """
        modename = self.modename
        # Parse and re-format so the stored key is always plain "YYYYMMDD".
        tm = time.strftime("%Y%m%d",
                           time.strptime(tm.replace("-", ""), "%Y%m%d"))
        self.client.remove(appkey, modename, {"partition_date": tm})

    def write(self, data, appkey, modename, modetools, *args, **kwargs):
        """Build and upsert the ``UserActive`` records for one day.

        :param data: not used by this writer.
        :param appkey: database name.
        :param modename: ignored; ``self.modename`` is always used.
        :param modetools: not used by this writer.
        :param kwargs: ``today`` ("YYYY-MM-DD" or "YYYYMMDD") selects the
            day; when absent, the day before the current local time is used.

        A profile that fails to process is reported on stdout/stderr and
        skipped.  If the bulk write fails, every operation is retried on
        its own.
        """
        import traceback

        started = time.time()
        modename = self.modename
        if "today" in kwargs:
            curDay = kwargs["today"].replace("-", "")
        else:
            curDay = time.strftime("%Y%m%d",
                                   time.localtime(time.time() - 86400))
        conn = self.client.getConn()
        userActiveCollection = conn[appkey][modename]
        docs = self.client.find(appkey, "UserProfile", {})
        yesterday = getDay(curDay, "%Y%m%d", -1)

        op = []
        # Separate timer: the original reused one variable for both figures,
        # so the final "cost seconds" silently excluded the setup above.
        scan_started = time.time()
        for doc in docs:
            try:
                key = doc["_id"]
                activelife = doc.get("activelife", [0])
                firstLoginTime = doc["firstLoginTime"][:8]
                login = getDayDelta(curDay, firstLoginTime) in activelife
                # Look up the user's record of the previous day.  This needs
                # an index to be fast:
                # db.UserActive.ensureIndex({partition_date: -1, jh_uid: 1})
                userActive = userActiveCollection.find_one({
                    "jh_uid": key,
                    "partition_date": yesterday
                })
                # Build today's record for the user.
                newUserActive = UserActiveBuilder()
                newUserActive.setJhdUid(key)
                newUserActive.setPartitionDate(curDay)
                flag = 1 if login else 0
                if userActive is None:
                    newUserActive.setActive([flag])
                else:
                    userActive["active"].append(flag)
                    newUserActive.setActive(userActive["active"])
                newUserActive.setFirstLoginTime(doc["firstLoginTime"])
                newUserActive.setLastLoginTime(doc["lastLoginTime"])
                op.append(
                    ReplaceOne({
                        "jh_uid": key,
                        "partition_date": curDay
                    }, newUserActive.builder(), upsert=True))
            except Exception:
                # print_exc() writes the traceback itself and returns None,
                # so it is no longer wrapped in print().
                traceback.print_exc()
                print(doc)
        print("find cost time: %d" % int(time.time() - scan_started))

        try:
            if op:
                userActiveCollection.bulk_write(op)
        except Exception:
            traceback.print_exc()
            print(
                "Warn: bulkStore 'UserActive' Rise a error; Switch to Single Mode"
            )
            for op_item in op:
                try:
                    userActiveCollection.bulk_write([op_item])
                except Exception:
                    traceback.print_exc()
        finallyMask(appkey, modename, self.client)
        print("UserActiveWriter cost seconds %.10f" %
              ((time.time() - started), ))
class UserProfileWriter(ModeWriter):
    """Merge incoming per-user profile data into the ``UserProfile`` collection."""

    def __init__(self, mongo_id=1):
        self.client = PyMongoClient(mongo_id=mongo_id)
        self.conn = self.client.getConn()
        self.modename = "UserProfile"

    def setClient(self, client):
        """Replace the Mongo client and refresh the cached connection."""
        self.client = client
        self.conn = self.client.getConn()

    def remove(self, appkey, modename, tm):
        """Do nothing; this writer never removes profiles."""
        pass

    def write(self, data, appkey, modename, modetools, *args, **kwargs):
        """Merge ``data`` with the stored profiles and upsert the result.

        :param data: dict mapping user key (the profile ``_id``) to the
            incoming profile dict; it is updated in place.
        :param appkey: database name.
        :param modename: ignored; ``self.modename`` is always used.
        :param modetools: object providing ``mergeUserProfile(new, old)``.
        :param kwargs: must contain ``today``.

        ``activelife`` holds day offsets from the first login day, and
        ``activelifeabs`` holds the same days as offsets from 2016-01-01.
        A user that fails to process is reported and skipped.
        """
        import traceback

        started = time.time()
        # Not used below; kept so that a call without "today" still fails
        # with KeyError exactly as before.
        curDay = kwargs["today"].replace("-", "")
        modename = self.modename
        uids = list(data)
        docs = self.client.find(appkey, modename, {"_id": {"$in": uids}})

        oldusers = set()
        for doc in docs:
            try:
                # The UserProfile _id is the user key.
                key = doc["_id"]
                oldusers.add(key)
                incoming = data[key]

                # Reset for every user.  The original assigned this only
                # when the field was present, so the previous user's value
                # (or a NameError on the first user) leaked in.
                incoming_last_day = None
                if "lastLoginTime" in incoming:
                    incoming_last_day = incoming["lastLoginTime"][:8]

                # If the incoming data is older than the stored first login,
                # shift the stored offsets to the new, earlier origin.
                if "firstLoginTime" in doc and "firstLoginTime" in incoming:
                    activelife = doc.get("activelife", [0])
                    first_new = incoming["firstLoginTime"][:8]
                    first_old = doc["firstLoginTime"][:8]
                    if first_new < first_old:
                        shift = getDayDelta(first_old, first_new)
                        doc["activelife"] = [
                            offset + shift for offset in activelife
                        ]

                merged = modetools.mergeUserProfile(incoming, doc)
                data[key] = merged

                # Build the user's life-cycle data.
                firstLoginDay = merged["firstLoginTime"][:8]
                lastLoginDay = merged["lastLoginTime"][:8]
                # Prefer the incoming day.  Falling back to the merged last
                # login when it is absent is a reviewer assumption --
                # NOTE(review): confirm against mergeUserProfile.
                if incoming_last_day is not None:
                    active_day = incoming_last_day
                else:
                    active_day = lastLoginDay
                dayDelta = getDayDelta(active_day, firstLoginDay)
                # Profiles written by older versions may lack the field.
                merged.setdefault("activelife", [0])
                if dayDelta not in merged["activelife"]:
                    merged["activelife"].append(dayDelta)
                    merged["activelife"].sort()

                # Absolute activity offsets, counted from 2016-01-01.
                try:
                    base = getDayDelta(firstLoginDay, "20160101")
                    merged["activelifeabs"] = [
                        base + remain_day
                        for remain_day in merged["activelife"]
                    ]
                except Exception:
                    traceback.print_exc()
            except Exception:
                traceback.print_exc()

        # Users with no stored profile: record the version they arrived with.
        for key in set(uids) - oldusers:
            data[key]["comever"] = data[key]["ver"]
            # Absolute activity offsets, counted from 2016-01-01.  The last
            # login day serves as the first login day here, as before.
            try:
                firstLoginDay = data[key]["lastLoginTime"][:8]
                base = getDayDelta(firstLoginDay, "20160101")
                data[key]["activelifeabs"] = [
                    base + remain_day
                    for remain_day in data[key]["activelife"]
                ]
            except Exception:
                traceback.print_exc()

        op = [ReplaceOne({"_id": key}, data[key], True) for key in data]
        try:
            if op:
                self.client.bulkWrite(appkey, modename, op)
        except Exception:
            print(
                "Warn: bulkStore 'UserProfile' Rise a error; Switch to Single Mode"
            )
            try:
                replace_onebyone(data, appkey, modename, self.client)
            except Exception:
                traceback.print_exc()
        finallyMask(appkey, modename, self.client)
        print("UserProfileWriter cost seconds %.3f" %
              ((time.time() - started), ))
class UserCrumbsWriter(ModeWriter):
    """Merge per-user crumbs into the ``uvfile`` collection.

    Besides merging with the documents already stored for the day, the
    writer attaches IP locations (``jhd_loc``) and recent-activity
    counters (``measure``) to each user's record.
    """

    # (counter name, window length in days) pairs used by getMeasure.
    _ACTIVE_WINDOWS = (
        ("last7ActiveNum", 7),
        ("last14ActiveNum", 14),
        ("last28ActiveNum", 28),
        ("last30ActiveNum", 30),
    )

    def __init__(self, mongo_id=1):
        self.client = PyMongoClient(mongo_id=mongo_id)
        self.conn = self.client.getConn()
        self.modename = "uvfile"
        self.attachmode_storers = []
        try:
            self.attachmode_storers = [UserIP()]
        except Exception:
            import traceback
            traceback.print_exc()

    def setClient(self, client):
        """Replace the Mongo client and refresh the cached connection."""
        self.client = client
        self.conn = self.client.getConn()

    def remove(self, appkey, modename, tm):
        """Remove the ``uvfile`` documents of one day.

        :param appkey: database name.
        :param modename: ignored; ``self.modename`` is always used.
        :param tm: day as "YYYY-MM-DD" or "YYYYMMDD".
        """
        modename = self.modename
        # Parse and re-format so the key is always "YYYY-MM-DD".
        tm = time.strftime("%Y-%m-%d",
                           time.strptime(tm.replace("-", ""), "%Y%m%d"))
        self.client.remove(appkey, modename, {"tm": tm})

    def getMeasure(self, activelifeabs, fix_deltaday):
        """Count active days inside the trailing 7/14/28/30-day windows.

        :param activelifeabs: iterable of absolute day offsets.
        :param fix_deltaday: offset of the reference day on the same scale;
            offsets after it are ignored.
        :returns: dict with keys ``last7ActiveNum``, ``last14ActiveNum``,
            ``last28ActiveNum`` and ``last30ActiveNum``.
        """
        ages = [
            fix_deltaday - day for day in activelifeabs
            if day <= fix_deltaday
        ]
        measure = {}
        for name, span in self._ACTIVE_WINDOWS:
            # A window of `span` days covers ages 0 .. span - 1.
            measure[name] = sum(1 for age in ages if age <= span - 1)
        return measure

    def write(self, data, appkey, modename, modetools, *args, **kwargs):
        """Merge, enrich and upsert the crumbs in ``data``.

        :param data: dict mapping user key to the incoming crumbs dict;
            it is updated in place.
        :param appkey: database name.
        :param modename: ignored; ``self.modename`` is always used.
        :param modetools: object providing ``mergeUserCrumbs`` and
            ``formatList``.
        :param kwargs: must contain ``today`` ("YYYY-MM-DD" or "YYYYMMDD").
            ``ip_loc`` is added to it before ``store_attachmode`` is called.
        """
        import traceback

        modename = self.modename
        today = kwargs["today"].replace("-", "")
        uids = list(data)
        # The original derived this by going back one day and forward again;
        # formatting the parsed day directly yields the same date.
        yyyy_mm_dd = time.strftime("%Y-%m-%d",
                                   time.strptime(today, "%Y%m%d"))
        uvfile = self.client.find(
            appkey, "uvfile",
            OrderedDict([("tm", yyyy_mm_dd), ("jhd_userkey", {
                "$in": uids
            })]))
        user_profile = self.client.find(appkey, "UserProfile",
                                        {"_id": {
                                            "$in": uids
                                        }})

        # Resolve every incoming IP to {"prov": ..., "city": ...}.
        ips = set()
        ip_loc = {}
        try:
            for uid in data:
                # A record without "jhd_ip" used to raise inside union() and
                # abort the lookup for the whole batch.
                ips.update(data[uid].get("jhd_ip") or ())
            ip_loc_cur = self.conn["jh"]["UserIP"].find(
                OrderedDict([("_id", {
                    "$in": list(ips)
                }), ("province", {
                    "$exists": True
                }), ("city", {
                    "$exists": True
                })]), {
                    "province": True,
                    "city": True
                })
            for item in ip_loc_cur:
                province = item["province"]
                if not province:
                    continue
                # An empty city falls back to the province name.
                city = item["city"] or province
                ip_loc.setdefault(item["_id"], {
                    "prov": province,
                    "city": city
                })
        except Exception:
            traceback.print_exc()

        # Merge with the documents already stored for the day.
        # NOTE(review): locations are attached only to users that already
        # have a stored document; the flattened source leaves it unclear
        # whether that nesting is intended -- confirm.
        for doc in uvfile:
            uid = doc["jhd_userkey"]
            data[uid] = modetools.mergeUserCrumbs(doc, data[uid])
            try:
                ip_lis = data[uid]["jhd_ip"]
                known_locs = data[uid].setdefault("jhd_loc", [])
                for ip in ip_lis:
                    loc = ip_loc.get(ip, None)
                    if loc and loc not in known_locs:
                        known_locs.append(loc)
            except Exception:
                traceback.print_exc()

        # Attach first-login time and recent-activity counters.
        fix_deltaday = getDayDelta(today, "20160101")
        for doc in user_profile:
            key = doc["_id"]
            first_login = doc.get("firstLoginTime", "unknown")
            measure = self.getMeasure(
                doc.get("activelifeabs", []), fix_deltaday)
            measure["firstLoginTime"] = first_login
            data[key] = dict(
                data[key], firstLoginTime=first_login, measure=measure)

        op = []
        for key in data:
            if "_id" not in data[key]:
                data[key]["_id"] = ObjectId()
            op.append(
                ReplaceOne({"_id": data[key]["_id"]},
                           modetools.formatList(data[key]), True))
        try:
            if op:
                self.client.bulkWrite(appkey, modename, op)
        except Exception:
            print(
                "Warn: bulkStore 'uvfile' Rise a error; Switch to Single Mode")
            try:
                replace_onebyone(data, appkey, modename, self.client)
            except Exception:
                traceback.print_exc()
        finallyMask(appkey, modename, self.client)

        try:
            kwargs["ip_loc"] = ip_loc
            # store_attachmode is not defined in this class; presumably it
            # is inherited from ModeWriter (TODO confirm).
            self.store_attachmode(data, appkey, modename, modetools, *args,
                                  **kwargs)
        except Exception:
            traceback.print_exc()