# -*- coding: utf-8 -*-
# Module-level imports consolidated here; logging was originally imported
# inside each __init__, which would leave it unbound in the other methods.
# RedShift, RedisCache and RedisDBConfig are project-internal helpers,
# assumed importable from the surrounding package, e.g.:
# from myapi.db import RedShift, RedisCache, RedisDBConfig
import copy
import logging
import urllib
import urllib2

import implicit
import numpy as np
import pandas as pd
import redis
from scipy.sparse import coo_matrix
from sklearn import preprocessing


class RefreshFriend(object):
    """Rebuilds every user's friend set in Redis from Redshift."""

    def __init__(self):
        super(RefreshFriend, self).__init__()
        self.redshift = RedShift()
        self.userFriendPrex = 'rec_user_friend_'  # Redis key prefix for each user's friend set
        self.log_filename = "/data/mesh_push_service/meshapi/log/refresh_user_friend.txt"
        self.log_format = ' [%(asctime)s] %(message)s'  # log line format
        logging.basicConfig(format=self.log_format, datefmt='%Y-%m-%d %H:%M:%S %p',
                            level=logging.DEBUG, filename=self.log_filename, filemode='a')

    def loadFriendData(self):
        """Load friend relations from Redshift."""
        sql = "select userid,oppositeuserid from meshmatch_friend_prod;"
        sqlResult = self.redshift.getAll(sql)
        logging.debug('loadFriendData')
        return sqlResult

    def friendInit(self):
        """Preprocess: rebuild each user's friend set in Redis."""
        r = redis.Redis(host=RedisDBConfig.HOST, port=RedisDBConfig.PORT)
        p = r.pipeline()
        friend = self.loadFriendData()  # (userid, oppositeuserid) pairs
        # Clear the old sets first. The keys are prefixed, so delete the
        # prefixed keys (the original deleted the bare userid by mistake).
        for i in friend:
            p.delete(self.userFriendPrex + i[0])
            p.delete(self.userFriendPrex + i[1])
        p.execute()
        # Friendship is symmetric: add each user to the other's set.
        for i in friend:
            p.sadd(self.userFriendPrex + i[0], i[1])
            p.sadd(self.userFriendPrex + i[1], i[0])
        p.execute()  # the original never flushed the sadd commands
        logging.debug('write friend info to redis')
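# --- Usage sketch (not from the source): reading a friend set back. ---
# getFriends is a hypothetical helper for illustration; it reads the sets
# that friendInit() writes, using the same key prefix.
def getFriends(userid):
    r = redis.Redis(host=RedisDBConfig.HOST, port=RedisDBConfig.PORT)
    # smembers returns the whole friend set, e.g. set(['67890', ...])
    return r.smembers('rec_user_friend_' + userid)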
class ALSRec(object):
    """Trains implicit ALS factors on call/friend data and stores them in Redis."""

    def __init__(self):
        super(ALSRec, self).__init__()
        self.redshift = RedShift()
        self.redis = RedisCache()
        self.userLeft_factor = "rec_userLeft_factor"
        self.userRight_factor = "rec_userRight_factor"
        self.log_filename = "/data/mesh_push_service/meshapi/log/als_recommend.txt"
        self.log_format = ' [%(asctime)s] %(message)s'  # log line format
        logging.basicConfig(format=self.log_format, datefmt='%Y-%m-%d %H:%M:%S %p',
                            level=logging.DEBUG, filename=self.log_filename, filemode='a')

    def loadData(self):
        """Load call records from Redshift; duration is capped at 60000 ms."""
        sql = ("select fromuserid,touserid,"
               "case when duration>60000 then 60000 else duration end "
               "from meshmatch_event_prod "
               "where eventtype in ('LIVING_HANGUP','LIVING_USER_HANGUP');")
        sqlResult = self.redshift.getAll(sql)
        logging.debug('loadData')
        return sqlResult

    def loadFriendData(self):
        """Load friend pairs from Redshift; a friendship counts as a fixed 70000 score."""
        sql = "select userid,oppositeuserid,70000 as cnt from meshmatch_friend_prod;"
        sqlResult = self.redshift.getAll(sql)
        logging.debug('loadFriendData')
        return sqlResult

    def getPercentile(self, arr, l):
        """Compute the given percentiles of arr."""
        return [np.percentile(arr, i) for i in l]

    def preProcessData(self):
        """Merge friend and call data into a symmetric (id1, id2, score) list."""
        friend = self.loadFriendData()  # friend pairs
        living = self.loadData()        # call records
        raw = friend + living
        rawArray = np.array(raw, dtype=int)
        # Order each pair as (smaller id, larger id) so duplicates collapse.
        temp = []
        for l in rawArray:
            if l[0] < l[1]:
                temp.append([l[0], l[1], l[2]])
            else:
                temp.append([l[1], l[0], l[2]])
        # Group by the id pair and keep the maximum score.
        df = pd.DataFrame(temp, columns=['id1', 'id2', 'cnt'])
        dfGroupby = df.iloc[df.groupby(['id1', 'id2']).apply(lambda x: x['cnt'].idxmax())]
        scoreLeft = np.array(dfGroupby)
        # Mirror the id columns so the interaction matrix is symmetric.
        scoreRight = copy.deepcopy(scoreLeft)
        scoreRight[:, [0, 1]] = scoreRight[:, [1, 0]]
        score = np.concatenate((scoreLeft, scoreRight))
        logging.debug('preProcessData')
        return score

    def matrixData(self, score):
        """Build a sparse user-user interaction matrix from the score list."""
        data = pd.DataFrame(score, columns=['id1', 'id2', 'cnt'])
        data['id1'] = data['id1'].astype("category")
        data['id2'] = data['id2'].astype("category")
        living = coo_matrix((data['cnt'].astype(float),
                             (data['id1'].cat.codes.copy(),
                              data['id2'].cat.codes.copy())))
        return data, living

    def bm25_weight(self, X, K1=100, B=0.8):
        """Weighs each row of the sparse matrix of the data by BM25 weighting"""
        # calculate idf per term (user)
        X = coo_matrix(X)
        N = X.shape[0]
        idf = np.log(float(N) / (1 + np.bincount(X.col)))
        # calculate length_norm per document (here: per user row)
        row_sums = np.ravel(X.sum(axis=1))
        average_length = row_sums.mean()
        length_norm = (1.0 - B) + B * row_sums / average_length
        # weight matrix rows by bm25
        X.data = X.data * (K1 + 1.0) / (K1 * length_norm[X.row] + X.data) * idf[X.col]
        return X

    def saveToRedis(self, data, user1_factors, user2_factors):
        """Store each user's latent factor vectors in two Redis hashes."""
        l = list(set(np.array(data)[:, 0]))
        l.sort()
        for i in range(len(l)):
            print i
            self.redis.hset_hset(self.userLeft_factor, l[i], user1_factors[i].tostring())
            self.redis.hset_hset(self.userRight_factor, l[i], user2_factors[i].tostring())
        logging.debug('saveToRedis')

    def alsRec(self):
        score = self.preProcessData()
        data, living = self.matrixData(score)
        weighted = self.bm25_weight(living)
        print weighted.shape
        user1_factors, user2_factors = implicit.alternating_least_squares(weighted, factors=5)
        print "save to redis"
        self.saveToRedis(data, user1_factors, user2_factors)
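# --- Usage sketch (not from the source): consuming the stored ALS factors. ---
# The vectors above are serialized with numpy's tostring(), so a reader must
# decode them with np.fromstring and the matching dtype. scorePair, the direct
# redis.Redis connection, and the float64 dtype are assumptions here; check
# user1_factors.dtype if the scores look wrong.
def scorePair(leftUserid, rightUserid):
    r = redis.Redis(host=RedisDBConfig.HOST, port=RedisDBConfig.PORT)
    leftRaw = r.hget("rec_userLeft_factor", leftUserid)
    rightRaw = r.hget("rec_userRight_factor", rightUserid)
    if leftRaw is None or rightRaw is None:
        return None  # no trained factors for one of the users
    left = np.fromstring(leftRaw, dtype=np.float64)    # 5 latent factors
    right = np.fromstring(rightRaw, dtype=np.float64)
    # A larger dot product means a better predicted match.
    return float(np.dot(left, right))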
# Development-machine variant of Popular (note the /Users/holazhai log path);
# it blends Redshift behaviour features with Sensors Analytics data.
class Popular(object):
    """Computes a 0-5 popularity score per user and stores it in Redis."""

    def __init__(self):
        super(Popular, self).__init__()
        self.redshift = RedShift()
        self.userPopular = 'rec_popular'  # Redis hash holding each user's popularity score
        self.log_filename = "/Users/holazhai/Documents/IPython2/mesh_recommend/myapi/log/popular_recommend.txt"
        self.log_format = ' [%(asctime)s] %(message)s'  # log line format
        logging.basicConfig(format=self.log_format, datefmt='%Y-%m-%d %H:%M:%S %p',
                            level=logging.DEBUG, filename=self.log_filename, filemode='a')

    def loadData(self):
        """Load per-user behaviour features from Redshift."""
        sql = """\
select t1.userid,
       case when like_score>1 then 1 else like_score end as like_score,
       liked_score, conn_score, report_score,
       case when block_score>1 then 1 else block_score end as block_score,
       hangup_score, pay_score
from
  (select userid,
          max(like_count+2)*1.0/(max(conn_count)+5) as like_score,
          (log(max(liked_count)+1))*max(liked_count)*1.0/(max(like_count)+max(liked_count)+2.0)/(max(conn_count)+5) as liked_score,
          log(max(conn_count)+1)*(max(conn_avg)-10000) as conn_score,
          max(reportedcount)*1.0/(max(conn_count)+5) as report_score,
          max(blockcount)*1.0/(max(conn_count)+5) as block_score,
          (max(LIVING_HANGUP_10)+max(LIVING_USER_HANGUP_10))*1.0/(max(LIVING_USER_HANGUP)+max(LIVING_HANGUP)+4.0) as hangup_score
   from
     (select userid,reportedcount,blockcount,like_count,liked_count,conn_count,conn_avg
      from meshmatch_user_ext where conn_avg>0) m
   left join
     (select n.leftid,LIVING_HANGUP,LIVING_USER_HANGUP,LIVING_HANGUP_10,LIVING_USER_HANGUP_10
      from
        (select leftid,
                sum(case when eventtype=0 then cnt else 0 end) as LIVING_HANGUP,
                sum(case when eventtype=1 then cnt else 0 end) as LIVING_USER_HANGUP
         from (select leftid,eventtype,count(*) as cnt
               from (select distinct leftid,rightid,eventtype
                     from (select leftid,rightid,eventtype
                           from (select fromuserid as leftid,touserid as rightid,
                                        case when eventtype='LIVING_USER_HANGUP' then 0 else 1 end as eventtype
                                 from meshmatch_event_prod
                                 where eventtype in ('LIVING_HANGUP','LIVING_USER_HANGUP')) a
                           union all
                           select touserid as leftid,fromuserid as rightid,
                                  case when eventtype='LIVING_HANGUP' then 0 else 1 end as eventtype
                           from meshmatch_event_prod
                           where eventtype in ('LIVING_HANGUP','LIVING_USER_HANGUP')) b) c
               group by leftid,eventtype) d
         group by leftid) n
      left join
        (select leftid,
                sum(case when eventtype=0 then cnt else 0 end) as LIVING_HANGUP_10,
                sum(case when eventtype=1 then cnt else 0 end) as LIVING_USER_HANGUP_10
         from (select leftid,eventtype,count(*) as cnt
               from (select distinct leftid,rightid,eventtype
                     from (select leftid,rightid,eventtype
                           from (select fromuserid as leftid,touserid as rightid,
                                        case when eventtype='LIVING_USER_HANGUP' then 0 else 1 end as eventtype
                                 from meshmatch_event_prod
                                 where eventtype in ('LIVING_HANGUP','LIVING_USER_HANGUP') and duration<10000) a
                           union all
                           select touserid as leftid,fromuserid as rightid,
                                  case when eventtype='LIVING_HANGUP' then 0 else 1 end as eventtype
                           from meshmatch_event_prod
                           where eventtype in ('LIVING_HANGUP','LIVING_USER_HANGUP') and duration<10000) b) c
               group by leftid,eventtype) d
         group by leftid) p
      on n.leftid=p.leftid) q
   on m.userid=q.leftid
   group by userid) t1
left join
  (select distinct userid,1 as pay_score from meshmatch_payrecord_prod) t2
on t1.userid=t2.userid;"""
        sqlResult = self.redshift.getAll(sql)
        userid = [i[0] for i in sqlResult]
        rawArray = np.array(sqlResult, dtype=float)
        data = rawArray[:, 1:]
        df = pd.DataFrame(data, index=userid)
        #df2 = df.fillna(0)
        logging.debug('loadData')
        return df

    def loadSensorsData(self, url, val):
        """POST a query to the Sensors Analytics API; return a DataFrame indexed by mid."""
        data = urllib.urlencode(val)
        request = urllib2.Request(url, data)
        response = urllib2.urlopen(request)
        sensor = response.read()
        row = sensor.split('\n')
        colKeys = row[0].split('\t')  # header row (unused)
        midL = []
        cntL = []
        # Skip the header (row 0) and the trailing empty line.
        for i in range(1, len(row) - 1):
            col = row[i].split('\t')
            if len(col) == 2:
                midL.append(col[0])
                cntL.append(col[1])
        raw_data = {'cntL': cntL}
        df = pd.DataFrame(raw_data, index=midL, columns=['cntL'])
        return df

    def saveToRedis(self, userid, data, minResult, span):
        """Rescale scores to [0, 5] and write them to the Redis hash."""
        r = redis.Redis(host=RedisDBConfig.HOST, port=RedisDBConfig.PORT)
        p = r.pipeline()
        for i in range(len(userid)):
            if i % 1000 == 0:
                print i
                p.execute()  # flush the pipeline every 1000 writes
            p.hset(self.userPopular, userid[i], (data[i] - minResult) / span * 5.0)
        p.execute()
        logging.debug('saveToRedis')

    def popular(self):
        """Scoring logic."""
        url = 'http://sensors.sta.holaverse.com:8007/api/sql/query?token=5a14139d09916d172f9e99375ebdf78c0dc01bf14a8a2fbe55eeed45a9521bb1&project=tiki'
        val = {"q": "select mid,(sum(case when int_val=1 then cnt else 0.0 end)"
                    "+sum(case when int_val=3 then cnt else 0 end)+4.0)/(sum(cnt)+8.0) "
                    "from (select mid,int_val,count(int_val) as cnt from events "
                    "where event='M123' group by mid,int_val) a group by mid"}
        sensor = self.loadSensorsData(url, val)
        redShiftData = self.loadData()
        result = pd.concat([redShiftData, sensor], axis=1)
        df1 = result.fillna(0)
        userid = list(df1.index)
        data = np.array(df1, dtype=float)
        row = data.shape[0]
        min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
        data_minmax = min_max_scaler.fit_transform(data)
        # Weighted sum over the scaled columns: like_score, liked_score,
        # conn_score, report_score, block_score, hangup_score, pay_score,
        # abnormal-disconnect rate (from Sensors).
        resultData = list(data_minmax[:, 0] + 3 * data_minmax[:, 1] + data_minmax[:, 2]
                          - 2 * data_minmax[:, 3] - 2 * data_minmax[:, 4] - data_minmax[:, 5]
                          + 2 * data_minmax[:, 6] - 2 * data_minmax[:, 7])
        maxResult = max(resultData)
        minResult = min(resultData)
        span = maxResult - minResult
        # Manual test notes kept from the original:
        #   u = np.array(userid)
        #   pos = np.where(u == '42462576'); resultData[14270]; data_minmax[14270]
        #   pos2 = np.where(resultData > resultData[14270]); len(u[pos2])
        #   pos = np.where(u == '77534097'); resultData[29551]
        #   pos2 = np.where(resultData > resultData[29551]); u[pos2]
        #   pos = np.where(u == '22843255'); resultData[5706]
        #   pos2 = np.where(resultData > resultData[5706]); len(u[pos2])
        self.saveToRedis(userid, resultData, minResult, span)
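# --- Toy sketch (assumption, not in the source) of the scaling pipeline. ---
# MinMaxScaler maps each feature column to [0, 1]; popular() then combines the
# columns with fixed weights, and saveToRedis rescales the sum to [0, 5].
# The feature values below are made up purely for illustration.
def _scalingDemo():
    features = np.array([[0.2, 10.0], [0.8, 50.0], [0.5, 30.0]])
    scaled = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit_transform(features)
    combined = scaled[:, 0] + 3 * scaled[:, 1]      # weighted sum, as in popular()
    lo, hi = combined.min(), combined.max()
    return (combined - lo) / (hi - lo) * 5.0        # the 0-5 range written to Redis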
# Production variant of Popular (note the /data/mesh_push_service log path);
# it scores users from meshmatch_user_ext alone, without Sensors data.
class Popular(object):
    """Computes a popularity score per user from like/conn counters."""

    def __init__(self):
        super(Popular, self).__init__()
        self.redshift = RedShift()
        self.userPopular = 'rec_popular'  # Redis hash holding each user's popularity score
        self.log_filename = "/data/mesh_push_service/meshapi/log/popular_recommend.txt"
        self.log_format = ' [%(asctime)s] %(message)s'  # log line format
        logging.basicConfig(format=self.log_format, datefmt='%Y-%m-%d %H:%M:%S %p',
                            level=logging.DEBUG, filename=self.log_filename, filemode='a')

    def loadData(self):
        """Load like/liked/conn counters from Redshift, with defaults for empty values."""
        sql = ("select userid,"
               "case when like_count>0 then like_count else 1 end,"
               "case when liked_count>0 then liked_count else 0 end,"
               "case when conn_count>0 then conn_count else 0 end,"
               "case when conn_avg>0 then conn_avg else 20000 end "
               "from meshmatch_user_ext;")
        sqlResult = self.redshift.getAll(sql)
        logging.debug('loadData')
        return sqlResult

    def saveToRedis(self, userid, data):
        r = redis.Redis(host=RedisDBConfig.HOST, port=RedisDBConfig.PORT)
        p = r.pipeline()
        for i in range(len(userid)):
            if i % 1000 == 0:
                print i
                p.execute()  # flush the pipeline every 1000 writes
            p.hset(self.userPopular, userid[i], data[i])
        p.execute()
        logging.debug('saveToRedis')

    def popular(self):
        """Scoring logic: l[1] + l[1]/l[0] + l[2] + l[2]*l[3]."""
        raw = self.loadData()
        rawArray = np.array(raw, dtype=float)
        userid = [i[0] for i in raw]
        data = rawArray[:, 1:]
        row = data.shape[0]
        # Four features: liked_count, liked/like ratio, conn_count, and
        # conn_count weighted by (avg call duration - 20000 ms).
        dataTemp = (data[:, 1]).reshape(row, 1)
        dataTemp = np.hstack((dataTemp, (data[:, 1] / data[:, 0]).reshape(row, 1)))
        dataTemp = np.hstack((dataTemp, (data[:, 2]).reshape(row, 1)))
        dataTemp = np.hstack((dataTemp, (data[:, 2] * (data[:, 3] - 20000)).reshape(row, 1)))
        min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
        data_minmax = min_max_scaler.fit_transform(dataTemp)
        resultData = list(data_minmax[:, 0] + data_minmax[:, 1] +
                          data_minmax[:, 2] + data_minmax[:, 3])
        self.saveToRedis(userid, resultData)
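# --- Usage sketch (names assumed, not in the source): top-N popular users. ---
# Reads the 'rec_popular' hash that saveToRedis fills and sorts by score.
def topPopular(n=10):
    r = redis.Redis(host=RedisDBConfig.HOST, port=RedisDBConfig.PORT)
    scores = r.hgetall('rec_popular')  # {userid: score string}
    ranked = sorted(scores.items(), key=lambda kv: float(kv[1]), reverse=True)
    return ranked[:n]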