# -*- coding: utf-8 -*-
# Module-level imports consolidated here; logging was originally imported
# inside each __init__, which would leave it unbound in the other methods.
# RedShift, RedisCache and RedisDBConfig are project-internal helpers,
# assumed importable from the surrounding package, e.g.:
# from myapi.db import RedShift, RedisCache, RedisDBConfig
import copy
import logging
import urllib
import urllib2

import implicit
import numpy as np
import pandas as pd
import redis
from scipy.sparse import coo_matrix
from sklearn import preprocessing


class RefreshFriend(object):
    """Rebuilds every user's friend set in Redis from Redshift."""

    def __init__(self):
        super(RefreshFriend, self).__init__()
        self.redshift = RedShift()
        self.userFriendPrex = 'rec_user_friend_'  # Redis key prefix for each user's friend set
        self.log_filename = "/data/mesh_push_service/meshapi/log/refresh_user_friend.txt"
        self.log_format = ' [%(asctime)s] %(message)s'  # log line format
        logging.basicConfig(format=self.log_format, datefmt='%Y-%m-%d %H:%M:%S %p',
                            level=logging.DEBUG, filename=self.log_filename, filemode='a')

    def loadFriendData(self):
        """Load friend relations from Redshift."""
        sql = "select userid,oppositeuserid from meshmatch_friend_prod;"
        sqlResult = self.redshift.getAll(sql)
        logging.debug('loadFriendData')
        return sqlResult

    def friendInit(self):
        """Preprocess: rebuild each user's friend set in Redis."""
        r = redis.Redis(host=RedisDBConfig.HOST, port=RedisDBConfig.PORT)
        p = r.pipeline()
        friend = self.loadFriendData()  # (userid, oppositeuserid) pairs
        # Clear the old sets first. The keys are prefixed, so delete the
        # prefixed keys (the original deleted the bare userid by mistake).
        for i in friend:
            p.delete(self.userFriendPrex + i[0])
            p.delete(self.userFriendPrex + i[1])
        p.execute()
        # Friendship is symmetric: add each user to the other's set.
        for i in friend:
            p.sadd(self.userFriendPrex + i[0], i[1])
            p.sadd(self.userFriendPrex + i[1], i[0])
        p.execute()  # the original never flushed the sadd commands
        logging.debug('write friend info to redis')
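# --- Usage sketch (not from the source): reading a friend set back. ---
# getFriends is a hypothetical helper for illustration; it reads the sets
# that friendInit() writes, using the same key prefix.
def getFriends(userid):
    r = redis.Redis(host=RedisDBConfig.HOST, port=RedisDBConfig.PORT)
    # smembers returns the whole friend set, e.g. set(['67890', ...])
    return r.smembers('rec_user_friend_' + userid)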
class ALSRec(object):
    """Trains implicit ALS factors on call/friend data and stores them in Redis."""

    def __init__(self):
        super(ALSRec, self).__init__()
        self.redshift = RedShift()
        self.redis = RedisCache()
        self.userLeft_factor = "rec_userLeft_factor"
        self.userRight_factor = "rec_userRight_factor"
        self.log_filename = "/data/mesh_push_service/meshapi/log/als_recommend.txt"
        self.log_format = ' [%(asctime)s] %(message)s'  # log line format
        logging.basicConfig(format=self.log_format, datefmt='%Y-%m-%d %H:%M:%S %p',
                            level=logging.DEBUG, filename=self.log_filename, filemode='a')

    def loadData(self):
        """Load call records from Redshift; duration is capped at 60000 ms."""
        sql = ("select fromuserid,touserid,"
               "case when duration>60000 then 60000 else duration end "
               "from meshmatch_event_prod "
               "where eventtype in ('LIVING_HANGUP','LIVING_USER_HANGUP');")
        sqlResult = self.redshift.getAll(sql)
        logging.debug('loadData')
        return sqlResult

    def loadFriendData(self):
        """Load friend pairs from Redshift; a friendship counts as a fixed 70000 score."""
        sql = "select userid,oppositeuserid,70000 as cnt from meshmatch_friend_prod;"
        sqlResult = self.redshift.getAll(sql)
        logging.debug('loadFriendData')
        return sqlResult

    def getPercentile(self, arr, l):
        """Compute the given percentiles of arr."""
        return [np.percentile(arr, i) for i in l]

    def preProcessData(self):
        """Merge friend and call data into a symmetric (id1, id2, score) list."""
        friend = self.loadFriendData()  # friend pairs
        living = self.loadData()        # call records
        raw = friend + living
        rawArray = np.array(raw, dtype=int)
        # Order each pair as (smaller id, larger id) so duplicates collapse.
        temp = []
        for l in rawArray:
            if l[0] < l[1]:
                temp.append([l[0], l[1], l[2]])
            else:
                temp.append([l[1], l[0], l[2]])
        # Group by the id pair and keep the maximum score.
        df = pd.DataFrame(temp, columns=['id1', 'id2', 'cnt'])
        dfGroupby = df.iloc[df.groupby(['id1', 'id2']).apply(lambda x: x['cnt'].idxmax())]
        scoreLeft = np.array(dfGroupby)
        # Mirror the id columns so the interaction matrix is symmetric.
        scoreRight = copy.deepcopy(scoreLeft)
        scoreRight[:, [0, 1]] = scoreRight[:, [1, 0]]
        score = np.concatenate((scoreLeft, scoreRight))
        logging.debug('preProcessData')
        return score

    def matrixData(self, score):
        """Build a sparse user-user interaction matrix from the score list."""
        data = pd.DataFrame(score, columns=['id1', 'id2', 'cnt'])
        data['id1'] = data['id1'].astype("category")
        data['id2'] = data['id2'].astype("category")
        living = coo_matrix((data['cnt'].astype(float),
                             (data['id1'].cat.codes.copy(),
                              data['id2'].cat.codes.copy())))
        return data, living

    def bm25_weight(self, X, K1=100, B=0.8):
        """Weighs each row of the sparse matrix of the data by BM25 weighting"""
        # calculate idf per term (user)
        X = coo_matrix(X)
        N = X.shape[0]
        idf = np.log(float(N) / (1 + np.bincount(X.col)))
        # calculate length_norm per document (here: per user row)
        row_sums = np.ravel(X.sum(axis=1))
        average_length = row_sums.mean()
        length_norm = (1.0 - B) + B * row_sums / average_length
        # weight matrix rows by bm25
        X.data = X.data * (K1 + 1.0) / (K1 * length_norm[X.row] + X.data) * idf[X.col]
        return X

    def saveToRedis(self, data, user1_factors, user2_factors):
        """Store each user's latent factor vectors in two Redis hashes."""
        l = list(set(np.array(data)[:, 0]))
        l.sort()
        for i in range(len(l)):
            print i
            self.redis.hset_hset(self.userLeft_factor, l[i], user1_factors[i].tostring())
            self.redis.hset_hset(self.userRight_factor, l[i], user2_factors[i].tostring())
        logging.debug('saveToRedis')

    def alsRec(self):
        score = self.preProcessData()
        data, living = self.matrixData(score)
        weighted = self.bm25_weight(living)
        print weighted.shape
        user1_factors, user2_factors = implicit.alternating_least_squares(weighted, factors=5)
        print "save to redis"
        self.saveToRedis(data, user1_factors, user2_factors)
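# --- Usage sketch (not from the source): consuming the stored ALS factors. ---
# The vectors above are serialized with numpy's tostring(), so a reader must
# decode them with np.fromstring and the matching dtype. scorePair, the direct
# redis.Redis connection, and the float64 dtype are assumptions here; check
# user1_factors.dtype if the scores look wrong.
def scorePair(leftUserid, rightUserid):
    r = redis.Redis(host=RedisDBConfig.HOST, port=RedisDBConfig.PORT)
    leftRaw = r.hget("rec_userLeft_factor", leftUserid)
    rightRaw = r.hget("rec_userRight_factor", rightUserid)
    if leftRaw is None or rightRaw is None:
        return None  # no trained factors for one of the users
    left = np.fromstring(leftRaw, dtype=np.float64)    # 5 latent factors
    right = np.fromstring(rightRaw, dtype=np.float64)
    # A larger dot product means a better predicted match.
    return float(np.dot(left, right))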
# Development-machine variant of Popular (note the /Users/holazhai log path);
# it blends Redshift behaviour features with Sensors Analytics data.
class Popular(object):
    """Computes a 0-5 popularity score per user and stores it in Redis."""

    def __init__(self):
        super(Popular, self).__init__()
        self.redshift = RedShift()
        self.userPopular = 'rec_popular'  # Redis hash holding each user's popularity score
        self.log_filename = "/Users/holazhai/Documents/IPython2/mesh_recommend/myapi/log/popular_recommend.txt"
        self.log_format = ' [%(asctime)s] %(message)s'  # log line format
        logging.basicConfig(format=self.log_format, datefmt='%Y-%m-%d %H:%M:%S %p',
                            level=logging.DEBUG, filename=self.log_filename, filemode='a')

    def loadData(self):
        """Load per-user behaviour features from Redshift."""
        sql = """\
select t1.userid,
       case when like_score>1 then 1 else like_score end as like_score,
       liked_score, conn_score, report_score,
       case when block_score>1 then 1 else block_score end as block_score,
       hangup_score, pay_score
from
  (select userid,
          max(like_count+2)*1.0/(max(conn_count)+5) as like_score,
          (log(max(liked_count)+1))*max(liked_count)*1.0/(max(like_count)+max(liked_count)+2.0)/(max(conn_count)+5) as liked_score,
          log(max(conn_count)+1)*(max(conn_avg)-10000) as conn_score,
          max(reportedcount)*1.0/(max(conn_count)+5) as report_score,
          max(blockcount)*1.0/(max(conn_count)+5) as block_score,
          (max(LIVING_HANGUP_10)+max(LIVING_USER_HANGUP_10))*1.0/(max(LIVING_USER_HANGUP)+max(LIVING_HANGUP)+4.0) as hangup_score
   from
     (select userid,reportedcount,blockcount,like_count,liked_count,conn_count,conn_avg
      from meshmatch_user_ext where conn_avg>0) m
   left join
     (select n.leftid,LIVING_HANGUP,LIVING_USER_HANGUP,LIVING_HANGUP_10,LIVING_USER_HANGUP_10
      from
        (select leftid,
                sum(case when eventtype=0 then cnt else 0 end) as LIVING_HANGUP,
                sum(case when eventtype=1 then cnt else 0 end) as LIVING_USER_HANGUP
         from (select leftid,eventtype,count(*) as cnt
               from (select distinct leftid,rightid,eventtype
                     from (select leftid,rightid,eventtype
                           from (select fromuserid as leftid,touserid as rightid,
                                        case when eventtype='LIVING_USER_HANGUP' then 0 else 1 end as eventtype
                                 from meshmatch_event_prod
                                 where eventtype in ('LIVING_HANGUP','LIVING_USER_HANGUP')) a
                           union all
                           select touserid as leftid,fromuserid as rightid,
                                  case when eventtype='LIVING_HANGUP' then 0 else 1 end as eventtype
                           from meshmatch_event_prod
                           where eventtype in ('LIVING_HANGUP','LIVING_USER_HANGUP')) b) c
               group by leftid,eventtype) d
         group by leftid) n
      left join
        (select leftid,
                sum(case when eventtype=0 then cnt else 0 end) as LIVING_HANGUP_10,
                sum(case when eventtype=1 then cnt else 0 end) as LIVING_USER_HANGUP_10
         from (select leftid,eventtype,count(*) as cnt
               from (select distinct leftid,rightid,eventtype
                     from (select leftid,rightid,eventtype
                           from (select fromuserid as leftid,touserid as rightid,
                                        case when eventtype='LIVING_USER_HANGUP' then 0 else 1 end as eventtype
                                 from meshmatch_event_prod
                                 where eventtype in ('LIVING_HANGUP','LIVING_USER_HANGUP') and duration<10000) a
                           union all
                           select touserid as leftid,fromuserid as rightid,
                                  case when eventtype='LIVING_HANGUP' then 0 else 1 end as eventtype
                           from meshmatch_event_prod
                           where eventtype in ('LIVING_HANGUP','LIVING_USER_HANGUP') and duration<10000) b) c
               group by leftid,eventtype) d
         group by leftid) p
      on n.leftid=p.leftid) q
   on m.userid=q.leftid
   group by userid) t1
left join
  (select distinct userid,1 as pay_score from meshmatch_payrecord_prod) t2
on t1.userid=t2.userid;"""
        sqlResult = self.redshift.getAll(sql)
        userid = [i[0] for i in sqlResult]
        rawArray = np.array(sqlResult, dtype=float)
        data = rawArray[:, 1:]
        df = pd.DataFrame(data, index=userid)
        #df2 = df.fillna(0)
        logging.debug('loadData')
        return df

    def loadSensorsData(self, url, val):
        """POST a query to the Sensors Analytics API; return a DataFrame indexed by mid."""
        data = urllib.urlencode(val)
        request = urllib2.Request(url, data)
        response = urllib2.urlopen(request)
        sensor = response.read()
        row = sensor.split('\n')
        colKeys = row[0].split('\t')  # header row (unused)
        midL = []
        cntL = []
        # Skip the header (row 0) and the trailing empty line.
        for i in range(1, len(row) - 1):
            col = row[i].split('\t')
            if len(col) == 2:
                midL.append(col[0])
                cntL.append(col[1])
        raw_data = {'cntL': cntL}
        df = pd.DataFrame(raw_data, index=midL, columns=['cntL'])
        return df

    def saveToRedis(self, userid, data, minResult, span):
        """Rescale scores to [0, 5] and write them to the Redis hash."""
        r = redis.Redis(host=RedisDBConfig.HOST, port=RedisDBConfig.PORT)
        p = r.pipeline()
        for i in range(len(userid)):
            if i % 1000 == 0:
                print i
                p.execute()  # flush the pipeline every 1000 writes
            p.hset(self.userPopular, userid[i], (data[i] - minResult) / span * 5.0)
        p.execute()
        logging.debug('saveToRedis')

    def popular(self):
        """Scoring logic."""
        url = 'http://sensors.sta.holaverse.com:8007/api/sql/query?token=5a14139d09916d172f9e99375ebdf78c0dc01bf14a8a2fbe55eeed45a9521bb1&project=tiki'
        val = {"q": "select mid,(sum(case when int_val=1 then cnt else 0.0 end)"
                    "+sum(case when int_val=3 then cnt else 0 end)+4.0)/(sum(cnt)+8.0) "
                    "from (select mid,int_val,count(int_val) as cnt from events "
                    "where event='M123' group by mid,int_val) a group by mid"}
        sensor = self.loadSensorsData(url, val)
        redShiftData = self.loadData()
        result = pd.concat([redShiftData, sensor], axis=1)
        df1 = result.fillna(0)
        userid = list(df1.index)
        data = np.array(df1, dtype=float)
        row = data.shape[0]
        min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
        data_minmax = min_max_scaler.fit_transform(data)
        # Weighted sum over the scaled columns: like_score, liked_score,
        # conn_score, report_score, block_score, hangup_score, pay_score,
        # abnormal-disconnect rate (from Sensors).
        resultData = list(data_minmax[:, 0] + 3 * data_minmax[:, 1] + data_minmax[:, 2]
                          - 2 * data_minmax[:, 3] - 2 * data_minmax[:, 4] - data_minmax[:, 5]
                          + 2 * data_minmax[:, 6] - 2 * data_minmax[:, 7])
        maxResult = max(resultData)
        minResult = min(resultData)
        span = maxResult - minResult
        # Manual test notes kept from the original:
        #   u = np.array(userid)
        #   pos = np.where(u == '42462576'); resultData[14270]; data_minmax[14270]
        #   pos2 = np.where(resultData > resultData[14270]); len(u[pos2])
        #   pos = np.where(u == '77534097'); resultData[29551]
        #   pos2 = np.where(resultData > resultData[29551]); u[pos2]
        #   pos = np.where(u == '22843255'); resultData[5706]
        #   pos2 = np.where(resultData > resultData[5706]); len(u[pos2])
        self.saveToRedis(userid, resultData, minResult, span)
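# --- Toy sketch (assumption, not in the source) of the scaling pipeline. ---
# MinMaxScaler maps each feature column to [0, 1]; popular() then combines the
# columns with fixed weights, and saveToRedis rescales the sum to [0, 5].
# The feature values below are made up purely for illustration.
def _scalingDemo():
    features = np.array([[0.2, 10.0], [0.8, 50.0], [0.5, 30.0]])
    scaled = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit_transform(features)
    combined = scaled[:, 0] + 3 * scaled[:, 1]      # weighted sum, as in popular()
    lo, hi = combined.min(), combined.max()
    return (combined - lo) / (hi - lo) * 5.0        # the 0-5 range written to Redis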
# Production variant of Popular (note the /data/mesh_push_service log path);
# it scores users from meshmatch_user_ext alone, without Sensors data.
class Popular(object):
    """Computes a popularity score per user from like/conn counters."""

    def __init__(self):
        super(Popular, self).__init__()
        self.redshift = RedShift()
        self.userPopular = 'rec_popular'  # Redis hash holding each user's popularity score
        self.log_filename = "/data/mesh_push_service/meshapi/log/popular_recommend.txt"
        self.log_format = ' [%(asctime)s] %(message)s'  # log line format
        logging.basicConfig(format=self.log_format, datefmt='%Y-%m-%d %H:%M:%S %p',
                            level=logging.DEBUG, filename=self.log_filename, filemode='a')

    def loadData(self):
        """Load like/liked/conn counters from Redshift, with defaults for empty values."""
        sql = ("select userid,"
               "case when like_count>0 then like_count else 1 end,"
               "case when liked_count>0 then liked_count else 0 end,"
               "case when conn_count>0 then conn_count else 0 end,"
               "case when conn_avg>0 then conn_avg else 20000 end "
               "from meshmatch_user_ext;")
        sqlResult = self.redshift.getAll(sql)
        logging.debug('loadData')
        return sqlResult

    def saveToRedis(self, userid, data):
        r = redis.Redis(host=RedisDBConfig.HOST, port=RedisDBConfig.PORT)
        p = r.pipeline()
        for i in range(len(userid)):
            if i % 1000 == 0:
                print i
                p.execute()  # flush the pipeline every 1000 writes
            p.hset(self.userPopular, userid[i], data[i])
        p.execute()
        logging.debug('saveToRedis')

    def popular(self):
        """Scoring logic: l[1] + l[1]/l[0] + l[2] + l[2]*l[3]."""
        raw = self.loadData()
        rawArray = np.array(raw, dtype=float)
        userid = [i[0] for i in raw]
        data = rawArray[:, 1:]
        row = data.shape[0]
        # Four features: liked_count, liked/like ratio, conn_count, and
        # conn_count weighted by (avg call duration - 20000 ms).
        dataTemp = (data[:, 1]).reshape(row, 1)
        dataTemp = np.hstack((dataTemp, (data[:, 1] / data[:, 0]).reshape(row, 1)))
        dataTemp = np.hstack((dataTemp, (data[:, 2]).reshape(row, 1)))
        dataTemp = np.hstack((dataTemp, (data[:, 2] * (data[:, 3] - 20000)).reshape(row, 1)))
        min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
        data_minmax = min_max_scaler.fit_transform(dataTemp)
        resultData = list(data_minmax[:, 0] + data_minmax[:, 1] +
                          data_minmax[:, 2] + data_minmax[:, 3])
        self.saveToRedis(userid, resultData)
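# --- Usage sketch (names assumed, not in the source): top-N popular users. ---
# Reads the 'rec_popular' hash that saveToRedis fills and sorts by score.
def topPopular(n=10):
    r = redis.Redis(host=RedisDBConfig.HOST, port=RedisDBConfig.PORT)
    scores = r.hgetall('rec_popular')  # {userid: score string}
    ranked = sorted(scores.items(), key=lambda kv: float(kv[1]), reverse=True)
    return ranked[:n]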