Beispiel #1
0
 def fit(self, trainSamples, trainTargets):
     """Factorise the training matrix with non-negative matrix factorisation.

     Builds ``self.dataModel`` from the training data, then stores the
     transformed data (one row per row of the data matrix) in ``self.pu``
     and the transposed components (one row per column of the data matrix)
     in ``self.qi``.
     """
     self.dataModel = MemeryDataModel(trainSamples, trainTargets)
     V = self.dataModel.getData()
     model = ProjectedGradientNMF(n_components=self.factors,
                                  max_iter=1000,
                                  nls_max_iter=1000)
     # fit_transform already fits the model. Reading components_ from that
     # same fit avoids running a second, equally expensive factorisation and
     # guarantees that pu and qi come from the same decomposition.
     self.pu = model.fit_transform(V)
     self.qi = model.components_.transpose()
Beispiel #2
0
 def fit(self, trainSamples, trainTargets):
     """Build the symmetric user-user similarity matrix.

     Creates ``self.dataModel`` from the training data and fills
     ``self.simiMatrix`` (usersNum x usersNum) with
     ``self.similarity.compute`` applied to the item-id collections of every
     pair of distinct users. The diagonal is left at zero.
     """
     self.dataModel = MemeryDataModel(trainSamples, trainTargets)
     usersNum = self.dataModel.getUsersNum()
     # Fetch every user's item ids once instead of once per compared pair.
     items_of = [self.dataModel.getItemIDsFromUid(uid)
                 for uid in range(usersNum)]
     self.simiMatrix = np.zeros((usersNum, usersNum))
     for i in range(usersNum):
         for j in range(i + 1, usersNum):
             s = self.similarity.compute(items_of[i], items_of[j])
             self.simiMatrix[i][j] = self.simiMatrix[j][i] = s
Beispiel #3
0
 def gen_items_popular(self, trainSamples, trainTargets, hasTimes=False):
     """Count, for every item, the non-zero entries of the data matrix.

     Builds ``self.dataModel`` from the training data and stores the counts
     in ``self.popItems`` (a float array with one slot per item).
     ``hasTimes`` is accepted for interface compatibility but is not used.
     """
     self.dataModel = MemeryDataModel(trainSamples, trainTargets)
     itempopular = np.zeros(self.dataModel.getItemsNum())
     # Only the second index array of nonzero() is needed; compute it once.
     iids = self.dataModel.getData().nonzero()[1]
     for iid in iids:
         itempopular[iid] += 1
     self.popItems = itempopular
Beispiel #4
0
class NMF(BaseEstimator):
    def __init__(self, n=5, factors=50):
        print 'nmf begin'
        self.n = n
        self.factors = factors

    def predict(self, testSamples):
        recommend_lists = []
        for user_item in testSamples:
            uid = self.dataModel.getUidByUser(user_item[0])
            iid = self.dataModel.getUidByUser(user_item[1])
            recommend_lists.append(np.dot(self.pu[uid], self.qi[iid]))
        return recommend_lists

    def fit(self, trainSamples, trainTargets):
        self.dataModel = MemeryDataModel(trainSamples, trainTargets)
        #print 'train user:'******'not in test'
            return []
        else:
            predict_scores = []
            for i in range(self.dataModel.getItemsNum()):
                predict_scores.append(self.predict_single(uid, i))
            topN = np.argsort(np.array(predict_scores))[-1:-self.n - 1:-1]
            return [self.dataModel.getItemByIid(i) for i in topN]

    def score(self, testSamples, trueLabels):
        print 'NMF scoring ...'
        trueList = []
        recommendList = []
        user_unique = list(set(np.array(testSamples)[:, 0]))
        #print 'test user:'******'NMF result:' + '(' + str(self.get_params()) + ')' + str(
            (result)['F1'])
        return (result)['F1']
Beispiel #5
0
 def fit(self, trainSamples, trainTargets):
     """Build the symmetric item-item similarity matrix.

     Creates ``self.dataModel`` from the training data and fills
     ``self.simiMatrix`` (itemsNum x itemsNum) with
     ``self.similarity.compute`` applied to the user-id collections of every
     pair of distinct items. The diagonal is left at zero.
     """
     self.dataModel = MemeryDataModel(trainSamples, trainTargets)
     itemsNum = self.dataModel.getItemsNum()
     # Fetch every item's user ids once instead of once per compared pair.
     users_of = [self.dataModel.getUserIDsFromIid(iid)
                 for iid in range(itemsNum)]
     self.simiMatrix = np.zeros((itemsNum, itemsNum))
     for i in range(itemsNum):
         for j in range(i + 1, itemsNum):
             s = self.similarity.compute(users_of[i], users_of[j])
             self.simiMatrix[i][j] = self.simiMatrix[j][i] = s
Beispiel #6
0
class TopN(BaseEstimator):
    """Non-personalised recommender that always returns the most popular items.

    ``n`` is the number of items recommended.
    """

    def __init__(self, n=5):
        print('topN begin')
        self.n = n

    def gen_items_popular(self, trainSamples, trainTargets, hasTimes=False):
        """Count, for every item, the non-zero entries of the data matrix.

        Builds ``self.dataModel`` and stores the counts in ``self.popItems``.
        ``hasTimes`` is accepted for interface compatibility but is not used.
        """
        self.dataModel = MemeryDataModel(trainSamples, trainTargets)
        itempopular = np.zeros(self.dataModel.getItemsNum())
        # Only the second index array of nonzero() is needed; compute it once.
        iids = self.dataModel.getData().nonzero()[1]
        for iid in iids:
            itempopular[iid] += 1
        self.popItems = itempopular

    def predict(self, testSamples):
        """Return 1 for every sample whose item is in the top ``n``, else 0."""
        top_iids = self.topN[:self.n]  # invariant across samples
        return [
            1 if self.dataModel.getIidByItem(user_item[1]) in top_iids else 0
            for user_item in testSamples
        ]

    def fit(self, trainSamples, trainTargets):
        """Rank all item ids by descending popularity into ``self.topN``."""
        self.gen_items_popular(trainSamples, trainTargets)
        self.topN = np.argsort(np.array(self.popItems))[::-1]
        return self

    def recommend(self, uid):
        """Return the ``n`` most popular items; ``uid`` is ignored."""
        return [self.dataModel.getItemByIid(i) for i in self.topN[:self.n]]

    def score(self, testSamples, trueLabels):
        """Return the F1 value reported by ``Eval.evalAll`` over the test users.

        ``testSamples`` rows are read as (user, item, ...); ``trueLabels`` is
        not used.
        """
        # Convert the samples once instead of three times per user.
        samples = np.array(testSamples)
        users = samples[:, 0]
        trueList = []
        recommendList = []
        for u in list(set(users)):
            rows = np.argwhere(users == u)[:, 0]
            trueList.append(list(samples[rows][:, 1]))
            recommendList.append(self.recommend(u))
        result = Eval().evalAll(recommendList, trueList)
        print('TopN result:' + '(' + str(self.get_params()) + ')' + str(
            result['F1']))
        return result['F1']
Beispiel #7
0
class TopN(BaseEstimator):
    """Popularity baseline: recommends the same ``n`` most popular items to all."""

    def __init__(self, n=5):
        print('topN begin')
        self.n = n

    def gen_items_popular(self, trainSamples, trainTargets, hasTimes=False):
        """Store per-item counts of non-zero data-matrix entries in ``self.popItems``.

        Also (re)builds ``self.dataModel``. ``hasTimes`` is kept for interface
        compatibility and is not used.
        """
        self.dataModel = MemeryDataModel(trainSamples, trainTargets)
        itempopular = np.zeros(self.dataModel.getItemsNum())
        # A single nonzero() call; the first index array was never used.
        for iid in self.dataModel.getData().nonzero()[1]:
            itempopular[iid] += 1
        self.popItems = itempopular

    def predict(self, testSamples):
        """Flag each sample with 1 if its item is in the top ``n``, else 0."""
        top_iids = self.topN[:self.n]  # hoisted: identical for every sample
        recommend_lists = []
        for user_item in testSamples:
            hit = self.dataModel.getIidByItem(user_item[1]) in top_iids
            recommend_lists.append(1 if hit else 0)
        return recommend_lists

    def fit(self, trainSamples, trainTargets):
        """Compute item popularity and the descending popularity ranking."""
        self.gen_items_popular(trainSamples, trainTargets)
        self.topN = np.argsort(np.array(self.popItems))[::-1]
        return self

    def recommend(self, uid):
        """Return the top ``n`` items; the argument is not used."""
        return [self.dataModel.getItemByIid(i) for i in self.topN[:self.n]]

    def score(self, testSamples, trueLabels):
        """Evaluate with ``Eval.evalAll`` and return its 'F1' entry.

        The first two columns of ``testSamples`` are read as user and item;
        ``trueLabels`` is not used.
        """
        # Build the array view of the samples once, not on every iteration.
        samples = np.array(testSamples)
        users = samples[:, 0]
        trueList = []
        recommendList = []
        for u in list(set(users)):
            uTrueIndex = np.argwhere(users == u)[:, 0]
            trueList.append(list(samples[uTrueIndex][:, 1]))
            recommendList.append(self.recommend(u))
        e = Eval()
        result = e.evalAll(recommendList, trueList)
        print('TopN result:' + '(' + str(self.get_params()) + ')' + str(result['F1']))
        return result['F1']
Beispiel #8
0
class NMF(BaseEstimator):
    def __init__(self, n=5, factors=50):
        print 'nmf begin'
        self.n = n
        self.factors = factors

    def predict(self, testSamples):
        recommend_lists = []
        for user_item in testSamples:
            uid = self.dataModel.getUidByUser(user_item[0])
            iid = self.dataModel.getUidByUser(user_item[1])
            recommend_lists.append(np.dot(self.pu[uid], self.qi[iid]))
        return recommend_lists

    def fit(self, trainSamples, trainTargets):
        self.dataModel = MemeryDataModel(trainSamples, trainTargets)
        #print 'train user:'******'not in test'
            return []
        else:
            predict_scores = []
            for i in range(self.dataModel.getItemsNum()):
                predict_scores.append(self.predict_single(uid, i))
            topN = np.argsort(np.array(predict_scores))[-1:-self.n-1:-1]
            return [self.dataModel.getItemByIid(i) for i in topN]
    def score(self, testSamples, trueLabels):
        print 'NMF scoring ...'
        trueList = []
        recommendList= []
        user_unique = list(set(np.array(testSamples)[:,0]))
        #print 'test user:'******'NMF result:'+'('+str(self.get_params())+')' + str((result)['F1'])
        return (result)['F1']
Beispiel #9
0
    def fit(self, trainSamples, trainTargets):
        """Build the user-user similarity matrix from weighted preference pairs.

        Steps:
        1. For every user, record each pair of their items with different
           ratings in ``self.T[uid]`` under the key ``"<higher> <lower>"``.
        2. Derive a per-pair weight (``idf``) from how many users hold the
           pair in each direction.
        3. Weight each user's pairs by ``log2(1 + |rating difference|) * idf``.
        4. Store ``self.cos`` of every two users' weight dicts in the
           symmetric ``self.simiMatrix`` (diagonal left at zero).
        """
        self.dataModel = MemeryDataModel(trainSamples, trainTargets, isRating=True)
        usersNum = self.dataModel.getUsersNum()
        itemsNum = self.dataModel.getItemsNum()

        self.T = [{} for _ in range(usersNum)]
        for uid in range(usersNum):
            purchased_items = self.dataModel.getItemIDsFromUid(uid)
            # Look every rating up once instead of once per compared pair.
            ratings = [self.dataModel.getRating(uid, item)
                       for item in purchased_items]
            for a in range(len(purchased_items)):
                for b in range(a + 1, len(purchased_items)):
                    if ratings[a] > ratings[b]:
                        key = str(purchased_items[a]) + " " + str(purchased_items[b])
                    elif ratings[a] < ratings[b]:
                        key = str(purchased_items[b]) + " " + str(purchased_items[a])
                    else:
                        # Equal ratings express no preference.
                        continue
                    self.T[uid][key] = 1

        # Progress output: one "<user> <number of pairs>" line per user.
        for uid in range(usersNum):
            print('%s %s' % (self.dataModel.getUserByUid(uid), len(self.T[uid])))

        # pair_sum[a][b] = number of users who hold the pair "a b".
        pair_sum = [[0] * itemsNum for _ in range(itemsNum)]
        for uid in range(usersNum):
            for t in self.T[uid]:
                i1, i2 = t.split(" ")
                pair_sum[int(i1)][int(i2)] += 1

        idf = {}
        for i1 in range(itemsNum):
            for i2 in range(itemsNum):
                same_direction = pair_sum[i1][i2]
                if same_direction != 0:
                    # Users holding the pair in either direction.
                    both_directions = same_direction + pair_sum[i2][i1]
                    alpha = log10(1 + 9.0 * both_directions / usersNum)
                    idf[str(i1) + ' ' + str(i2)] = (
                        alpha * log2(both_directions * 1.0 / same_direction)
                        + (1 - alpha))

        W = [{} for _ in range(usersNum)]
        for uid in range(usersNum):
            for t in self.T[uid]:
                i1, i2 = t.split(" ")
                diff = self.dataModel.getRating(uid, int(i1)) - self.dataModel.getRating(uid, int(i2))
                tf = log2(1 + abs(diff))
                W[uid][t] = tf * idf[t]

        self.simiMatrix = np.zeros((usersNum, usersNum))
        for i in range(usersNum):
            for j in range(i + 1, usersNum):
                s = self.cos(W[i], W[j])
                self.simiMatrix[i][j] = self.simiMatrix[j][i] = s
Beispiel #10
0
    def fit(self, trainSamples, trainTargets):
        """Train biases and latent factors by stochastic gradient descent.

        Sets ``self.mu`` (mean of ``trainTargets``), ``self.bu`` / ``self.bi``
        (user / item biases) and ``self.pu`` / ``self.qi`` (user / item
        factor matrices). Runs ``self.iter`` passes over the training rows in
        random order; ``self.learningrate`` is multiplied by 0.93 after each
        pass, so the attribute is modified by fitting.
        """
        self.dataModel = MemeryDataModel(trainSamples, trainTargets)
        self.mu = np.array(trainTargets).mean()
        usersNum = self.dataModel.getUsersNum()
        itemsNum = self.dataModel.getItemsNum()
        self.bu = np.zeros(usersNum)
        self.bi = np.zeros(itemsNum)
        scale = math.sqrt(self.factors)
        # The factors must be numpy arrays: with plain lists, ``row += array``
        # extends the list with the array's elements instead of adding
        # element-wise, which corrupts the factor vectors on the first update.
        self.qi = np.array(
            [[(0.1 * random.random() / scale) for _ in range(self.factors)]
             for _ in range(itemsNum)], dtype=float).reshape(itemsNum, self.factors)
        self.pu = np.array(
            [[(0.1 * random.random() / scale) for _ in range(self.factors)]
             for _ in range(usersNum)], dtype=float).reshape(usersNum, self.factors)
        lineData = self.dataModel.getLineData()
        lengthOfTrain = len(lineData)

        for _ in range(self.iter):
            for n in np.random.permutation(lengthOfTrain):
                row = lineData[n]
                uid = self.dataModel.getUidByUser(row[0])
                iid = self.dataModel.getIidByItem(row[1])
                rating = row[2]
                eui = rating - self.predict_single(uid, iid)
                self.bu[uid] += self.learningrate * (eui - self.userregular * self.bu[uid])
                self.bi[iid] += self.learningrate * (eui - self.itemregular * self.bi[iid])
                # Snapshot (copy, not view) so the user update below uses the
                # item factors from before this step.
                old_qi = self.qi[iid].copy()
                self.qi[iid] += self.learningrate * (eui * self.pu[uid] - self.itemregular * old_qi)
                self.pu[uid] += self.learningrate * (eui * old_qi - self.userregular * self.pu[uid])
            self.learningrate = self.learningrate * 0.93
Beispiel #11
0
 def fit(self, trainSamples, trainTargets):
     """Factorise the training matrix with non-negative matrix factorisation.

     Stores the data model in ``self.dataModel``, the transformed data in
     ``self.pu`` and the transposed components in ``self.qi``.
     """
     self.dataModel = MemeryDataModel(trainSamples, trainTargets)
     V = self.dataModel.getData()
     model = ProjectedGradientNMF(n_components=self.factors, max_iter=1000, nls_max_iter=1000)
     self.pu = model.fit_transform(V)
     # Reuse the fit performed by fit_transform. The original called
     # model.fit(V) again, repeating the whole factorisation just to read
     # components_.
     self.qi = model.components_.transpose()
Beispiel #12
0
    def fit(self, trainSamples, trainTargets):
        """Train the factor model with SGD over sampled (u, i, j) triples.

        Initialises ``self.item_bias``, ``self.user_factors`` and
        ``self.item_factors``, draws a fixed set of triples
        (``self.loss_samples``) for monitoring, then runs at most
        ``self.iter`` passes. Training stops as soon as the monitored loss
        changes by less than 0.01 or increases; otherwise
        ``self.learning_rate`` is multiplied by 0.9 for the next pass.
        """
        self.dataModel = MemeryDataModel(trainSamples, trainTargets)
        scale = math.sqrt(self.factors)
        self.item_bias = np.zeros(self.dataModel.getItemsNum())
        self.user_factors = np.array([[(0.1 * random.random() / scale) for _ in range(self.factors)]
                                      for _ in range(self.dataModel.getUsersNum())])
        self.item_factors = np.array([[(0.1 * random.random() / scale) for _ in range(self.factors)]
                                      for _ in range(self.dataModel.getItemsNum())])

        num_loss_samples = int(100 * self.dataModel.getUsersNum() ** 0.5)
        loss_sampler = UniformUserUniformItem(True)
        self.loss_samples = list(loss_sampler.generate_samples(self.dataModel, num_loss_samples))
        old_loss = self.loss()

        update_sampler = UniformPairWithoutReplacement(True)
        for _ in range(self.iter):
            for u, i, j in update_sampler.generate_samples(self.dataModel):
                self.update_factors(u, i, j)
            # loss() walks every monitoring triple, so evaluate it once per
            # pass instead of up to three times.
            new_loss = self.loss()
            delta = new_loss - old_loss
            if abs(delta) < 0.01 or delta > 0:
                break
            old_loss = new_loss
            self.learning_rate *= 0.9
Beispiel #13
0
 def fit(self, trainSamples, trainTargets):
     """Compute pairwise item similarities into ``self.simiMatrix``.

     The matrix is itemsNum x itemsNum and symmetric; entry (i, j) is
     ``self.similarity.compute`` of the user-id collections of items i and
     j. Diagonal entries stay zero.
     """
     self.dataModel = MemeryDataModel(trainSamples, trainTargets)
     itemsNum = self.dataModel.getItemsNum()
     # One lookup per item rather than one per compared pair.
     users_of = [self.dataModel.getUserIDsFromIid(iid) for iid in range(itemsNum)]
     self.simiMatrix = np.zeros((itemsNum, itemsNum))
     for i in range(itemsNum):
         for j in range(i + 1, itemsNum):
             s = self.similarity.compute(users_of[i], users_of[j])
             self.simiMatrix[i][j] = self.simiMatrix[j][i] = s
Beispiel #14
0
    def fit(self, trainSamples, trainTargets):
        """Build the user-user similarity matrix from weighted preference pairs.

        Steps:
        1. For every user, record each pair of their items with different
           ratings in ``self.T[uid]`` under the key ``"<higher> <lower>"``.
        2. Derive a per-pair weight (``idf``) from how many users hold the
           pair in each direction.
        3. Weight each user's pairs by ``log2(1 + |rating difference|) * idf``
           (negated if the difference is negative).
        4. Store ``self.cos`` of every two users' weight dicts in the
           symmetric ``self.simiMatrix`` (diagonal left at zero).
        """
        self.dataModel = MemeryDataModel(trainSamples, trainTargets, hasTimes=True)
        usersNum = self.dataModel.getUsersNum()
        itemsNum = self.dataModel.getItemsNum()

        self.T = [{} for _ in range(usersNum)]
        for uid in range(usersNum):
            purchased_items = self.dataModel.getItemIDsFromUid(uid)
            # Look every rating up once instead of once per compared pair.
            ratings = [self.dataModel.getRating(uid, item)
                       for item in purchased_items]
            for a in range(len(purchased_items)):
                for b in range(a + 1, len(purchased_items)):
                    if ratings[a] > ratings[b]:
                        key = str(purchased_items[a]) + " " + str(purchased_items[b])
                    elif ratings[a] < ratings[b]:
                        key = str(purchased_items[b]) + " " + str(purchased_items[a])
                    else:
                        # Equal ratings express no preference.
                        continue
                    self.T[uid][key] = 1

        # pair_sum[a][b] = number of users who hold the pair "a b".
        pair_sum = [[0] * itemsNum for _ in range(itemsNum)]
        for uid in range(usersNum):
            for t in self.T[uid]:
                i1, i2 = t.split(" ")
                pair_sum[int(i1)][int(i2)] += 1

        idf = {}
        for i1 in range(itemsNum):
            for i2 in range(itemsNum):
                same_direction = pair_sum[i1][i2]
                if same_direction != 0:
                    # Users holding the pair in either direction.
                    both_directions = same_direction + pair_sum[i2][i1]
                    alpha = log10(1 + 9.0 * both_directions / usersNum)
                    idf[str(i1) + ' ' + str(i2)] = (
                        alpha * log2(both_directions * 1.0 / same_direction)
                        + (1 - alpha))

        W = [{} for _ in range(usersNum)]
        for uid in range(usersNum):
            for t in self.T[uid]:
                i1, i2 = t.split(" ")
                diff = self.dataModel.getRating(uid, int(i1)) - self.dataModel.getRating(uid, int(i2))
                tf = log2(1 + abs(diff))
                # Keys are written higher-rated item first, so a negative
                # difference is not expected; the sign flip is kept as a guard.
                if diff < 0:
                    tf = -tf
                W[uid][t] = tf * idf[t]

        self.simiMatrix = np.zeros((usersNum, usersNum))
        for i in range(usersNum):
            for j in range(i + 1, usersNum):
                s = self.cos(W[i], W[j])
                self.simiMatrix[i][j] = self.simiMatrix[j][i] = s
Beispiel #15
0
 def gen_items_popular(self, trainSamples, trainTargets, hasTimes=False):
     """Store per-item counts of non-zero data-matrix entries in ``self.popItems``.

     Also (re)builds ``self.dataModel`` from the training data. ``hasTimes``
     is kept for interface compatibility and is not used.
     """
     self.dataModel = MemeryDataModel(trainSamples, trainTargets)
     itempopular = np.zeros(self.dataModel.getItemsNum())
     # A single nonzero() call; its first index array was never used.
     for iid in self.dataModel.getData().nonzero()[1]:
         itempopular[iid] += 1
     self.popItems = itempopular
Beispiel #16
0
    def fit(self, trainSamples, trainTargets):
        """Fit item biases and user/item factors by SGD on (u, i, j) triples.

        A fixed sample of triples (``self.loss_samples``) is drawn up front
        and used to monitor the loss after every pass. At most ``self.iter``
        passes are run; fitting stops early when the loss moves by less than
        0.01 or goes up. After every other pass ``self.learning_rate`` is
        scaled by 0.9.
        """
        self.dataModel = MemeryDataModel(trainSamples, trainTargets)
        scale = math.sqrt(self.factors)
        self.item_bias = np.zeros(self.dataModel.getItemsNum())
        self.user_factors = np.array([[
            (0.1 * random.random() / scale) for _ in range(self.factors)
        ] for _ in range(self.dataModel.getUsersNum())])
        self.item_factors = np.array([[
            (0.1 * random.random() / scale) for _ in range(self.factors)
        ] for _ in range(self.dataModel.getItemsNum())])

        num_loss_samples = int(100 * self.dataModel.getUsersNum()**0.5)
        loss_sampler = UniformUserUniformItem(True)
        self.loss_samples = list(
            loss_sampler.generate_samples(self.dataModel, num_loss_samples))
        old_loss = self.loss()

        update_sampler = UniformPairWithoutReplacement(True)
        for _ in range(self.iter):
            for u, i, j in update_sampler.generate_samples(self.dataModel):
                self.update_factors(u, i, j)
            # Evaluate the (expensive) monitoring loss once per pass; the
            # original recomputed it for each comparison and the assignment.
            current_loss = self.loss()
            change = current_loss - old_loss
            if abs(change) < 0.01 or change > 0:
                break
            old_loss = current_loss
            self.learning_rate *= 0.9
Beispiel #17
0
    def fit(self, trainSamples, trainTargets):
        """Learn biases and latent factors with stochastic gradient descent.

        Populates ``self.mu`` (mean of ``trainTargets``), the bias vectors
        ``self.bu`` / ``self.bi`` and the factor matrices ``self.pu`` /
        ``self.qi``. Each of the ``self.iter`` passes visits the training
        rows in a fresh random order, after which ``self.learningrate`` is
        multiplied by 0.93 (the attribute itself is updated).
        """
        self.dataModel = MemeryDataModel(trainSamples, trainTargets)
        self.mu = np.array(trainTargets).mean()
        usersNum = self.dataModel.getUsersNum()
        itemsNum = self.dataModel.getItemsNum()
        self.bu = np.zeros(usersNum)
        self.bi = np.zeros(itemsNum)
        scale = math.sqrt(self.factors)
        # Keep the factors in numpy arrays. With nested Python lists the
        # ``+=`` updates below would call list.extend and append the gradient
        # to the row instead of adding it element-wise.
        self.qi = np.array(
            [[(0.1 * random.random() / scale)
              for _ in range(self.factors)]
             for _ in range(itemsNum)],
            dtype=float).reshape(itemsNum, self.factors)
        self.pu = np.array(
            [[(0.1 * random.random() / scale)
              for _ in range(self.factors)]
             for _ in range(usersNum)],
            dtype=float).reshape(usersNum, self.factors)
        lineData = self.dataModel.getLineData()
        lengthOfTrain = len(lineData)

        for _ in range(self.iter):
            for n in np.random.permutation(lengthOfTrain):
                row = lineData[n]
                uid = self.dataModel.getUidByUser(row[0])
                iid = self.dataModel.getIidByItem(row[1])
                rating = row[2]
                eui = rating - self.predict_single(uid, iid)
                self.bu[uid] += self.learningrate * (
                    eui - self.userregular * self.bu[uid])
                self.bi[iid] += self.learningrate * (
                    eui - self.itemregular * self.bi[iid])
                # Copy the row: the user update must see the item factors as
                # they were before this step, and a plain index is a view.
                old_qi = self.qi[iid].copy()
                self.qi[iid] += self.learningrate * (
                    eui * self.pu[uid] - self.itemregular * old_qi)
                self.pu[uid] += self.learningrate * (
                    eui * old_qi - self.userregular * self.pu[uid])
            self.learningrate = self.learningrate * 0.93
Beispiel #18
0
class BPR(BaseEstimator):
    """BPR matrix-factorisation recommender trained on (user, item i, item j) triples.

    After ``fit`` the model holds ``item_bias``, ``user_factors`` and
    ``item_factors``; an item's score for a user is the item bias plus the
    dot product of the two factor vectors.
    """

    def __init__(self, n=5, factors=50, learning_rate=0.001, bias_regularization=0.001, user_regularization=0.001,
                 positive_item_regularization=0.001, negative_item_regularization=0.001,iter = 50):
        """Initialise the BPR matrix factorization model.

        n: number of items returned by ``recommend``.
        factors: number of latent factors.
        learning_rate: SGD step size (decayed during ``fit``).
        *_regularization: regularisation weights for the bias, user,
            positive-item and negative-item terms.
        iter: maximum number of training passes.
        """
        print('bpr begin')
        self.n = n
        self.factors = factors
        self.learning_rate = learning_rate
        self.bias_regularization = bias_regularization
        self.user_regularization = user_regularization
        self.positive_item_regularization = positive_item_regularization
        self.negative_item_regularization = negative_item_regularization
        self.iter = iter

    def predict(self, testSamples):
        """Return one recommendation list per sample (first column = user)."""
        # recommend() maps the raw user to an internal uid itself. The
        # original mapped it here as well, so the uid was looked up a second
        # time as if it were a user.
        return [self.recommend(user_item[0]) for user_item in testSamples]

    def update_factors(self, u, i, j, update_u=True, update_i=True):
        """Apply one SGD update for user ``u``, item ``i`` and item ``j``.

        ``update_u`` / ``update_i`` switch the user and item-``i`` updates;
        item ``j`` is always updated.
        """
        x = self.item_bias[i] - self.item_bias[j] \
            + np.dot(self.user_factors[u], self.item_factors[i] - self.item_factors[j])
        z = 1.0 / (1.0 + exp(x))

        # Bias terms.
        if update_i:
            d = z - self.bias_regularization * self.item_bias[i]
            self.item_bias[i] += self.learning_rate * d
        d = -z - self.bias_regularization * self.item_bias[j]
        self.item_bias[j] += self.learning_rate * d

        # Factor terms. The order is significant: the item updates read the
        # user factors after the user update has been applied.
        if update_u:
            d = (self.item_factors[i] - self.item_factors[j]) * z \
                - self.user_regularization * self.user_factors[u]
            self.user_factors[u, :] += self.learning_rate * d
        if update_i:
            d = self.user_factors[u] * z - self.positive_item_regularization * self.item_factors[i]
            self.item_factors[i, :] += self.learning_rate * d
        d = -self.user_factors[u] * z - self.negative_item_regularization * self.item_factors[j]
        self.item_factors[j] += self.learning_rate * d

    def loss(self):
        """Return ranking loss plus half the regularisation term over ``loss_samples``."""
        ranking_loss = 0
        complexity = 0
        for u, i, j in self.loss_samples:
            x = self.predict_single(u, i) - self.predict_single(u, j)
            ranking_loss += math.log(1.0 + exp(-x))
            complexity += self.user_regularization * np.dot(self.user_factors[u], self.user_factors[u])
            complexity += self.positive_item_regularization * np.dot(self.item_factors[i], self.item_factors[i])
            complexity += self.negative_item_regularization * np.dot(self.item_factors[j], self.item_factors[j])
            complexity += self.bias_regularization * self.item_bias[i]**2
            complexity += self.bias_regularization * self.item_bias[j]**2
        return ranking_loss + 0.5 * complexity

    def fit(self, trainSamples, trainTargets):
        """Train with SGD; stop early once the monitored loss stalls or rises.

        A fixed sample of triples is drawn for loss monitoring. At most
        ``self.iter`` passes are run; after each pass that does not trigger
        the stop, ``self.learning_rate`` is multiplied by 0.9.
        """
        self.dataModel = MemeryDataModel(trainSamples, trainTargets)
        scale = math.sqrt(self.factors)
        self.item_bias = np.zeros(self.dataModel.getItemsNum())
        self.user_factors = np.array([[(0.1 * random.random() / scale) for _ in range(self.factors)]
                                      for _ in range(self.dataModel.getUsersNum())])
        self.item_factors = np.array([[(0.1 * random.random() / scale) for _ in range(self.factors)]
                                      for _ in range(self.dataModel.getItemsNum())])

        num_loss_samples = int(100 * self.dataModel.getUsersNum() ** 0.5)
        loss_sampler = UniformUserUniformItem(True)
        self.loss_samples = list(loss_sampler.generate_samples(self.dataModel, num_loss_samples))
        old_loss = self.loss()

        update_sampler = UniformPairWithoutReplacement(True)
        for _ in range(self.iter):
            for u, i, j in update_sampler.generate_samples(self.dataModel):
                self.update_factors(u, i, j)
            # loss() walks every monitoring triple; compute it once per pass.
            new_loss = self.loss()
            delta = new_loss - old_loss
            if abs(delta) < 0.01 or delta > 0:
                break
            old_loss = new_loss
            self.learning_rate *= 0.9

    def predict_single(self, uid, iid):
        """Return the model score of item ``iid`` for user ``uid`` (internal ids)."""
        return self.item_bias[iid] + np.dot(self.user_factors[uid], self.item_factors[iid])

    def recommend(self, u):
        """Return the ``n`` best-scoring items for raw user ``u``.

        Returns an empty list when the data model reports the user as
        unknown (uid ``-1``).
        """
        uid = self.dataModel.getUidByUser(u)
        if uid == -1:
            print('not in test')
            return []
        predict_scores = [self.predict_single(uid, i)
                          for i in range(self.dataModel.getItemsNum())]
        # Indices of the n highest scores, best first.
        topN = np.argsort(np.array(predict_scores))[-1:-self.n - 1:-1]
        return [self.dataModel.getItemByIid(i) for i in topN]

    def score(self, testSamples, trueLabels):
        """Return the 'F1' entry of ``Eval.evalAll`` over the test users.

        ``testSamples`` rows are read as (user, item, ...); ``trueLabels`` is
        not used.
        """
        print('BPR scoring ...')
        # Convert the samples once instead of three times per user.
        samples = np.array(testSamples)
        users = samples[:, 0]
        trueList = []
        recommendList = []
        for u in list(set(users)):
            rows = np.argwhere(users == u)[:, 0]
            trueList.append(list(samples[rows][:, 1]))
            recommendList.append(self.recommend(u))
        result = Eval().evalAll(recommendList, trueList)
        print('BPR result:' + '(' + str(self.get_params()) + ')' + str(result['F1']))
        return result['F1']
Beispiel #19
0
class LFM(BaseEstimator):
    """Latent factor model with global mean, user/item biases and factor vectors.

    A prediction is ``mu + bi[item] + bu[user] + qi[item] . pu[user]``,
    clamped to the range 1..5.
    """

    def __init__(self,
                 n=5,
                 factors=25,
                 learningrate=0.05,
                 userregular=0.0001,
                 itemregular=0.0001,
                 iter=10):
        """Store the hyper-parameters.

        n: number of items returned by ``recommend``.
        factors: number of latent factors.
        learningrate: SGD step size (decayed during ``fit``).
        userregular / itemregular: regularisation weights.
        iter: number of training passes.
        """
        print('lfm begin')
        self.factors = factors
        self.n = n
        self.learningrate = learningrate
        self.userregular = userregular
        self.itemregular = itemregular
        self.iter = iter

    def predict(self, testSamples):
        """Return one recommendation list per sample (first column = user)."""
        # recommend() maps the raw user to an internal uid itself. The
        # original mapped it here as well, so the uid was looked up a second
        # time as if it were a user.
        return [self.recommend(user_item[0]) for user_item in testSamples]

    def fit(self, trainSamples, trainTargets):
        """Train biases and latent factors by stochastic gradient descent.

        Runs ``self.iter`` passes over the training rows in random order;
        ``self.learningrate`` is multiplied by 0.93 after each pass, so the
        attribute is modified by fitting.
        """
        self.dataModel = MemeryDataModel(trainSamples, trainTargets)
        self.mu = np.array(trainTargets).mean()
        usersNum = self.dataModel.getUsersNum()
        itemsNum = self.dataModel.getItemsNum()
        self.bu = np.zeros(usersNum)
        self.bi = np.zeros(itemsNum)
        scale = math.sqrt(self.factors)
        # The factors must be numpy arrays: with plain lists, ``row += array``
        # extends the list with the array's elements instead of adding
        # element-wise, which corrupts the factor vectors on the first update.
        self.qi = np.array(
            [[(0.1 * random.random() / scale)
              for _ in range(self.factors)]
             for _ in range(itemsNum)],
            dtype=float).reshape(itemsNum, self.factors)
        self.pu = np.array(
            [[(0.1 * random.random() / scale)
              for _ in range(self.factors)]
             for _ in range(usersNum)],
            dtype=float).reshape(usersNum, self.factors)
        lineData = self.dataModel.getLineData()
        lengthOfTrain = len(lineData)

        for _ in range(self.iter):
            for n in np.random.permutation(lengthOfTrain):
                row = lineData[n]
                uid = self.dataModel.getUidByUser(row[0])
                iid = self.dataModel.getIidByItem(row[1])
                rating = row[2]
                eui = rating - self.predict_single(uid, iid)
                self.bu[uid] += self.learningrate * (
                    eui - self.userregular * self.bu[uid])
                self.bi[iid] += self.learningrate * (
                    eui - self.itemregular * self.bi[iid])
                # Snapshot (copy, not view) so the user update below uses the
                # item factors from before this step.
                old_qi = self.qi[iid].copy()
                self.qi[iid] += self.learningrate * (
                    eui * self.pu[uid] - self.itemregular * old_qi)
                self.pu[uid] += self.learningrate * (
                    eui * old_qi - self.userregular * self.pu[uid])
            self.learningrate = self.learningrate * 0.93

    def predict_single(self, uid, iid):
        """Return the predicted rating for internal ids, clamped to [1, 5]."""
        ans = self.mu + self.bi[iid] + self.bu[uid] + np.dot(
            self.qi[iid], self.pu[uid])
        if ans > 5:
            return 5
        if ans < 1:
            return 1
        return ans

    def recommend(self, u):
        """Return the ``n`` best-scoring items for raw user ``u``.

        Returns an empty list when the data model reports the user as
        unknown (uid ``-1``).
        """
        uid = self.dataModel.getUidByUser(u)
        if uid == -1:
            print('not in test')
            return []
        predict_scores = [self.predict_single(uid, i)
                          for i in range(self.dataModel.getItemsNum())]
        # Indices of the n highest scores, best first.
        topN = np.argsort(np.array(predict_scores))[-1:-self.n - 1:-1]
        return [self.dataModel.getItemByIid(i) for i in topN]

    def score(self, testSamples, trueLabels):
        """Return the 'F1' entry of ``Eval.evalAll`` over the test users.

        ``testSamples`` rows are read as (user, item, ...); ``trueLabels`` is
        not used.
        """
        print('LFM scoring ...')
        # Convert the samples once instead of three times per user.
        samples = np.array(testSamples)
        users = samples[:, 0]
        trueList = []
        recommendList = []
        for u in list(set(users)):
            rows = np.argwhere(users == u)[:, 0]
            trueList.append(list(samples[rows][:, 1]))
            recommendList.append(self.recommend(u))
        result = Eval().evalAll(recommendList, trueList)
        print('LFM result:' + '(' + str(self.get_params()) + ')' + str(
            result['F1']))
        return result['F1']
Beispiel #20
0
class VSRank(BaseEstimator):
    """Neighbourhood recommender comparing users by pairwise preferences.

    ``fit`` records, per user, the ordered item pairs "a b" for which the
    user rated item ``a`` above item ``b``, weights every pair with a
    tf * idf style score and stores the cosine-like similarity of those
    weighted pair vectors in ``simiMatrix``. ``recommend`` ranks items
    greedily from the pairwise preferences of similar users.
    """

    def __init__(self, neighbornum=5, n=5):
        """Store hyper-parameters.

        neighbornum -- number of most similar users that are consulted.
        n           -- length of the recommendation list.
        """
        print('vsrank begin')
        self.neighbornum = neighbornum
        # Not referenced by the other methods of this class (they use
        # self.cos); kept because it is part of the instance state.
        self.similarity = Similarity('COSINE')
        self.n = n

    def predict(self, testSamples):
        """Return one recommendation list per row of ``testSamples``.

        Column 0 of every row is the raw user. ``recommend`` performs the
        user -> uid lookup itself, so the raw user is passed through; the
        previous code converted to a uid first and ``recommend`` then
        converted that uid a second time.
        """
        return [self.recommend(user_item[0]) for user_item in testSamples]

    def fit(self, trainSamples, trainTargets):
        """Build the preference sets ``self.T`` and ``self.simiMatrix``."""
        self.dataModel = MemeryDataModel(trainSamples, trainTargets, isRating=True)
        usersNum = self.dataModel.getUsersNum()
        itemsNum = self.dataModel.getItemsNum()

        # T[uid] maps "better worse" (item ids joined by one space) to 1 for
        # every pair of the user's items with different ratings.
        self.T = [{} for _ in range(usersNum)]
        for uid in range(usersNum):
            purchased_items = self.dataModel.getItemIDsFromUid(uid)
            # Look every rating up once instead of once per pair.
            ratings = [self.dataModel.getRating(uid, iid)
                       for iid in purchased_items]
            count = len(purchased_items)
            for a in range(count):
                for b in range(a + 1, count):
                    if ratings[a] > ratings[b]:
                        key = str(purchased_items[a]) + " " + str(purchased_items[b])
                    elif ratings[a] < ratings[b]:
                        key = str(purchased_items[b]) + " " + str(purchased_items[a])
                    else:
                        continue
                    self.T[uid][key] = 1

        # Progress output: user and number of recorded preference pairs.
        for uid in range(usersNum):
            print('%s %s' % (self.dataModel.getUserByUid(uid), len(self.T[uid])))

        # pair_sum[a][b] counts the users that prefer a over b.
        pair_sum = [[0] * itemsNum for _ in range(itemsNum)]
        for uid in range(usersNum):
            for pair in self.T[uid]:
                i1, i2 = pair.split(" ")
                pair_sum[int(i1)][int(i2)] += 1

        idf = {}
        for i1 in range(itemsNum):
            for i2 in range(itemsNum):
                if pair_sum[i1][i2] != 0:
                    key = str(i1) + ' ' + str(i2)
                    # Users with any preference between the two items.
                    total = pair_sum[i1][i2] + pair_sum[i2][i1]
                    alpha = log10(1 + 9.0 * total / usersNum)
                    idf[key] = alpha * log2(total * 1.0 / pair_sum[i1][i2]) + (1 - alpha)

        # W[uid][pair] = tf * idf, with tf derived from the rating gap.
        W = [{} for _ in range(usersNum)]
        for uid in range(usersNum):
            for pair in self.T[uid]:
                i1, i2 = pair.split(" ")
                diff = self.dataModel.getRating(uid, int(i1)) - self.dataModel.getRating(uid, int(i2))
                tf = log2(1 + abs(diff))
                W[uid][pair] = tf * idf[pair]

        # Symmetric user-user similarity; the diagonal stays 0.
        self.simiMatrix = np.zeros((usersNum, usersNum))
        for i in range(usersNum):
            for j in range(i + 1, usersNum):
                s = self.cos(W[i], W[j])
                self.simiMatrix[i][j] = self.simiMatrix[j][i] = s

    def cos(self, dict1, dict2):
        """Cosine-like similarity of two weighted preference dictionaries.

        A pair present in both dictionaries adds the product of its
        weights; a pair whose reversed form "b a" is present in ``dict2``
        subtracts the product. Returns 0 when the accumulated product is 0.
        """
        product = 0.0
        m1 = 0.0
        for k, v in dict1.items():
            m1 += v * v
            i1, i2 = k.split(' ')
            k_ = i2 + ' ' + i1
            if k in dict2:
                product += v * dict2[k]
            elif k_ in dict2:
                product -= v * dict2[k_]
        m2 = 0.0
        for v in dict2.values():
            m2 += v * v
        if product == 0:
            return 0
        return product / sqrt(m1) / sqrt(m2)

    def tau(self, dict1, dict2, u1, u2):
        """Placeholder; not implemented (returns None)."""
        pass

    def neighborhood(self, userID):
        """Return up to ``neighbornum`` uids, most similar to ``userID`` first."""
        neighbors = np.argsort(np.array(self.simiMatrix[userID]))[-1:-self.neighbornum-1:-1]
        return neighbors

    def predict_single(self, userID, itemID):
        """Similarity-weighted sum of the neighbours' ratings of ``itemID``."""
        rating = 0.0
        for uid in self.neighborhood(userID):
            if itemID in self.dataModel.getItemIDsFromUid(uid):
                rating += self.simiMatrix[userID][uid] * self.dataModel.getRating(uid, itemID)
        return rating

    def recommend(self, u):
        """Return the pairwise recommendation list for raw user ``u``.

        Prints a notice and returns ``[]`` when the data model answers -1
        for the user.
        """
        userID = self.dataModel.getUidByUser(u)
        if userID == -1:
            print('not in test')
            return []
        # recommend_listwise(userID) is the alternative strategy.
        return self.recommend_pairwise(userID)

    def recommend_pointwise(self, userID):
        """Rank items by the similarity-weighted ratings of the neighbours."""
        ratings = dict()
        for uid in self.neighborhood(userID):
            for iid in self.dataModel.getItemIDsFromUid(uid):
                ratings[iid] = ratings.get(iid, 0) + self.simiMatrix[userID][uid] * self.dataModel.getRating(uid, iid)
        best = sorted(ratings.items(), key=lambda kv: kv[1], reverse=True)[:self.n]
        return [self.dataModel.getItemByIid(iid) for iid, _ in best]

    def recommend_pairwise(self, userID):
        """Greedily build a ranking from pairwise preferences.

        Every item starts with twice the sum of its preferences over all
        other items. The best item is picked repeatedly and the scores of
        the remaining items are adjusted by their preference relative to
        the item just picked. At most ``min(self.n, number of items)``
        items are returned.
        """
        itemsNum = self.dataModel.getItemsNum()
        pi = [0] * itemsNum
        for i in range(itemsNum):
            total = 0
            for j in range(itemsNum):
                if j != i:
                    total += self.preference(userID, i, j)
            # Same value as the former ``sum1 - sum2`` with sum2 == -sum1.
            pi[i] = 2 * total

        remaining = set(range(itemsNum))
        rank = []
        # Capping at itemsNum avoids picking an already removed item when
        # more recommendations are requested than items exist.
        for _ in range(min(self.n, itemsNum)):
            t = np.argmax(pi)
            rank.append(t)
            remaining.remove(t)
            # -inf keeps the list numeric while excluding the picked item
            # from later argmax calls (the old code stored None here).
            pi[t] = float('-inf')
            for i in remaining:
                pi[i] += self.preference(userID, t, i) - self.preference(userID, i, t)
        return [self.dataModel.getItemByIid(i) for i in rank]

    def preference(self, uid, i1, i2):
        """Normalised preference of ``i1`` over ``i2`` among similar users.

        Only users with a recorded preference between the two items are
        considered; the ``neighbornum`` most similar of them vote with
        their similarity as weight. Returns 0 when the weights sum to 0.
        """
        keystr = str(i1) + ' ' + str(i2)
        keystr_ = str(i2) + ' ' + str(i1)
        candidates = [u for u in range(self.dataModel.getUsersNum())
                      if keystr in self.T[u] or keystr_ in self.T[u]]
        # Stable sort: equally similar users keep ascending uid order.
        voters = sorted(candidates, key=lambda u: self.simiMatrix[uid][u],
                        reverse=True)[:self.neighbornum]
        preference = 0.0
        weight = 0.0
        for u in voters:
            rating1 = self.dataModel.getRating(u, i1)
            rating2 = self.dataModel.getRating(u, i2)
            weight += self.simiMatrix[uid][u]
            if rating1 > rating2:
                preference += self.simiMatrix[uid][u]
            elif rating1 < rating2:
                preference -= self.simiMatrix[uid][u]
        if weight == 0:
            return 0
        return preference / weight

    def recommend_listwise(self, userID):
        """Rank items from the neighbours' preference counts.

        ``M[a][b]`` counts neighbours preferring a over b and is then
        closed under max-min path strength; ``rank[m]`` is the number of
        items that m beats under the closed matrix.
        """
        itemsNum = self.dataModel.getItemsNum()
        M = [[0] * itemsNum for _ in range(itemsNum)]
        for uid in self.neighborhood(userID):
            for pair in self.T[uid]:
                i1, i2 = pair.split(" ")
                M[int(i1)][int(i2)] += 1
        for m in range(itemsNum):
            for n in range(itemsNum):
                for k in range(itemsNum):
                    M[n][k] = max(M[n][k], min(M[n][m], M[m][k]))
        rank = [0] * itemsNum
        for m in range(itemsNum):
            for n in range(itemsNum):
                if n != m and M[m][n] > M[n][m]:
                    rank[m] += 1
        # NOTE(review): ascending order returns the items with the fewest
        # wins first; kept as in the original - confirm this is intended.
        order = sorted(range(itemsNum), key=lambda i: rank[i])[:self.n]
        return [self.dataModel.getItemByIid(i) for i in order]

    def score(self, testSamples, trueLabels):
        """Return the F1 value of the recommendations on ``testSamples``.

        For every distinct user (column 0) the ground truth is that user's
        test items (column 1) ordered by descending label and cut to
        ``self.n``; the prediction is ``self.recommend(user)``. Both are
        passed to ``Eval().evalAll`` and its ``'F1'`` entry is returned.
        """
        print('vsrank scoring ...')
        # Convert once instead of once per user.
        samples = np.array(testSamples)
        labels = np.array(trueLabels)
        users = samples[:, 0]
        trueList = []
        recommendList = []
        for u in list(set(users)):
            rows = np.argwhere(users == u)[:, 0]
            uTrueItem = list(samples[rows][:, 1])
            uTrueRating = list(labels[rows])
            ranked = sorted(zip(uTrueItem, uTrueRating), key=lambda p: p[1],
                            reverse=True)[:self.n]
            trueList.append([item for item, _ in ranked])
            recommendList.append(self.recommend(u))
        result = Eval().evalAll(recommendList, trueList)

        print('vsrank result:' + '(' + str(self.get_params()) + ')' + str(result))
        return result['F1']
Beispiel #21
0
class LFM(BaseEstimator):
    """Latent factor model with user/item biases trained by SGD.

    A rating is estimated as ``mu + bi[iid] + bu[uid] + qi[iid] . pu[uid]``
    and clipped to the interval [1, 5].
    """

    def __init__(self, n=5, factors=25, learningrate=0.05, userregular=0.0001, itemregular=0.0001, iter = 10):
        """Store hyper-parameters.

        n            -- length of the recommendation list.
        factors      -- number of latent dimensions.
        learningrate -- initial SGD step size.
        userregular  -- regularisation weight for user terms.
        itemregular  -- regularisation weight for item terms.
        iter         -- number of passes over the training data.
        """
        print('lfm begin')
        self.factors = factors
        self.n = n
        self.learningrate = learningrate
        self.userregular = userregular
        self.itemregular = itemregular
        self.iter = iter

    def predict(self, testSamples):
        """Return one recommendation list per row of ``testSamples``.

        Column 0 of every row is the raw user. ``recommend`` performs the
        user -> uid lookup itself, so the raw user is passed through; the
        previous code converted to a uid first and ``recommend`` then
        converted that uid a second time.
        """
        return [self.recommend(user_item[0]) for user_item in testSamples]

    def fit(self, trainSamples, trainTargets):
        """Train biases and latent factors with stochastic gradient descent.

        NOTE(review): ``self.learningrate`` is decayed in place (factor 0.93
        per pass), exactly as before, so it differs after ``fit`` returns.
        """
        self.dataModel = MemeryDataModel(trainSamples, trainTargets)
        self.mu = np.array(trainTargets).mean()
        usersNum = self.dataModel.getUsersNum()
        itemsNum = self.dataModel.getItemsNum()
        self.bu = np.zeros(usersNum)
        self.bi = np.zeros(itemsNum)
        scale = math.sqrt(self.factors)
        # Factor rows are NumPy arrays so that ``+=`` below is an
        # element-wise in-place update (they used to be nested lists).
        self.qi = np.array([[(0.1 * random.random() / scale) for _ in range(self.factors)]
                            for _ in range(itemsNum)])
        self.pu = np.array([[(0.1 * random.random() / scale) for _ in range(self.factors)]
                            for _ in range(usersNum)])
        lineData = self.dataModel.getLineData()
        lengthOfTrain = len(lineData)

        for step in range(self.iter):
            # Visit the training rows in a fresh random order each pass.
            for idx in np.random.permutation(lengthOfTrain):
                row = lineData[idx]
                uid = self.dataModel.getUidByUser(row[0])
                iid = self.dataModel.getIidByItem(row[1])
                rating = row[2]
                eui = rating - self.predict_single(uid, iid)
                self.bu[uid] += self.learningrate * (eui - self.userregular * self.bu[uid])
                self.bi[iid] += self.learningrate * (eui - self.itemregular * self.bi[iid])
                # Copy: the user step must see the item vector from before
                # this update. A plain reference would be changed by the
                # in-place update on the next line.
                old_qi = self.qi[iid].copy()
                self.qi[iid] += self.learningrate * (eui * self.pu[uid] - self.itemregular * self.qi[iid])
                self.pu[uid] += self.learningrate * (eui * old_qi - self.userregular * self.pu[uid])
            self.learningrate = self.learningrate * 0.93

    def predict_single(self, uid, iid):
        """Estimate the rating of item ``iid`` by user ``uid``, clipped to [1, 5]."""
        ans = self.mu + self.bi[iid] + self.bu[uid] + np.dot(self.qi[iid], self.pu[uid])
        if ans > 5:
            return 5
        elif ans < 1:
            return 1
        return ans

    def recommend(self, u):
        """Return the ``self.n`` best-scoring items for raw user ``u``.

        Prints a notice and returns ``[]`` when the data model answers -1
        for the user.
        """
        uid = self.dataModel.getUidByUser(u)
        if uid == -1:
            print('not in test')
            return []
        predict_scores = [self.predict_single(uid, i)
                          for i in range(self.dataModel.getItemsNum())]
        # argsort is ascending; walk it backwards for the n best indices.
        topN = np.argsort(np.array(predict_scores))[-1:-self.n-1:-1]
        return [self.dataModel.getItemByIid(i) for i in topN]

    def score(self, testSamples, trueLabels):
        """Return the F1 value of the recommendations on ``testSamples``.

        For every distinct user (column 0) the ground truth is the list of
        that user's test items (column 1) and the prediction is
        ``self.recommend(user)``. ``trueLabels`` is not read.
        """
        print('LFM scoring ...')
        # Convert once instead of twice per user.
        samples = np.array(testSamples)
        users = samples[:, 0]
        trueList = []
        recommendList = []
        for u in list(set(users)):
            rows = np.argwhere(users == u)[:, 0]
            trueList.append(list(samples[rows][:, 1]))
            recommendList.append(self.recommend(u))
        result = Eval().evalAll(recommendList, trueList)
        print('LFM result:' + '(' + str(self.get_params()) + ')' + str(result['F1']))
        return result['F1']
Beispiel #22
0
class BPR(BaseEstimator):
    """Matrix factorisation trained on (user, item i, item j) triples.

    The score of an item for a user is ``item_bias[iid] +
    user_factors[uid] . item_factors[iid]``; training raises the score of
    item i relative to item j for every sampled triple.
    """

    def __init__(self,
                 n=5,
                 factors=50,
                 learning_rate=0.001,
                 bias_regularization=0.001,
                 user_regularization=0.001,
                 positive_item_regularization=0.001,
                 negative_item_regularization=0.001,
                 iter=50):
        """Store hyper-parameters.

        n       -- length of the recommendation list.
        factors -- number of latent dimensions.
        iter    -- maximum number of training passes.
        The remaining arguments are the SGD step size and the
        regularisation weights used in ``update_factors`` and ``loss``.
        """
        print('bpr begin')
        self.n = n
        self.factors = factors
        self.learning_rate = learning_rate
        self.bias_regularization = bias_regularization
        self.user_regularization = user_regularization
        self.positive_item_regularization = positive_item_regularization
        self.negative_item_regularization = negative_item_regularization
        self.iter = iter

    def predict(self, testSamples):
        """Return one recommendation list per row of ``testSamples``.

        Column 0 of every row is the raw user. ``recommend`` performs the
        user -> uid lookup itself, so the raw user is passed through; the
        previous code converted to a uid first and ``recommend`` then
        converted that uid a second time.
        """
        return [self.recommend(user_item[0]) for user_item in testSamples]

    def update_factors(self, u, i, j, update_u=True, update_i=True):
        """Apply one SGD step for the triple (user u, item i, item j).

        ``update_u`` / ``update_i`` switch the user and item-i updates off;
        item j is always updated.
        """
        update_j = True
        x = self.item_bias[i] - self.item_bias[j] \
            + np.dot(self.user_factors[u], self.item_factors[i] - self.item_factors[j])
        z = 1.0 / (1.0 + exp(x))

        # Bias terms.
        if update_i:
            d = z - self.bias_regularization * self.item_bias[i]
            self.item_bias[i] += self.learning_rate * d
        if update_j:
            d = -z - self.bias_regularization * self.item_bias[j]
            self.item_bias[j] += self.learning_rate * d

        # Latent factors. The order matters: the item steps read the user
        # vector after it has been updated.
        if update_u:
            d = (self.item_factors[i] - self.item_factors[j]) * z \
                - self.user_regularization * self.user_factors[u]
            self.user_factors[u, :] += self.learning_rate * d
        if update_i:
            d = self.user_factors[u] * z \
                - self.positive_item_regularization * self.item_factors[i]
            self.item_factors[i, :] += self.learning_rate * d
        if update_j:
            d = -self.user_factors[u] * z \
                - self.negative_item_regularization * self.item_factors[j]
            self.item_factors[j] += self.learning_rate * d

    def loss(self):
        """Return ranking loss plus half the regularisation term.

        Both parts are summed over the fixed triples in ``loss_samples``.
        """
        ranking_loss = 0
        complexity = 0
        for u, i, j in self.loss_samples:
            x = self.predict_single(u, i) - self.predict_single(u, j)
            ranking_loss += math.log(1.0 + exp(-x))
            complexity += self.user_regularization * np.dot(
                self.user_factors[u], self.user_factors[u])
            complexity += self.positive_item_regularization * np.dot(
                self.item_factors[i], self.item_factors[i])
            complexity += self.negative_item_regularization * np.dot(
                self.item_factors[j], self.item_factors[j])
            complexity += self.bias_regularization * self.item_bias[i]**2
            complexity += self.bias_regularization * self.item_bias[j]**2
        return ranking_loss + 0.5 * complexity

    def fit(self, trainSamples, trainTargets):
        """Initialise the factors and run SGD until the loss stops improving.

        Training stops early when the loss on ``loss_samples`` changes by
        less than 0.01 or increases; otherwise the learning rate is
        multiplied by 0.9 after every pass.

        NOTE(review): ``self.learning_rate`` is decayed in place, exactly
        as before, so it differs after ``fit`` returns.
        """
        self.dataModel = MemeryDataModel(trainSamples, trainTargets)
        temp = math.sqrt(self.factors)
        self.item_bias = np.zeros(self.dataModel.getItemsNum())
        self.user_factors = np.array([[
            (0.1 * random.random() / temp) for _ in range(self.factors)
        ] for _ in range(self.dataModel.getUsersNum())])
        self.item_factors = np.array([[
            (0.1 * random.random() / temp) for _ in range(self.factors)
        ] for _ in range(self.dataModel.getItemsNum())])
        # Disabled alternative from the original source: load factors
        # from the CSV files 'pu' and 'qi' via pd.read_csv(...).values[:, 1:].

        # Fixed triples on which convergence is measured.
        num_loss_samples = int(100 * self.dataModel.getUsersNum()**0.5)
        loss_sampler = UniformUserUniformItem(True)
        self.loss_samples = list(
            loss_sampler.generate_samples(self.dataModel, num_loss_samples))
        old_loss = self.loss()

        update_sampler = UniformPairWithoutReplacement(True)
        for it in range(self.iter):
            for u, i, j in update_sampler.generate_samples(self.dataModel):
                self.update_factors(u, i, j)
            # Evaluate the loss once per pass (it was recomputed up to
            # three times with identical results).
            new_loss = self.loss()
            change = new_loss - old_loss
            if abs(change) < 0.01 or change > 0:
                break
            old_loss = new_loss
            self.learning_rate *= 0.9

    def predict_single(self, uid, iid):
        """Return the score of item index ``iid`` for user index ``uid``."""
        return self.item_bias[iid] + np.dot(self.user_factors[uid],
                                            self.item_factors[iid])

    def recommend(self, u):
        """Return the ``self.n`` best-scoring items for raw user ``u``.

        Prints a notice and returns ``[]`` when the data model answers -1
        for the user.
        """
        uid = self.dataModel.getUidByUser(u)
        if uid == -1:
            print('not in test')
            return []
        predict_scores = [self.predict_single(uid, i)
                          for i in range(self.dataModel.getItemsNum())]
        # argsort is ascending; walk it backwards for the n best indices.
        topN = np.argsort(np.array(predict_scores))[-1:-self.n - 1:-1]
        return [self.dataModel.getItemByIid(i) for i in topN]

    def score(self, testSamples, trueLabels):
        """Return the F1 value of the recommendations on ``testSamples``.

        For every distinct user (column 0) the ground truth is the list of
        that user's test items (column 1) and the prediction is
        ``self.recommend(user)``. ``trueLabels`` is not read.
        """
        print('BPR scoring ...')
        # Convert once instead of twice per user.
        samples = np.array(testSamples)
        users = samples[:, 0]
        trueList = []
        recommendList = []
        for u in list(set(users)):
            rows = np.argwhere(users == u)[:, 0]
            trueList.append(list(samples[rows][:, 1]))
            recommendList.append(self.recommend(u))
        result = Eval().evalAll(recommendList, trueList)
        print('BPR result:' + '(' + str(self.get_params()) + ')' + str(
            result['F1']))
        return result['F1']
Beispiel #23
0
class ItemCF(BaseEstimator):
    """Item-based collaborative filtering on item-item similarity.

    ``fit`` fills ``simiMatrix`` with the similarity of every item pair,
    computed from the user-id collections of the two items.
    """

    def __init__(self, neighbornum=5, n=5):
        """Store hyper-parameters.

        neighbornum -- number of most similar items that are consulted.
        n           -- length of the recommendation list.
        """
        self.neighbornum = neighbornum
        self.similarity = Similarity('COSINE')
        self.n = n

    def predict(self, testSamples):
        """Return one recommendation list per row of ``testSamples``.

        Column 0 of every row is the raw user. ``recommend`` performs the
        user -> uid lookup itself, so the raw user is passed through; the
        previous code converted to a uid first and ``recommend`` then
        converted that uid a second time.
        """
        return [self.recommend(user_item[0]) for user_item in testSamples]

    def fit(self, trainSamples, trainTargets):
        """Build the data model and the symmetric item-item similarity matrix."""
        self.dataModel = MemeryDataModel(trainSamples, trainTargets)
        itemsNum = self.dataModel.getItemsNum()
        # Only the off-diagonal cells are filled; the diagonal stays 0.
        self.simiMatrix = np.zeros((itemsNum, itemsNum))
        for i in range(itemsNum):
            for j in range(i + 1, itemsNum):
                s = self.similarity.compute(self.dataModel.getUserIDsFromIid(i),
                                            self.dataModel.getUserIDsFromIid(j))
                self.simiMatrix[i][j] = self.simiMatrix[j][i] = s

    def neighborhood(self, itemID):
        """Return up to ``neighbornum`` item ids, most similar to ``itemID`` first."""
        neighbors = np.argsort(np.array(self.simiMatrix[itemID]))[-1:-self.neighbornum-1:-1]
        return neighbors

    def predict_single(self, userID, itemID):
        """Similarity-weighted sum of the user's ratings of the neighbours of ``itemID``."""
        rating = 0.0
        for iid in self.neighborhood(itemID):
            if userID in self.dataModel.getUserIDsFromIid(iid):
                rating += self.simiMatrix[itemID][iid] * self.dataModel.getRating(userID, iid)
        return rating

    def recommend(self, u):
        """Return up to ``self.n`` items for raw user ``u``, best first.

        Prints a notice and returns ``[]`` when the data model answers -1
        for the user.
        """
        userID = self.dataModel.getUidByUser(u)
        if userID == -1:
            print('not in test')
            return []
        # NOTE(review): scores are accumulated for the items the user has
        # already interacted with (keyed by iid, not by the neighbour niid);
        # kept as in the original - confirm this is intended.
        ratings = dict()
        for iid in self.dataModel.getItemIDsFromUid(userID):
            for niid in self.neighborhood(iid):
                ratings[iid] = ratings.get(iid, 0) + self.simiMatrix[iid][niid] * self.dataModel.getRating(userID, niid)
        best = sorted(ratings.items(), key=lambda kv: kv[1], reverse=True)[:self.n]
        return [self.dataModel.getItemByIid(iid) for iid, _ in best]

    def score(self, testSamples, trueLabels):
        """Return the F1 value of the recommendations on ``testSamples``.

        For every distinct user (column 0) the ground truth is the list of
        that user's test items (column 1) and the prediction is
        ``self.recommend(user)``. ``trueLabels`` is not read.
        """
        print('Item_CF scoring ...')
        # Convert once instead of twice per user.
        samples = np.array(testSamples)
        users = samples[:, 0]
        trueList = []
        recommendList = []
        for u in list(set(users)):
            rows = np.argwhere(users == u)[:, 0]
            trueList.append(list(samples[rows][:, 1]))
            recommendList.append(self.recommend(u))
        # Recommendations first, ground truth second - the argument order
        # used by every other estimator in this file (it was swapped here).
        result = Eval().evalAll(recommendList, trueList)
        print('ItemCF result:' + '(' + str(self.get_params()) + ')' + str(result['F1']))
        return result['F1']
Beispiel #24
0
class VSRankPlus(BaseEstimator):
    """Variant of the pairwise-preference neighbourhood recommender.

    ``fit`` records, per user, the ordered item pairs "a b" for which the
    user's value for item ``a`` is above the one for item ``b``, weights
    every pair with a tf * idf style score and stores the cosine
    similarity of those weighted pair vectors in ``simiMatrix``.
    ``recommend`` ranks items greedily from the pairwise preferences of
    similar users.
    """

    def __init__(self, neighbornum=5, n=5):
        """Store hyper-parameters.

        neighbornum -- number of most similar users that are consulted.
        n           -- length of the recommendation list.
        """
        print('vsrank begin')
        self.neighbornum = neighbornum
        # Not referenced by the other methods of this class (they use
        # self.cos); kept because it is part of the instance state.
        self.similarity = Similarity('COSINE')
        self.n = n
        # Only used by a sampling step that is disabled in fit().
        self.sample_rate = 5

    def predict(self, testSamples):
        """Return one recommendation list per row of ``testSamples``.

        Column 0 of every row is the raw user. ``recommend`` performs the
        user -> uid lookup itself, so the raw user is passed through; the
        previous code converted to a uid first and ``recommend`` then
        converted that uid a second time.
        """
        return [self.recommend(user_item[0]) for user_item in testSamples]

    def fit(self, trainSamples, trainTargets):
        """Build the preference sets ``self.T`` and ``self.simiMatrix``."""
        self.dataModel = MemeryDataModel(trainSamples, trainTargets, hasTimes=True)
        usersNum = self.dataModel.getUsersNum()
        itemsNum = self.dataModel.getItemsNum()

        # T[uid] maps "better worse" (item ids joined by one space) to 1 for
        # every pair of the user's items with different values.
        # (A disabled extension also paired each purchased item with
        # ``sample_rate`` randomly drawn unpurchased items.)
        self.T = [{} for _ in range(usersNum)]
        for uid in range(usersNum):
            purchased_items = self.dataModel.getItemIDsFromUid(uid)
            # Look every value up once instead of once per pair.
            ratings = [self.dataModel.getRating(uid, iid)
                       for iid in purchased_items]
            count = len(purchased_items)
            for a in range(count):
                for b in range(a + 1, count):
                    if ratings[a] > ratings[b]:
                        key = str(purchased_items[a]) + " " + str(purchased_items[b])
                    elif ratings[a] < ratings[b]:
                        key = str(purchased_items[b]) + " " + str(purchased_items[a])
                    else:
                        continue
                    self.T[uid][key] = 1

        # pair_sum[a][b] counts the users that prefer a over b.
        pair_sum = [[0] * itemsNum for _ in range(itemsNum)]
        for uid in range(usersNum):
            for pair in self.T[uid]:
                i1, i2 = pair.split(" ")
                pair_sum[int(i1)][int(i2)] += 1

        idf = {}
        for i1 in range(itemsNum):
            for i2 in range(itemsNum):
                if pair_sum[i1][i2] != 0:
                    key = str(i1) + ' ' + str(i2)
                    # Users with any preference between the two items.
                    total = pair_sum[i1][i2] + pair_sum[i2][i1]
                    alpha = log10(1 + 9.0 * total / usersNum)
                    idf[key] = alpha * log2(total * 1.0 / pair_sum[i1][i2]) + (1 - alpha)

        # W[uid][pair] = tf * idf; tf takes the sign of the value gap.
        W = [{} for _ in range(usersNum)]
        for uid in range(usersNum):
            for pair in self.T[uid]:
                i1, i2 = pair.split(" ")
                diff = self.dataModel.getRating(uid, int(i1)) - self.dataModel.getRating(uid, int(i2))
                tf = log2(1 + abs(diff))
                if diff < 0:
                    tf = -tf
                W[uid][pair] = tf * idf[pair]

        # Symmetric user-user similarity; the diagonal stays 0.
        self.simiMatrix = np.zeros((usersNum, usersNum))
        for i in range(usersNum):
            for j in range(i + 1, usersNum):
                s = self.cos(W[i], W[j])
                self.simiMatrix[i][j] = self.simiMatrix[j][i] = s

    def cos(self, dict1, dict2):
        """Cosine similarity of two sparse weight dictionaries.

        Only keys present in both dictionaries contribute to the dot
        product. Returns 0 when that product is 0.
        """
        product = 0.0
        m1 = 0.0
        for k, v in dict1.items():
            m1 += v**2
            if k in dict2:
                product += v * dict2[k]
        m2 = 0.0
        for v in dict2.values():
            m2 += v**2
        if product == 0:
            return 0
        return product / sqrt(m1) / sqrt(m2)

    def tau(self, dict1, dict2, u1, u2):
        """Placeholder; not implemented (returns None)."""
        pass

    def neighborhood(self, userID):
        """Return up to ``neighbornum`` uids, most similar to ``userID`` first."""
        neighbors = np.argsort(np.array(self.simiMatrix[userID]))[-1:-self.neighbornum-1:-1]
        return neighbors

    def predict_single(self, userID, itemID):
        """Similarity-weighted sum of the neighbours' values for ``itemID``."""
        rating = 0.0
        for uid in self.neighborhood(userID):
            if itemID in self.dataModel.getItemIDsFromUid(uid):
                rating += self.simiMatrix[userID][uid] * self.dataModel.getRating(uid, itemID)
        return rating

    def recommend(self, u):
        """Return the pairwise recommendation list for raw user ``u``.

        Prints a notice and returns ``[]`` when the data model answers -1
        for the user.
        """
        userID = self.dataModel.getUidByUser(u)
        if userID == -1:
            print('not in test')
            return []
        # recommend_listwise(userID) is the alternative strategy.
        return self.recommend_pairwise(userID)

    def recommend_pointwise(self, userID):
        """Rank items by the similarity-weighted values of the neighbours."""
        ratings = dict()
        for uid in self.neighborhood(userID):
            for iid in self.dataModel.getItemIDsFromUid(uid):
                ratings[iid] = ratings.get(iid, 0) + self.simiMatrix[userID][uid] * self.dataModel.getRating(uid, iid)
        best = sorted(ratings.items(), key=lambda kv: kv[1], reverse=True)[:self.n]
        return [self.dataModel.getItemByIid(iid) for iid, _ in best]

    def recommend_pairwise(self, userID):
        """Greedily build a ranking from pairwise preferences.

        Every item starts with twice the sum of its preferences over all
        other items. The best item is picked repeatedly and the scores of
        the remaining items are adjusted by their preference relative to
        the item just picked. At most ``min(self.n, number of items)``
        items are returned.
        """
        itemsNum = self.dataModel.getItemsNum()
        pi = [0] * itemsNum
        for i in range(itemsNum):
            total = 0
            for j in range(itemsNum):
                if j != i:
                    total += self.preference(userID, i, j)
            # Same value as the former ``sum1 - sum2`` with sum2 == -sum1.
            pi[i] = 2 * total

        remaining = set(range(itemsNum))
        rank = []
        # Capping at itemsNum avoids picking an already removed item when
        # more recommendations are requested than items exist.
        for _ in range(min(self.n, itemsNum)):
            t = np.argmax(pi)
            rank.append(t)
            remaining.remove(t)
            # -inf keeps the list numeric while excluding the picked item
            # from later argmax calls (the old code stored None here).
            pi[t] = float('-inf')
            for i in remaining:
                pi[i] += self.preference(userID, t, i) - self.preference(userID, i, t)
        return [self.dataModel.getItemByIid(i) for i in rank]

    def preference(self, uid, i1, i2):
        """Normalised preference of ``i1`` over ``i2`` among similar users.

        Only users with a recorded preference between the two items are
        considered; the ``neighbornum`` most similar of them vote with
        their similarity as weight. Returns 0 when the weights sum to 0.
        """
        keystr = str(i1) + ' ' + str(i2)
        keystr_ = str(i2) + ' ' + str(i1)
        candidates = [u for u in range(self.dataModel.getUsersNum())
                      if keystr in self.T[u] or keystr_ in self.T[u]]
        # Stable sort: equally similar users keep ascending uid order.
        voters = sorted(candidates, key=lambda u: self.simiMatrix[uid][u],
                        reverse=True)[:self.neighbornum]
        preference = 0.0
        weight = 0.0
        for u in voters:
            rating1 = self.dataModel.getRating(u, i1)
            rating2 = self.dataModel.getRating(u, i2)
            weight += self.simiMatrix[uid][u]
            if rating1 > rating2:
                preference += self.simiMatrix[uid][u]
            elif rating1 < rating2:
                preference -= self.simiMatrix[uid][u]
        if weight == 0:
            return 0
        return preference / weight

    def recommend_listwise(self, userID):
        """Rank items from the neighbours' preference counts.

        ``M[a][b]`` counts neighbours preferring a over b and is then
        closed under max-min path strength; ``rank[m]`` is the number of
        items that m beats under the closed matrix.
        """
        itemsNum = self.dataModel.getItemsNum()
        M = [[0] * itemsNum for _ in range(itemsNum)]
        for uid in self.neighborhood(userID):
            for pair in self.T[uid]:
                i1, i2 = pair.split(" ")
                M[int(i1)][int(i2)] += 1
        for m in range(itemsNum):
            for n in range(itemsNum):
                for k in range(itemsNum):
                    M[n][k] = max(M[n][k], min(M[n][m], M[m][k]))
        rank = [0] * itemsNum
        for m in range(itemsNum):
            for n in range(itemsNum):
                if n != m and M[m][n] > M[n][m]:
                    rank[m] += 1
        # NOTE(review): ascending order returns the items with the fewest
        # wins first; kept as in the original - confirm this is intended.
        order = sorted(range(itemsNum), key=lambda i: rank[i])[:self.n]
        return [self.dataModel.getItemByIid(i) for i in order]

    def score(self, testSamples, trueLabels):
        """Return the F1 value of the recommendations on ``testSamples``.

        For every distinct user (column 0) the ground truth is the list of
        that user's test items (column 1) and the prediction is
        ``self.recommend(user)``. ``trueLabels`` is not read.
        """
        print('vsrank scoring ...')
        # Convert once instead of twice per user.
        samples = np.array(testSamples)
        users = samples[:, 0]
        trueList = []
        recommendList = []
        for u in list(set(users)):
            rows = np.argwhere(users == u)[:, 0]
            trueList.append(list(samples[rows][:, 1]))
            recommendList.append(self.recommend(u))
        result = Eval().evalAll(recommendList, trueList)
        print('vsrank result:' + '(' + str(self.get_params()) + ')' + str(result['F1']))
        return result['F1']
Beispiel #25
0
class ItemCF(BaseEstimator):
    """Item-based collaborative-filtering recommender.

    ``fit`` builds a symmetric item-item similarity matrix from the users
    attached to each item; ``recommend`` scores a user's items through their
    most similar neighbours and returns the ``n`` best.

    :param neighbornum: number of most-similar items used per item.
    :param n: maximum length of each recommendation list.
    """

    def __init__(self, neighbornum=5, n=5):
        self.neighbornum = neighbornum
        self.similarity = Similarity('COSINE')
        self.n = n

    def predict(self, testSamples):
        """Return one recommendation list per row of ``testSamples``.

        Column 0 of each row is forwarded to ``recommend`` unchanged:
        ``recommend`` performs the ``getUidByUser`` lookup itself, so mapping
        the user here as well would look up an already-mapped id.
        """
        return [self.recommend(user_item[0]) for user_item in testSamples]

    def fit(self, trainSamples, trainTargets):
        """Build the data model and the item-item similarity matrix.

        Only the upper triangle is computed and then mirrored; the diagonal
        is left at zero.
        """
        self.dataModel = MemeryDataModel(trainSamples, trainTargets)
        itemsNum = self.dataModel.getItemsNum()
        self.simiMatrix = np.zeros((itemsNum, itemsNum))
        for i in range(itemsNum):
            users_i = self.dataModel.getUserIDsFromIid(i)
            for j in range(i + 1, itemsNum):
                s = self.similarity.compute(
                    users_i, self.dataModel.getUserIDsFromIid(j))
                self.simiMatrix[i][j] = self.simiMatrix[j][i] = s

    def neighborhood(self, itemID):
        """Return the indices of the ``neighbornum`` most similar items, best first."""
        order = np.argsort(self.simiMatrix[itemID])
        return order[::-1][:self.neighbornum]

    def predict_single(self, userID, itemID):
        """Similarity-weighted sum of the user's ratings over ``itemID``'s neighbours.

        Neighbours for which ``userID`` is not among
        ``getUserIDsFromIid(neighbour)`` contribute nothing.
        """
        rating = 0.0
        for iid in self.neighborhood(itemID):
            if userID in self.dataModel.getUserIDsFromIid(iid):
                rating += (self.simiMatrix[itemID][iid] *
                           self.dataModel.getRating(userID, iid))
        return rating

    def recommend(self, u):
        """Return up to ``n`` items for user ``u``, highest score first.

        :param u: user key as accepted by ``dataModel.getUidByUser``.
        :return: list of items from ``dataModel.getItemByIid``; empty when the
            lookup yields ``-1``.
        """
        userID = self.dataModel.getUidByUser(u)
        if userID == -1:
            print('not in test')
            return []

        # NOTE(review): scores are accumulated on the user's own items (iid),
        # weighted by the user's rating of each neighbour (niid).  Textbook
        # item-CF scores the neighbours instead - confirm this is intended.
        ratings = dict()
        for iid in self.dataModel.getItemIDsFromUid(userID):
            for niid in self.neighborhood(iid):
                ratings[iid] = ratings.get(iid, 0) + (
                    self.simiMatrix[iid][niid] *
                    self.dataModel.getRating(userID, niid))

        # key= with reverse=True gives the same stable ordering as the old
        # cmp-based sort and is valid on both Python 2 and 3.
        ranked = sorted(ratings.items(), key=lambda entry: entry[1],
                        reverse=True)[:self.n]
        return [self.dataModel.getItemByIid(iid) for iid, _ in ranked]

    def score(self, testSamples, trueLabels):
        """Evaluate recommendations on ``testSamples`` and return the F1 value.

        Column 0 of ``testSamples`` is read as the user and column 1 as the
        item; ``trueLabels`` is unused.
        """
        print('Item_CF scoring ...')
        samples = np.array(testSamples)  # converted once, not once per user
        users = samples[:, 0]
        trueList = []
        recommendList = []
        for u in list(set(users)):
            rows = np.argwhere(users == u)[:, 0]
            trueList.append(list(samples[rows][:, 1]))
            recommendList.append(self.recommend(u))
        # NOTE(review): another scorer in this file calls
        # evalAll(recommendList, trueList); confirm the expected order.
        result = Eval().evalAll(trueList, recommendList)
        print('ItemCF result:' + '(' + str(self.get_params()) + ')' +
              str(result['F1']))
        return result['F1']
Beispiel #26
0
    def fit(self, trainSamples, trainTargets):
        """Build per-user pairwise preferences and a user-user similarity matrix.

        Steps, in order:
          1. ``self.T[uid]``: dict keyed by ``"a b"`` (item indices) for every
             pair of the user's items where the rating of ``a`` is strictly
             greater than the rating of ``b``; ties are skipped.
          2. ``idf``: a weight per ordered pair derived from how many users
             hold that preference versus the opposite one.
          3. ``W[uid]``: per-user pair weights ``tf * idf``, where ``tf``
             grows with the rating gap.
          4. ``self.simiMatrix``: symmetric matrix of ``self.cos(W[i], W[j])``;
             the diagonal is left at zero.

        :param trainSamples: forwarded to ``MemeryDataModel``.
        :param trainTargets: forwarded to ``MemeryDataModel``.
        """
        self.dataModel = MemeryDataModel(trainSamples,
                                         trainTargets,
                                         hasTimes=True)
        usersNum = self.dataModel.getUsersNum()
        itemsNum = self.dataModel.getItemsNum()
        # Only referenced by the commented-out sampling code below.
        all_item_set = set(range(itemsNum))

        # Step 1: one preference dict per user; the stored value is always 1.
        self.T = [{} for i in range(usersNum)]
        for uid in range(usersNum):
            purchased_items = self.dataModel.getItemIDsFromUid(uid)
            for i in range(len(purchased_items)):
                for j in range(i + 1, len(purchased_items)):
                    rating_i = self.dataModel.getRating(
                        uid, purchased_items[i])
                    rating_j = self.dataModel.getRating(
                        uid, purchased_items[j])
                    # The higher-rated item always comes first in the key.
                    if rating_i > rating_j:
                        key = str(purchased_items[i]) + " " + str(
                            purchased_items[j])
                    elif rating_i < rating_j:
                        key = str(purchased_items[j]) + " " + str(
                            purchased_items[i])
                    else:
                        continue
                    self.T[uid][key] = 1
            # for i in purchased_items:
            #     purchased_items = self.dataModel.getItemIDsFromUid(uid)
            # unpurchased_items = random.sample(all_item_set.difference(purchased_items), self.sample_rate)
            # for j in unpurchased_items:
            #     key = str(i) + " " + str(j)
            #     self.T[uid][key] = 1

        # Step 2: pair_sum[a][b] = number of users whose T contains "a b".
        idf = {}
        pair_sum = [[0] * itemsNum for i in range(itemsNum)]
        for uid in range(usersNum):
            for t, times in self.T[uid].iteritems():
                i1, i2 = t.split(" ")
                pair_sum[int(i1)][int(i2)] += 1
        for i1 in range(itemsNum):
            for i2 in range(itemsNum):
                if pair_sum[i1][i2] != 0:
                    key = str(i1) + ' ' + str(i2)
                    # NOTE: this local shadows the builtin ``sum``.  It is the
                    # number of users holding either ordering of the pair.
                    sum = pair_sum[i1][i2] + pair_sum[i2][i1]
                    # alpha rises from 0 towards 1 as more users compare the pair.
                    alpha = log10(1 + 9.0 * sum / usersNum)
                    idf[key] = alpha * log2(sum * 1.0 / pair_sum[i1][i2]) + (
                        1 - alpha)

        # Step 3: weight each stored preference by rating gap (tf) times idf.
        W = [{} for i in range(usersNum)]
        for uid in range(usersNum):
            for t, times in self.T[uid].iteritems():
                i1, i2 = t.split(" ")
                diff = self.dataModel.getRating(
                    uid, int(i1)) - self.dataModel.getRating(uid, int(i2))
                # if diff != 1:
                #     print 'error!'
                tf = log2(1 + abs(diff))
                # Keys are built with the higher-rated item first, so a
                # negative diff is not expected here; the branch is a safeguard.
                if diff < 0:
                    tf = -tf
                W[uid][t] = tf * idf[t]

        # Step 4: upper triangle computed once and mirrored.
        self.simiMatrix = np.zeros((usersNum, usersNum))
        for i in range(usersNum):
            for j in range(i + 1, usersNum):
                s = self.cos(W[i], W[j])
                self.simiMatrix[i][j] = self.simiMatrix[j][i] = s