Example #1
0
    def __generateSet(self):
        """Index the training/test data and build the training matrix.

        Reads ``self.trainingData`` and ``self.testData`` (sequences of
        ``[user, item, rating]`` entries) and fills in:

        * ``self.rScale`` -- sorted list of the distinct training ratings.
        * ``self.user`` / ``self.item`` -- name -> consecutive index, in
          order of first appearance in the training data, together with
          the reverse maps ``self.id2user`` / ``self.id2item``.
        * ``self.trainingMatrix`` -- ``new_sparseMatrix.SparseMatrix`` built
          from ``[userIndex, itemIndex, normalizedRating]`` triples.
        * ``self.all_User`` / ``self.all_Item`` -- the training maps
          extended with users/items that only occur in the test data.
        * ``self.testSet_u`` / ``self.testSet_i`` -- test ratings keyed by
          user-then-item and item-then-user respectively.

        The rating stored in each training entry is overwritten in place
        with its normalized value.
        """
        # Distinct rating values seen in training, ascending: the first
        # element is the minimum rating and the last one the maximum.
        self.rScale = sorted(
            {float(rating) for _, _, rating in self.trainingData})

        triple = []
        for i, entry in enumerate(self.trainingData):
            userName, itemName, rating = entry
            # `normalize` is defined elsewhere; per the original comment it
            # maps the rating into [0, 1] given the (max, min) of the scale.
            rating = normalize(float(rating), self.rScale[-1], self.rScale[0])
            self.trainingData[i][2] = rating
            # order the user
            if userName not in self.user:
                userIndex = len(self.user)
                self.user[userName] = userIndex
                self.id2user[userIndex] = userName
            # order the item
            if itemName not in self.item:
                itemIndex = len(self.item)
                self.item[itemName] = itemIndex
                self.id2item[itemIndex] = itemName
            triple.append([self.user[userName], self.item[itemName], rating])
        self.trainingMatrix = new_sparseMatrix.SparseMatrix(triple)

        self.all_User.update(self.user)
        self.all_Item.update(self.item)
        for entry in self.testData:
            userId, itemId, rating = entry
            # Membership is tested against the *all_* maps (which already
            # contain every training name).  Testing against the training
            # maps would hand a test-only user/item a new index every time
            # it shows up again, leaving gaps and colliding indices.
            if userId not in self.all_User:
                self.all_User[userId] = len(self.all_User)
            if itemId not in self.all_Item:
                self.all_Item[itemId] = len(self.all_Item)

            # Test ratings are stored as given (not normalized).
            self.testSet_u.setdefault(userId, {})[itemId] = rating
            self.testSet_i.setdefault(itemId, {})[userId] = rating
Example #2
0
    def __generateSet(self):
        """Index the training/test data into user- and item-keyed sets.

        Reads ``self.trainingData`` and ``self.testData`` (sequences of
        ``[user, item, rating]`` entries) and fills in:

        * ``self.rScale`` -- sorted list of the distinct training ratings.
        * ``self.user`` / ``self.item`` -- name -> consecutive index, in
          order of first appearance in the training data, together with
          the reverse maps ``self.id2user`` / ``self.id2item``.
        * ``self.trainSet_u`` / ``self.trainSet_i`` -- normalized training
          ratings keyed by user-then-item and item-then-user.
        * ``self.all_User`` / ``self.all_Item`` -- the training maps
          extended with users/items that only occur in the test data.
        * ``self.testSet_u`` / ``self.testSet_i`` -- test ratings (as given)
          keyed by user-then-item and item-then-user.

        The rating stored in each training entry is overwritten in place
        with its normalized value.

        NOTE(review): the nested assignments below assume the four
        ``*Set_*`` containers create inner dicts on first access (e.g.
        ``defaultdict(dict)``) -- confirm against the constructor.
        """
        # Distinct rating values seen in training, ascending: the first
        # element is the minimum rating and the last one the maximum.
        self.rScale = sorted(
            {float(rating) for _, _, rating in self.trainingData})

        for i, entry in enumerate(self.trainingData):
            userName, itemName, rating = entry
            # `normalize` is defined elsewhere; per the original comment it
            # maps the rating into [0, 1] given the (max, min) of the scale.
            rating = normalize(float(rating), self.rScale[-1], self.rScale[0])
            self.trainingData[i][2] = rating
            # order the user
            if userName not in self.user:
                userIndex = len(self.user)
                self.user[userName] = userIndex
                self.id2user[userIndex] = userName
            # order the item
            if itemName not in self.item:
                itemIndex = len(self.item)
                self.item[itemName] = itemIndex
                self.id2item[itemIndex] = itemName
            self.trainSet_u[userName][itemName] = rating
            self.trainSet_i[itemName][userName] = rating

        self.all_User.update(self.user)
        self.all_Item.update(self.item)
        for entry in self.testData:
            userName, itemName, rating = entry
            # Membership is tested against the *all_* maps (which already
            # contain every training name).  Testing against the training
            # maps would hand a test-only user/item a new index every time
            # it shows up again, leaving gaps and colliding indices.
            if userName not in self.all_User:
                self.all_User[userName] = len(self.all_User)
            if itemName not in self.all_Item:
                self.all_Item[itemName] = len(self.all_Item)

            self.testSet_u[userName][itemName] = rating
            self.testSet_i[itemName][userName] = rating
Example #3
0
    def __loadRatings(self, file, bTest=False):
        """Load a rating file and register its users, items and ratings.

        Each line is split on spaces, commas or tabs; the positions of the
        user, item and rating fields come from the ``-columns`` entry of
        ``self.ratingConfig``.  When the config contains ``-header`` the
        first line of the file is skipped.

        Side effects (in both modes):

        * ``self.rScale[0]`` is raised to the largest rating seen and
          ``self.rScale[1]`` lowered to the smallest one.
        * unseen users/items receive the next free index in
          ``self.user`` / ``self.item``.

        In training mode ``[userId, itemId, normalizedRating]`` is also
        appended to ``self.triple`` for every line.

        Args:
            file: path of the rating file to read.
            bTest: ``False`` to load training data, ``True`` for test data.

        Returns:
            Training mode: a ``new_sparseMatrix.SparseMatrix`` built from
            ``[userIndex, itemIndex, normalizedRating]`` triples.
            Test mode: a pair ``(u_i_r, i_u_r)`` where ``u_i_r`` maps each
            user id to a list of ``[itemId, rating]`` and ``i_u_r`` maps
            each item id to a list of ``[userId, rating]``; these ratings
            are the raw float values, not the normalized ones.

        A line (or column configuration) with too few fields prints an
        error naming the line index (counted after the optional header is
        removed) and terminates the program with ``exit(-1)``.
        """
        if not bTest:
            print('load training data...')
        else:
            print('load test data...')
        with open(file) as f:
            ratings = f.readlines()
        # ignore the headline
        if self.ratingConfig.contains('-header'):
            ratings = ratings[1:]
        # order of the columns: positions of user, item and rating fields
        order = self.ratingConfig['-columns'].strip().split()
        columns = [int(position) for position in order[:3]]

        # First pass: parse every line once, validate it and widen the
        # rating scale, so that normalization below sees the final scale.
        records = []
        for lineNo, line in enumerate(ratings):
            items = split(' |,|\t', line.strip())
            # Checking the field count of the line itself (not only the
            # column configuration) turns a short or blank line into the
            # intended format error instead of a bare IndexError.
            if len(columns) < 3 or max(columns) >= len(items):
                print('The rating file is not in a correct format. '
                      'Error: Line num %d' % lineNo)
                exit(-1)
            userId = items[columns[0]]
            itemId = items[columns[1]]
            rating = float(items[columns[2]])
            if rating > self.rScale[0]:
                self.rScale[0] = rating
            if rating < self.rScale[1]:
                self.rScale[1] = rating
            records.append((userId, itemId, rating))

        # Second pass: index users/items and collect the ratings.
        u_i_r = {}
        i_u_r = {}
        triple = []
        for userId, itemId, rating in records:
            # `normalize` is defined elsewhere; per the original comment it
            # maps the rating into [0, 1] given the (max, min) of the scale.
            normRating = normalize(rating, self.rScale[0], self.rScale[1])
            # order the user
            if userId not in self.user:
                self.user[userId] = len(self.user)
            # order the item
            if itemId not in self.item:
                self.item[itemId] = len(self.item)
            u_i_r.setdefault(userId, []).append([itemId, rating])
            i_u_r.setdefault(itemId, []).append([userId, rating])
            if not bTest:
                self.triple.append([userId, itemId, normRating])
                triple.append(
                    [self.user[userId], self.item[itemId], normRating])

        if not bTest:
            # construct the sparse matrix
            return new_sparseMatrix.SparseMatrix(triple)
        else:
            # return testSet
            return u_i_r, i_u_r