Code example #1
File: file.py Project: CostanzaS/Thesis
 def loadDataSet(conf, file, bTest=False, binarized=False, threshold=3.0):
     trainingData = []
     testData = []
     ratingConfig = LineConfig(conf['ratings.setup'])
     if not bTest:
         print('loading training data...')
     else:
         print('loading test data...')
     with open(file) as f:
         ratings = f.readlines()
     # skip the header line
     if ratingConfig.contains('-header'):
         ratings = ratings[1:]
     # order of the columns
     order = ratingConfig['-columns'].strip().split()
     delim = ' |,|\t'
     if ratingConfig.contains('-delim'):
         delim = ratingConfig['-delim']
     for lineNo, line in enumerate(ratings):
         items = split(delim, line.strip())
         if not bTest and len(order) < 2:
             print(
                 'The rating file is not in a correct format. Error: Line num %d'
                 % lineNo)
             exit(-1)
         try:
             userId = items[int(order[0])]
             itemId = items[int(order[1])]
             if len(order) < 3:
                 rating = 1  #default value
             else:
                 rating = items[int(order[2])]
             if binarized:
                 if float(items[int(order[2])]) < threshold:
                     continue
                 else:
                     rating = 1
         except ValueError:
             print(
                 'Error! Have you added the option -header to the rating.setup?'
             )
             exit(-1)
         if not bTest:
             trainingData.append([userId, itemId, float(rating)])
         else:
             if binarized:
                 # in binarized mode low ratings were already skipped above
                 if rating == 1:
                     testData.append([userId, itemId, float(rating)])
             else:
                 testData.append([userId, itemId, float(rating)])
     if not bTest:
         return trainingData
     else:
         return testData
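
Every example in this collection reads flag-style options ('-columns', '-header', '-delim', ...) through the project's LineConfig helper, and `split` in the loop above is `re.split` imported at module level. LineConfig itself is not shown here; the following is only a minimal sketch, assuming a whitespace-separated '-flag value ...' syntax, of how such a parser could behave.

# Minimal sketch of a LineConfig-style option parser.  This is an assumption
# about its behaviour, not the project's actual implementation.
class SimpleLineConfig(object):
    def __init__(self, content):
        self.options = {}
        tokens = content.strip().split()
        i = 0
        while i < len(tokens):
            if tokens[i].startswith('-'):
                # everything up to the next flag is this flag's value
                values = []
                j = i + 1
                while j < len(tokens) and not tokens[j].startswith('-'):
                    values.append(tokens[j])
                    j += 1
                self.options[tokens[i]] = ' '.join(values)
                i = j
            else:
                i += 1

    def contains(self, key):
        return key in self.options

    def __getitem__(self, key):
        return self.options[key]

# usage: opts = SimpleLineConfig('-columns 0 1 2 -header')
#        opts.contains('-header') -> True, opts['-columns'] -> '0 1 2'
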
Code example #2
    def execute(self, config, max_sample=1000):
        # import the algorithm module

        importStr = 'from algorithm.ranking.' + config[
            'recommender'] + ' import ' + config['recommender']
        exec(importStr)

        algo_evaluation = LineConfig(config['evaluation.setup'])
        if algo_evaluation.contains('-ul') and eval(
                algo_evaluation['-ul']) > 0:
            training_data = 'self.training_user_item'
            social_info = 'relation=self.relation'
        else:
            training_data = 'self.training_account_item'
            social_info = ''

        if config['recommender'].startswith('ABPR'):
            recommender = config['recommender'] + '(config, {}, self.test_user_item, {}, C={}, N={})'. \
                format(training_data, social_info, self.C, self.N)
        else:
            recommender = config['recommender'] + '(config, {}, self.test_user_item, {})'.\
                format(training_data, social_info)

        algorithum = eval(recommender)
        algorithum.accountDAO = self.accountDAO
        algorithum.evaluation_conf = algo_evaluation
        algorithum.get_test_map(K=self.K, L=self.L)
        algorithum.get_test_sample_data(max_sample=max_sample)

        algorithum.execute()
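
The exec/eval string building above works under Python 2 but is fragile in a Python 3 method body, where exec cannot reliably bind new local names for a later eval to see. Below is a sketch of the same "load the recommender class named in the config" step using importlib, assuming the algorithm.ranking.<Name> module layout implied by the import string.

import importlib

def load_recommender_class(name, package='algorithm.ranking'):
    # e.g. name = 'SomeRecommender' is the equivalent of:
    #     from algorithm.ranking.SomeRecommender import SomeRecommender
    module = importlib.import_module('{}.{}'.format(package, name))
    return getattr(module, name)

# hypothetical usage:
# RecommenderClass = load_recommender_class(config['recommender'])
# recommender = RecommenderClass(config, training_data, self.test_user_item, relation)
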
Code example #3
 def loadRelationship(conf, filePath):
     socialConfig = LineConfig(conf['social.setup'])
     relation = []
     print('loading social data...')
     with open(filePath) as f:
         relations = f.readlines()
         # skip the header line
     if socialConfig.contains('-header'):
         relations = relations[1:]
     # order of the columns
     order = socialConfig['-columns'].strip().split()
     if len(order) <= 2:
         print('The social file is not in a correct format.')
     for lineNo, line in enumerate(relations):
         items = split(' |,|\t', line.strip())
         if len(order) < 2:
             print(
                 'The social file is not in a correct format. Error: Line num %d'
                 % lineNo)
             exit(-1)
         userId1 = items[int(order[0])]
         userId2 = items[int(order[1])]
         if len(order) < 3:
             weight = 1
         else:
             weight = float(items[int(order[2])])
         relation.append([userId1, userId2, weight])
     return relation
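
loadRelationship depends on the project's LineConfig and on `split` (re.split) being imported at module level. The self-contained sketch below performs the same parsing on a small trust file without those helpers; the file name and column order are illustrative assumptions.

from re import split

def load_trust_file(path, columns=(0, 1, 2), has_header=False):
    # parse "follower followee [weight]" lines separated by spaces, commas or tabs
    relation = []
    with open(path) as f:
        lines = f.readlines()
    if has_header:
        lines = lines[1:]
    for line in lines:
        items = split(' |,|\t', line.strip())
        user_a, user_b = items[columns[0]], items[columns[1]]
        if len(columns) > 2 and len(items) > columns[2]:
            weight = float(items[columns[2]])
        else:
            weight = 1.0
        relation.append([user_a, user_b, weight])
    return relation

# usage: load_trust_file('trust.txt') on a tab-separated file of
#        "u1<TAB>u2<TAB>1.0" lines returns [['u1', 'u2', 1.0], ...]
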
Code example #4
    def __init__(self,
                 config_dict,
                 config_account,
                 account_DAO,
                 C=3,
                 K=1,
                 L=-1,
                 N=0):
        self.trainingData = []  # training data
        self.testData = []  # testData
        self.relation = []
        self.measure = []
        self.config_dict = config_dict
        self.C = C
        self.K = K
        self.L = L
        self.N = N

        self.accountDAO = account_DAO

        if config_account.contains('evaluation.setup'):
            all_evaluation = LineConfig(config_account['evaluation.setup'])
            if all_evaluation.contains('--account'):
                self.training_user_item = account_DAO.training_user_item
                self.training_account_item = account_DAO.training_account_item
                self.relation = account_DAO.relation
                self.test_user_item = account_DAO.test_user_item
        else:
            raise Exception('Evaluation is not well configured!')

        print('preprocessing...')
Code example #5
    def __init__(self,config):
        self.trainingData = []  # training data
        self.testData = []  # testData
        self.measure = []
        self.config =config
        setup = LineConfig(config['record.setup'])
        columns = {}
        labels = setup['-columns'].split(',')
        delim = ''
        if setup.contains('-delim'):
            delim=setup['-delim']
        for col in labels:
            label = col.split(':')
            columns[label[0]] = int(label[1])
        if self.config.contains('evaluation.setup'):
            self.evaluation = LineConfig(config['evaluation.setup'])
            binarized = False
            bottom = 0
            if self.evaluation.contains('-b'):
                binarized = True
                bottom = float(self.evaluation['-b'])
            if self.evaluation.contains('-testSet'):
                #specify testSet

                self.trainingData = FileIO.loadDataSet(config['record'],columns=columns,binarized=binarized,threshold=bottom,delim=delim)
                self.testData = FileIO.loadDataSet(self.evaluation['-testSet'],binarized=binarized,columns=columns,threshold=bottom,delim=delim)

            elif self.evaluation.contains('-ap'):
                #auto partition

                self.trainingData = FileIO.loadDataSet(config['record'],columns=columns,binarized=binarized,threshold=bottom,delim=delim)
                self.trainingData,self.testData = DataSplit.\
                    dataSplit(self.trainingData,test_ratio=float(self.evaluation['-ap']))

            elif self.evaluation.contains('-byTime'):
                self.trainingData = FileIO.loadDataSet(config['record'], columns=columns, binarized=binarized,threshold=bottom, delim=delim)
                self.testData = []

            elif self.evaluation.contains('-cv'):
                #cross validation
                self.trainingData = FileIO.loadDataSet(config['record'],columns=columns,binarized=binarized,threshold=bottom,delim=delim)
                #self.trainingData,self.testData = DataSplit.crossValidation(self.trainingData,int(self.evaluation['-cv']))

        else:
            print 'Evaluation is not well configured!'
            exit(-1)

        # if config.contains('social'):
        #     self.socialConfig = LineConfig(self.config['social.setup'])
        #     self.relation = FileIO.loadRelationship(config,self.config['social'])

        print 'preprocessing...'
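
For reference, the options this constructor reads ('-columns' as name:index pairs, '-delim', and one of '-testSet', '-ap', '-byTime' or '-cv', optionally combined with '-b') could be supplied as below. The keys, paths and values are illustrative assumptions inferred from the code; in the project they live in a Config object rather than a plain dict.

# Illustrative configuration values, inferred from the options read in __init__.
config = {
    'record': 'dataset/listening_events.txt',                    # hypothetical path
    'record.setup': '-columns user:0,track:1,rating:2 -delim ,',
    # exactly one evaluation mode:
    'evaluation.setup': '-ap 0.2 -b 3',       # hold out 20% for test, binarize at 3.0
    # 'evaluation.setup': '-testSet dataset/test.txt -b 3',
    # 'evaluation.setup': '-cv 5',
}
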
Code example #6
    def loadDataSet(conf, file, bTest=False):
        trainingData = defaultdict(dict)
        testData = defaultdict(dict)
        ratingConfig = LineConfig(conf['ratings.setup'])
        if not bTest:
            print('loading training data...')
        else:
            print('loading test data...')
        with open(file) as f:
            ratings = f.readlines()
        # skip the header line
        if ratingConfig.contains('-header'):
            ratings = ratings[1:]
        # order of the columns
        order = ratingConfig['-columns'].strip().split()

        for lineNo, line in enumerate(ratings):
            items = split(' |,|\t', line.strip())
            if not bTest and len(order) < 3:
                print(
                    'The rating file is not in a correct format. Error: Line num %d'
                    % lineNo)
                exit(-1)
            try:
                userId = items[int(order[0])]
                itemId = items[int(order[1])]
                if bTest and len(order) < 3:
                    rating = 1  #default value
                else:
                    rating = items[int(order[2])]

            except ValueError:
                print(
                    'Error! Have you added the option -header to the rating.setup?'
                )
                exit(-1)
            if not bTest:
                trainingData[userId][itemId] = float(rating)
            else:
                testData[userId][itemId] = float(rating)
        if not bTest:
            return trainingData
        else:
            return testData
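
Unlike code example #1, which returns a flat list of [user, item, rating] triples, this loader returns a nested {user: {item: rating}} dict. Converting between the two representations is straightforward; a small sketch:

from collections import defaultdict

def triples_to_nested(triples):
    # [[user, item, rating], ...] -> {user: {item: rating}}
    nested = defaultdict(dict)
    for user, item, rating in triples:
        nested[user][item] = float(rating)
    return nested

def nested_to_triples(nested):
    # {user: {item: rating}} -> [[user, item, rating], ...]
    return [[user, item, rating]
            for user, items in nested.items()
            for item, rating in items.items()]
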
Code example #7
class Recommender(object):
    def __init__(self, conf, trainingSet, testSet, fold='[1]'):
        self.config = conf
        self.data = None
        self.isSaveModel = False
        self.ranking = None
        self.isLoadModel = False
        self.output = None
        self.isOutput = True
        self.data = RatingDAO(self.config, trainingSet, testSet)
        self.foldInfo = fold
        self.evalSettings = LineConfig(self.config['evaluation.setup'])
        self.measure = []
        self.record = []
        if self.evalSettings.contains('-cold'):
            #evaluation on cold-start users
            threshold = int(self.evalSettings['-cold'])
            removedUser = {}
            for user in self.data.testSet_u:
                if user in self.data.trainSet_u and len(
                        self.data.trainSet_u[user]) > threshold:
                    removedUser[user] = 1

            for user in removedUser:
                del self.data.testSet_u[user]

            testData = []
            for item in self.data.testData:
                if item[0] not in removedUser:
                    testData.append(item)
            self.data.testData = testData

        self.num_users, self.num_items, self.train_size = self.data.trainingSize(
        )

    def readConfiguration(self):
        self.algorName = self.config['recommender']
        self.output = LineConfig(self.config['output.setup'])
        self.isOutput = self.output.isMainOn()
        self.ranking = LineConfig(self.config['item.ranking'])

    def printAlgorConfig(self):
        "show algorithm's configuration"
        print('Algorithm:', self.config['recommender'])
        print('Ratings dataset:', abspath(self.config['ratings']))
        if LineConfig(self.config['evaluation.setup']).contains('-testSet'):
            print(
                'Test set:',
                abspath(
                    LineConfig(self.config['evaluation.setup']).getOption(
                        '-testSet')))
        #print 'Count of the users in training set: ',len()
        print(
            'Training set size: (user count: %d, item count %d, record count: %d)'
            % (self.data.trainingSize()))
        print(
            'Test set size: (user count: %d, item count %d, record count: %d)'
            % (self.data.testSize()))
        print('=' * 80)

    def initModel(self):
        pass

    def buildModel(self):
        'build the model (for model-based algorithms )'
        pass

    def buildModel_tf(self):
        'training model on tensorflow'
        pass

    def saveModel(self):
        pass

    def loadModel(self):
        pass

    def predict(self, u, i):
        pass

    def predictForRanking(self, u):
        pass

    def checkRatingBoundary(self, prediction):
        if prediction > self.data.rScale[-1]:
            return self.data.rScale[-1]
        elif prediction < self.data.rScale[0]:
            return self.data.rScale[0]
        else:
            return round(prediction, 3)

    def evalRatings(self):
        res = []  #used to contain the text of the result
        res.append('userId  itemId  original  prediction\n')
        #predict
        for ind, entry in enumerate(self.data.testData):
            user, item, rating = entry

            #predict
            prediction = self.predict(user, item)
            #denormalize
            #prediction = denormalize(prediction,self.data.rScale[-1],self.data.rScale[0])
            #####################################
            pred = self.checkRatingBoundary(prediction)
            # add prediction in order to measure
            self.data.testData[ind].append(pred)
            res.append(user + ' ' + item + ' ' + str(rating) + ' ' +
                       str(pred) + '\n')
        currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time()))
        #output prediction result
        if self.isOutput:
            outDir = self.output['-dir']
            fileName = self.config[
                'recommender'] + '@' + currentTime + '-rating-predictions' + self.foldInfo + '.txt'
            FileIO.writeFile(outDir, fileName, res)
            print('The result has been output to ', abspath(outDir), '.')
        #output evaluation result
        outDir = self.output['-dir']
        fileName = self.config[
            'recommender'] + '@' + currentTime + '-measure' + self.foldInfo + '.txt'
        self.measure = Measure.ratingMeasure(self.data.testData)
        FileIO.writeFile(outDir, fileName, self.measure)
        print('The result of %s %s:\n%s' %
              (self.algorName, self.foldInfo, ''.join(self.measure)))

    def evalRanking(self):
        res = []  # used to contain the text of the result

        if self.ranking.contains('-topN'):
            top = self.ranking['-topN'].split(',')
            top = [int(num) for num in top]
            N = int(top[-1])
            if N > 100 or N < 0:
                print(
                    'N cannot be larger than 100! It has been reset to 10'
                )
                N = 10
            if N > len(self.data.item):
                N = len(self.data.item)
        else:
            print('No correct evaluation metric is specified!')
            exit(-1)

        res.append(
            'userId: recommendations in (itemId, ranking score) pairs, * means the item matches.\n'
        )
        # predict
        recList = {}
        userN = {}
        userCount = len(self.data.testSet_u)
        #rawRes = {}
        for i, user in enumerate(self.data.testSet_u):
            itemSet = {}
            line = user + ':'
            predictedItems = self.predictForRanking(user)
            # predictedItems = denormalize(predictedItems, self.data.rScale[-1], self.data.rScale[0])
            for id, rating in enumerate(predictedItems):
                # if not self.data.rating(user, self.data.id2item[id]):
                # prediction = self.checkRatingBoundary(prediction)
                # pred = self.checkRatingBoundary(prediction)
                #####################################
                # add prediction in order to measure

                itemSet[self.data.id2item[id]] = rating

            ratedList, ratingList = self.data.userRated(user)
            for item in ratedList:
                del itemSet[item]

            Nrecommendations = []
            for item in itemSet:
                if len(Nrecommendations) < N:
                    Nrecommendations.append((item, itemSet[item]))
                else:
                    break

            Nrecommendations.sort(key=lambda d: d[1], reverse=True)
            recommendations = [item[1] for item in Nrecommendations]
            resNames = [item[0] for item in Nrecommendations]

            # find the N biggest scores
            for item in itemSet:
                ind = N
                l = 0
                r = N - 1

                if recommendations[r] < itemSet[item]:
                    while r >= l:
                        mid = (r - l) // 2 + l
                        if recommendations[mid] >= itemSet[item]:
                            l = mid + 1
                        elif recommendations[mid] < itemSet[item]:
                            r = mid - 1

                        if r < l:
                            ind = r
                            break
                #move the items backwards
                if ind < N - 2:
                    recommendations[ind + 2:] = recommendations[ind + 1:-1]
                    resNames[ind + 2:] = resNames[ind + 1:-1]
                if ind < N - 1:
                    recommendations[ind + 1] = itemSet[item]
                    resNames[ind + 1] = item

            recList[user] = list(zip(resNames, recommendations))

            if i % 100 == 0:
                print(self.algorName, self.foldInfo,
                      'progress:' + str(i) + '/' + str(userCount))
            for item in recList[user]:
                line += ' (' + item[0] + ',' + str(item[1]) + ')'
                if item[0] in self.data.testSet_u[user]:
                    line += '*'

            line += '\n'
            res.append(line)
        currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time()))
        # output prediction result
        if self.isOutput:
            fileName = ''
            outDir = self.output['-dir']
            fileName = self.config[
                'recommender'] + '@' + currentTime + '-top-' + str(
                    N) + 'items' + self.foldInfo + '.txt'
            FileIO.writeFile(outDir, fileName, res)
            print('The result has been output to ', abspath(outDir), '.')
        # output evaluation result
        outDir = self.output['-dir']
        fileName = self.config[
            'recommender'] + '@' + currentTime + '-measure' + self.foldInfo + '.txt'
        self.measure = Measure.rankingMeasure(self.data.testSet_u, recList,
                                              top)
        FileIO.writeFile(outDir, fileName, self.measure)
        print('The result of %s %s:\n%s' %
              (self.algorName, self.foldInfo, ''.join(self.measure)))

    def execute(self):
        self.readConfiguration()
        if self.foldInfo == '[1]':
            self.printAlgorConfig()
        #load model from disk or build model
        if self.isLoadModel:
            print('Loading model %s...' % (self.foldInfo))
            self.loadModel()
        else:
            print('Initializing model %s...' % (self.foldInfo))
            self.initModel()
            print('Building Model %s...' % (self.foldInfo))
            try:
                import tensorflow
                if self.evalSettings.contains('-tf'):
                    self.buildModel_tf()
                else:
                    self.buildModel()
            except ImportError:
                self.buildModel()

        #predict the ratings or item ranking
        print('Predicting %s...' % (self.foldInfo))
        if self.ranking.isMainOn():
            self.evalRanking()
        else:
            self.evalRatings()

        #save model
        if self.isSaveModel:
            print('Saving model %s...' % (self.foldInfo))
            self.saveModel()
        # with open(self.foldInfo+'measure.txt','w') as f:
        #     f.writelines(self.record)
        return self.measure
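
The subtlest part of evalRanking above is the manual top-N bookkeeping: it seeds a list with the first N candidate items, sorts it, and binary-searches an insertion point for every remaining item. Functionally this just selects the N highest-scoring items the user has not rated; a compact, easier-to-verify equivalent (not the project's code) using heapq:

import heapq

def top_n_items(item_scores, rated_items, n):
    # item_scores: {item: predicted score}
    # rated_items: items already seen in training, which must not be recommended
    rated = set(rated_items)
    candidates = ((item, score) for item, score in item_scores.items()
                  if item not in rated)
    return heapq.nlargest(n, candidates, key=lambda pair: pair[1])

# usage: top_n_items({'a': 0.9, 'b': 0.1, 'c': 0.7}, ['b'], 2)
#        -> [('a', 0.9), ('c', 0.7)]
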
Code example #8
File: RecQ.py Project: SuperSupeng/pythonIsAmazing
class RecQ(object):
    def __init__(self, config):
        self.trainingData = []  # training data
        self.testData = []  # testData
        self.relation = []
        self.measure = []
        self.config = config
        self.ratingConfig = LineConfig(config['ratings.setup'])
        if self.config.contains('evaluation.setup'):
            self.evaluation = LineConfig(config['evaluation.setup'])
            binarized = False
            bottom = 0
            if self.evaluation.contains('-b'):
                binarized = True
                bottom = float(self.evaluation['-b'])
            if self.evaluation.contains('-testSet'):
                #specify testSet

                self.trainingData = FileIO.loadDataSet(config,
                                                       config['ratings'],
                                                       binarized=binarized,
                                                       threshold=bottom)
                self.testData = FileIO.loadDataSet(config,
                                                   self.evaluation['-testSet'],
                                                   bTest=True,
                                                   binarized=binarized,
                                                   threshold=bottom)

            elif self.evaluation.contains('-ap'):
                #auto partition

                self.trainingData = FileIO.loadDataSet(config,
                                                       config['ratings'],
                                                       binarized=binarized,
                                                       threshold=bottom)
                self.trainingData,self.testData = DataSplit.\
                    dataSplit(self.trainingData,test_ratio=float(self.evaluation['-ap']),binarized=binarized)
            elif self.evaluation.contains('-cv'):
                #cross validation
                self.trainingData = FileIO.loadDataSet(config,
                                                       config['ratings'],
                                                       binarized=binarized,
                                                       threshold=bottom)
                #self.trainingData,self.testData = DataSplit.crossValidation(self.trainingData,int(self.evaluation['-cv']))

        else:
            print('Evaluation is not well configured!')
            exit(-1)

        if config.contains('social'):
            self.socialConfig = LineConfig(self.config['social.setup'])
            self.relation = FileIO.loadRelationship(config,
                                                    self.config['social'])

        print('preprocessing...')

    def execute(self):
        #import the algorithm module
        try:
            importStr = 'from algorithm.rating.' + self.config[
                'recommender'] + ' import ' + self.config['recommender']
            exec(importStr)
        except ImportError:
            importStr = 'from algorithm.ranking.' + self.config[
                'recommender'] + ' import ' + self.config['recommender']
            exec(importStr)
        if self.evaluation.contains('-cv'):
            k = int(self.evaluation['-cv'])
            if k <= 1 or k > 10:
                k = 3

            mkl.set_num_threads(max(1, mkl.get_max_threads() // k))

            #create the manager
            manager = Manager()
            m = manager.dict()
            i = 1
            tasks = []

            binarized = False
            if self.evaluation.contains('-b'):
                binarized = True

            for train, test in DataSplit.crossValidation(self.trainingData,
                                                         k,
                                                         binarized=binarized):
                fold = '[' + str(i) + ']'
                if self.config.contains('social'):
                    recommender = self.config[
                        'recommender'] + "(self.config,train,test,self.relation,fold)"
                else:
                    recommender = self.config[
                        'recommender'] + "(self.config,train,test,fold)"
            #create the process
                p = Process(target=run, args=(m, eval(recommender), i))
                tasks.append(p)
                i += 1
            #start the processes
            for p in tasks:
                p.start()
                if not self.evaluation.contains('-p'):
                    p.join()
            #wait until all processes are completed
            if self.evaluation.contains('-p'):
                for p in tasks:
                    p.join()
            #compute the mean error of k-fold cross validation
            self.measure = [dict(m)[i] for i in range(1, k + 1)]
            res = []
            for i in range(len(self.measure[0])):
                if self.measure[0][i][:3] == 'Top':
                    res.append(self.measure[0][i])
                    continue
                measure = self.measure[0][i].split(':')[0]
                total = 0
                for j in range(k):
                    total += float(self.measure[j][i].split(':')[1])
                res.append(measure + ':' + str(total / k) + '\n')
            #output result
            currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time()))
            outDir = LineConfig(self.config['output.setup'])['-dir']
            fileName = self.config[
                'recommender'] + '@' + currentTime + '-' + str(
                    k) + '-fold-cv' + '.txt'
            FileIO.writeFile(outDir, fileName, res)
            print('The result of %d-fold cross validation:\n%s' %
                  (k, ''.join(res)))

        else:
            if self.config.contains('social'):
                recommender = self.config[
                    'recommender'] + '(self.config,self.trainingData,self.testData,self.relation)'
            else:
                recommender = self.config[
                    'recommender'] + '(self.config,self.trainingData,self.testData)'
            eval(recommender).execute()
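
In the '-cv' branch above, each fold is run in its own Process and the per-fold measures are collected through a shared Manager dict keyed by fold number. Below is a minimal, self-contained sketch of that orchestration pattern, with a toy worker standing in for the recommender.

from multiprocessing import Manager, Process

def run_fold(results, fold_id):
    # stand-in for: recommender.execute() on one train/test fold
    results[fold_id] = 'fold %d done\n' % fold_id

if __name__ == '__main__':
    manager = Manager()
    measures = manager.dict()
    tasks = []
    k = 3                                      # e.g. 3-fold cross validation
    for fold_id in range(1, k + 1):
        p = Process(target=run_fold, args=(measures, fold_id))
        tasks.append(p)
        p.start()
    for p in tasks:
        p.join()
    # read the fold results back in order, as the code above does
    print(''.join(dict(measures)[i] for i in range(1, k + 1)))
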
Code example #9
class Recommender(object):
    def __init__(self, conf, trainingSet, testSet, fold='[1]'):
        self.currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time()))
        self.config = conf
        self.data = None
        self.isSaveModel = False
        self.ranking = None
        self.isLoadModel = False
        self.output = None
        self.isOutput = True

        self.data = RatingDAO(self.config, trainingSet, testSet)
        self.foldInfo = fold
        self.evalSettings = LineConfig(self.config['evaluation.setup'])
        self.measure = []
        self.record = []
        if self.evalSettings.contains('-cold'):
            # evaluation on cold-start users
            threshold = int(self.evalSettings['-cold'])
            removedUser = {}
            for user in self.data.testSet_u:
                if user in self.data.trainSet_u and len(
                        self.data.trainSet_u[user]) > threshold:
                    removedUser[user] = 1

            for user in removedUser:
                del self.data.testSet_u[user]

            testData = []
            for item in self.data.testData:
                if item[0] not in removedUser:
                    testData.append(item)
            self.data.testData = testData

        self.num_users, self.num_items, self.train_size = self.data.trainingSize(
        )

    def get_test_sample_data(self, max_sample=1000):

        testSample = {}
        keys = list(self.data.testSet_u.keys())
        if len(self.data.testSet_u) <= max_sample:
            testSample = self.data.testSet_u
        else:
            while True:
                if len(testSample) == max_sample:
                    break
                index = np.random.choice(len(self.data.testSet_u))
                user = keys[index]
                testSample[user] = self.data.testSet_u[user]

        self.testSample = testSample

    def get_test_map(self, K=1, L=-1):
        self.K = K
        self.L = L
        if not hasattr(self, 'accountDAO') or self.accountDAO is None:
            self.map_from_true_to_identify = {
                i: i
                for i in list(self.data.testSet_u.keys())
            }
        elif self.evaluation_conf.contains('-ul') and eval(
                self.evaluation_conf['-ul']) > 0:
            self.map_from_true_to_identify = self.get_map_from_true_to_identify(
                k=K, index=L)
        else:
            self.map_from_true_to_identify = self.accountDAO.map_from_user_to_account

    def readConfiguration(self):
        self.algorName = self.config['recommender']
        self.output = LineConfig(self.config['output.setup'])
        self.isOutput = self.output.isMainOn()
        self.ranking = LineConfig(self.config['item.ranking'])

    def printAlgorConfig(self):
        "show algorithm's configuration"
        print('Algorithm:', self.config['recommender'])
        print('Ratings dataset:', abspath(self.config['ratings']))
        if LineConfig(self.config['evaluation.setup']).contains('-testSet'):
            print(
                'Test set:',
                abspath(
                    LineConfig(self.config['evaluation.setup']).getOption(
                        '-testSet')))
        # print 'Count of the users in training set: ',len()
        print(
            'Training set size: (user count: %d, item count %d, record count: %d)'
            % (self.data.trainingSize()))
        print(
            'Test set size: (user count: %d, item count %d, record count: %d)'
            % (self.data.testSize()))
        print('=' * 80)

    def initModel(self):
        pass

    def buildModel(self):
        'build the model (for model-based algorithms )'
        pass

    def buildModel_tf(self):
        'training model on tensorflow'
        pass

    def saveModel(self):
        pass

    def loadModel(self):
        pass

    def predict(self, u, i):
        pass

    def predictForRanking(self, u):
        pass

    def checkRatingBoundary(self, prediction):
        if prediction > self.data.rScale[-1]:
            return self.data.rScale[-1]
        elif prediction < self.data.rScale[0]:
            return self.data.rScale[0]
        else:
            return round(prediction, 3)

    def evalRatings(self):
        res = []  # used to contain the text of the result
        res.append('userId  itemId  original  prediction\n')
        # predict
        for ind, entry in enumerate(self.data.testData):
            user, item, rating = entry

            # predict
            prediction = self.predict(user, item)
            # denormalize
            # prediction = denormalize(prediction,self.data.rScale[-1],self.data.rScale[0])
            #####################################
            pred = self.checkRatingBoundary(prediction)
            # add prediction in order to measure
            self.data.testData[ind].append(pred)
            res.append(user + ' ' + item + ' ' + str(rating) + ' ' +
                       str(pred) + '\n')
        currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time()))
        # output prediction result
        if self.isOutput:
            outDir = self.output['-dir']
            fileName = self.config[
                'recommender'] + '@' + currentTime + '-rating-predictions' + self.foldInfo + '.txt'
            FileIO.writeFile(outDir, fileName, res)
            print('The result has been output to ', abspath(outDir), '.')
        # output evaluation result
        outDir = self.output['-dir']
        fileName = self.config[
            'recommender'] + '@' + currentTime + '-measure' + self.foldInfo + '.txt'
        self.measure = Measure.ratingMeasure(self.data.testData)
        FileIO.writeFile(outDir, fileName, self.measure)
        print('The result of %s %s:\n%s' %
              (self.algorName, self.foldInfo, ''.join(self.measure)))

    def get_map_from_true_to_identify(self, k=1, index=-1):
        map_from_true_to_identify = {}
        table = self.accountDAO.test_table[self.accountDAO.test_table.k == k]

        reserve_list = [
            ind for ind, users in enumerate(table['identify_user'])
            if len(users)
        ]
        table = table.iloc[reserve_list].copy()

        table['identify_user_index'] = [
            i_list[index] if len(i_list) and len(i_list) >= index + 1 else None
            for i_list in table['identify_user']
        ]
        # table['identify_user_index'].astype(int)
        # table.groupby

        identify_list = table.groupby(
            'truth_user')['identify_user_index'].aggregate(list)

        for truth, idens in identify_list.items():
            i_users, counts = np.unique(np.array(idens)[np.array(idens) > 0],
                                        return_counts=True)
            if len(i_users) == 0:
                continue
            map_from_true_to_identify[truth] = i_users[np.argmax(counts)]

        # identification_result = dict(zip(table['truth_user'].to_list(), table['identify_user'].to_list()))
        # for key, value in identification_result.items():
        #     if len(value) and len(value) >= index + 1:
        #         try:
        #             map_from_true_to_identify[key] = value[index]
        #         except:
        #             print(key, value)
        #             map_from_true_to_identify[key] = value[index]

        return map_from_true_to_identify

    def get_recommendation(self, data_user, N):
        user, identified_user, testSample_user = data_user
        itemSet = {}
        line = str(user) + ':'
        predictedItems = self.predictForRanking(identified_user)

        for id, rating in enumerate(predictedItems):
            itemSet[self.data.id2item[id]] = rating

        # if not hasattr(self, 'accountDAO') or self.accountDAO is None:
        #     ratedList, ratingList = self.data.userRated(user)
        # else:
        #     ratedList = list(self.accountDAO.ground_visit[user].keys())
        # for item in ratedList:
        #     del itemSet[item]

        Nrecommendations = []

        for item in itemSet:
            if len(Nrecommendations) < N:
                Nrecommendations.append((item, itemSet[item]))
            else:
                break

        # Nrecommendations = list(itemSet.items())[:N]

        Nrecommendations.sort(key=lambda d: d[1], reverse=True)
        recommendations = [item[1] for item in Nrecommendations]
        resNames = [item[0] for item in Nrecommendations]

        # find the N biggest scores
        for item in itemSet:
            ind = N
            l = 0
            r = N - 1

            if recommendations[r] < itemSet[item]:
                while r >= l:
                    mid = (r - l) // 2 + l
                    if recommendations[mid] >= itemSet[item]:
                        l = mid + 1
                    elif recommendations[mid] < itemSet[item]:
                        r = mid - 1

                    if r < l:
                        ind = r
                        break
            # move the items backwards
            if ind < N - 2:
                recommendations[ind + 2:] = recommendations[ind + 1:-1]
                resNames[ind + 2:] = resNames[ind + 1:-1]
            if ind < N - 1:
                recommendations[ind + 1] = itemSet[item]
                resNames[ind + 1] = item

        # recList[user] = list(zip(resNames, recommendations))

        # recList[user] = list(itemSet_sorted.items())[:N]

        recList_user = list(zip(resNames, recommendations))

        for item in recList_user:
            line += ' (' + str(item[0]) + ',' + str(item[1]) + ')'
            if item[0] in testSample_user:
                line += '*'

        line += '\n'

        return user, line, recList_user

    def evalRanking(self, write_to_file=True, use_now_time=False):
        res = []  # used to contain the text of the result

        if self.ranking.contains('-topN'):
            top = self.ranking['-topN'].split(',')
            top = [int(num) for num in top]
            N = max(top)
            if N > 100 or N < 0:
                print(
                    'N cannot be larger than 100! It has been reset to 10'
                )
                N = 10
            if N > len(self.data.item):
                N = len(self.data.item)
        else:
            print('No correct evaluation metric is specified!')
            exit(-1)

        res.append(
            'userId: recommendations in (itemId, ranking score) pairs, * means the item matches.\n'
        )
        # predict
        recList = {}
        userN = {}

        testSample = self.testSample

        # # multiprocessing way
        # pool = Pool(12)
        # dataset = []
        # for user, testSample_u in testSample.items():
        #     identified_user = self.map_from_true_to_identify.get(user, -1)
        #     if identified_user == -1:
        #         continue
        #     dataset.append([user, identified_user, testSample_u])
        #
        # result_generator = pool.imap_unordered(partial(self.get_recommendation, N=N), dataset)
        # for result in tqdm(result_generator, total=len(dataset), desc='Measuring [{}]'):
        #     user, line, recList_user = result
        #     recList[user] = recList_user
        #     res.append(line)
        # pool.close()
        # pool.join()

        testSample_copy = testSample.copy()

        for i, user in tqdm(enumerate(testSample),
                            total=len(testSample),
                            desc='Measuring [{}]'.format(self.algorName)):
            identified_user = self.map_from_true_to_identify.get(user, -1)
            if identified_user == -1:
                del testSample_copy[user]
                continue
            user, line, recList_user = self.get_recommendation(
                (user, identified_user, testSample[user]), N)

            recList[user] = recList_user
            res.append(line)

        self.measure = Measure.rankingMeasure(testSample_copy, recList, top)
        try:
            self.measure.append("C:{}\n".format(self.C))
        except:
            pass
        try:
            self.measure.append("L:{}\n".format(self.L))
        except:
            pass
        try:
            self.measure.append("K:{}\n".format(self.K))
        except:
            pass
        try:
            self.measure.append("N:{}\n".format(self.N))
        except:
            pass

        if use_now_time:
            currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time()))
        else:
            currentTime = self.currentTime
        if write_to_file:
            # output prediction result
            if False and self.isOutput:
                fileName = ''
                outDir = self.output['-dir']
                fileName = self.config[
                    'recommender'] + '@' + currentTime + '-top-' + str(
                        N) + 'items' + self.foldInfo + '.txt'
                FileIO.writeFile(outDir, fileName, res)
            # output evaluation result
            outDir = self.output['-dir']
            try:
                fileName = self.config[
                    'recommender'] + '@' + currentTime + '-measure' + self.foldInfo + '_C{}'.format(
                        self.C) + '.txt'
            except:
                fileName = self.config[
                    'recommender'] + '@' + currentTime + '-measure' + self.foldInfo + '.txt'
            FileIO.writeFile(outDir, fileName, self.measure)
            # FileIO.writeFile(outDir, fileName, "C:{}".format(self.C))

            print('The result has been output to ', abspath(outDir), '.')
        print('The result of %s %s:\n%s' %
              (self.algorName, self.foldInfo, ''.join(self.measure)))

    def execute(self):
        self.readConfiguration()
        if self.foldInfo == '[1]':
            self.printAlgorConfig()
        # load model from disk or build model
        if self.isLoadModel:
            print('Loading model %s...' % (self.foldInfo))
            self.loadModel()
        else:
            print('Initializing model %s...' % (self.foldInfo))
            self.initModel()
            print('Building Model %s...' % (self.foldInfo))
            try:
                import tensorflow
                if self.evalSettings.contains('-tf'):
                    self.buildModel_tf()
                else:
                    self.buildModel()
            except ImportError:
                self.buildModel()

        # predict the ratings or item ranking
        print('Predicting %s...' % (self.foldInfo))
        if self.ranking.isMainOn():
            self.evalRanking()
        else:
            self.evalRatings()

        # save model
        if self.isSaveModel:
            print('Saving model %s...' % (self.foldInfo))
            self.saveModel()
        # with open(self.foldInfo+'measure.txt','w') as f:
        #     f.writelines(self.record)
        return self.measure
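
get_test_sample_data above keeps drawing random test users until it has collected max_sample distinct ones. A shorter way to draw the same number of distinct users in one call (a sketch, not the project's code):

import numpy as np

def sample_test_users(test_set_u, max_sample=1000):
    # test_set_u: {user: {item: rating}} test dictionary
    if len(test_set_u) <= max_sample:
        return dict(test_set_u)
    users = np.random.choice(list(test_set_u.keys()), size=max_sample, replace=False)
    return {user: test_set_u[user] for user in users}
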
Code example #10
File: rating.py Project: hyliqd/RecQ
class RatingDAO(object):
    'data access control'

    def __init__(self, config):
        self.config = config
        self.ratingConfig = LineConfig(config['ratings.setup'])
        self.evaluation = LineConfig(config['evaluation.setup'])
        self.user = {}  #used to store the order of users
        self.item = {}  #used to store the order of items
        self.userMeans = {}  #used to store the mean values of users' ratings
        self.itemMeans = {}  #used to store the mean values of items' ratings
        self.triple = []  #training data
        self.globalMean = 0
        self.timestamp = {}
        self.ratingMatrix = None
        self.trainingMatrix = None
        self.validationMatrix = None
        self.testSet_u = None  # used to store the test set by hierarchy user:[item,rating]
        self.testSet_i = None  # used to store the test set by hierarchy item:[user,rating]
        self.rScale = [-9999999, 999999]
        if self.evaluation.contains('-testSet'):
            #specify testSet
            self.trainingMatrix = self.__loadRatings(config['ratings'])
            self.testSet_u, self.testSet_i = self.__loadRatings(
                self.evaluation['-testSet'], True)
        else:  #cross validation and leave-one-out
            self.ratingMatrix = self.__loadRatings(config['ratings'])
        self.__computeItemMean()
        self.__computeUserMean()
        self.__globalAverage()

    def __loadRatings(self, file, bTest=False):
        if not bTest:
            print 'load training data...'
        else:
            print 'load test data...'
        with open(file) as f:
            ratings = f.readlines()
        #skip the header line
        if self.ratingConfig.contains('-header'):
            ratings = ratings[1:]
        #order of the columns
        order = self.ratingConfig['-columns'].strip().split()
        #split data
        #userList= []
        u_i_r = {}
        i_u_r = {}
        triple = []
        #find the maximum rating and minimum value
        for lineNo, line in enumerate(ratings):
            items = split(' |,|\t', line.strip())
            if len(order) < 3:
                print 'The rating file is not in a correct format. Error: Line num %d' % lineNo
                exit(-1)
            userId = items[int(order[0])]
            itemId = items[int(order[1])]
            rating = items[int(order[2])]
            if float(rating) > self.rScale[0]:
                self.rScale[0] = float(rating)
            if float(rating) < self.rScale[1]:
                self.rScale[1] = float(rating)

        for lineNo, line in enumerate(ratings):
            items = split(' |,|\t', line.strip())
            if len(order) < 3:
                print 'The rating file is not in a correct format. Error: Line num %d' % lineNo
                exit(-1)
            userId = items[int(order[0])]
            itemId = items[int(order[1])]
            rating = items[int(order[2])]

            #scale the rating into the range [0, 1]
            normRating = normalize(float(rating), self.rScale[0],
                                   self.rScale[1])
            #order the user
            if not self.user.has_key(userId):
                self.user[userId] = len(self.user)
            #order the item
            if not self.item.has_key(itemId):
                self.item[itemId] = len(self.item)
            if not u_i_r.has_key(userId):
                u_i_r[userId] = []
                #userList.append(userId)
            u_i_r[userId].append([itemId, float(rating)])
            if not i_u_r.has_key(itemId):
                i_u_r[itemId] = []
            i_u_r[itemId].append([userId, float(rating)])
            if not bTest:
                self.triple.append([userId, itemId, normRating])
                triple.append(
                    [self.user[userId], self.item[itemId], normRating])

        if not bTest:
            #construct the sparse matrix
            # data=[]
            # indices=[]
            # indptr=[]
            # offset = 0
            # for uid in userList:
            #     uRating = [r[1] for r in u_i_r[uid]]
            #     uColunms = [self.item[r[0]] for r in u_i_r[uid]]
            #     data += uRating
            #     indices += uColunms
            #     indptr .append(offset)
            #     offset += len(uRating)
            # indptr.append(offset)
            # return sparseMatrix.SparseMatrix(data, indices, indptr)
            return new_sparseMatrix.SparseMatrix(triple)
        else:
            # return testSet
            return u_i_r, i_u_r

    def __globalAverage(self):
        total = sum(self.userMeans.values())
        if total == 0:
            self.globalMean = 0
        else:
            self.globalMean = total / len(self.userMeans)

    def __computeUserMean(self):
        for u in self.user:
            n = self.row(u) > 0
            mean = 0

            if not self.containsUser(
                    u):  # no data about current user in training set
                pass
            else:
                sum = float(self.row(u)[0].sum())
                try:
                    mean = sum / n[0].sum()
                except ZeroDivisionError:
                    mean = 0
            self.userMeans[u] = mean

    def __computeItemMean(self):
        for c in self.item:
            n = self.col(c) > 0
            mean = 0
            if not self.containsItem(
                    c):  # no data about current item in training set
                pass
            else:
                sum = float(self.col(c)[0].sum())
                try:
                    mean = sum / n[0].sum()
                except ZeroDivisionError:
                    mean = 0
            self.itemMeans[c] = mean

    def getUserId(self, u):
        if self.user.has_key(u):
            return self.user[u]
        else:
            return -1

    def getItemId(self, i):
        if self.item.has_key(i):
            return self.item[i]
        else:
            return -1

    def trainingSize(self):
        return self.trainingMatrix.size

    def testSize(self):
        return (len(self.testSet_u), len(self.testSet_i))

    def contains(self, u, i):
        'whether user u rated item i'
        return self.trainingMatrix.contains(self.getUserId(u),
                                            self.getItemId(i))

    def containsUser(self, u):
        'whether user is in training set'
        return self.trainingMatrix.matrix_User.has_key(self.getUserId(u))

    def containsItem(self, i):
        'whether item is in training set'
        return self.trainingMatrix.matrix_Item.has_key(self.getItemId(i))

    def userRated(self, u):
        if self.trainingMatrix.matrix_User.has_key(self.getUserId(u)):
            userIndex = self.trainingMatrix.matrix_User[self.user[u]].keys()
            rating = self.trainingMatrix.matrix_User[self.user[u]].values()
            return (userIndex, rating)
        return ([], [])

    def itemRated(self, i):
        if self.trainingMatrix.matrix_Item.has_key(self.getItemId(i)):
            itemIndex = self.trainingMatrix.matrix_Item[self.item[i]].keys()
            rating = self.trainingMatrix.matrix_Item[self.item[i]].values()
            return (itemIndex, rating)
        return ([], [])

    def row(self, u):
        return self.trainingMatrix.row(self.getUserId(u))

    def col(self, c):
        return self.trainingMatrix.col(self.getItemId(c))

    def rating(self, u, c):
        return self.trainingMatrix.elem(self.getUserId(u), self.getItemId(c))

    def ratingScale(self):
        return (self.rScale[0], self.rScale[1])

    def elemCount(self):
        return self.trainingMatrix.elemCount()
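
The normalize() call in __loadRatings (and the denormalize() call seen in the Recommender examples) is not shown in these snippets. A plain min-max version is sketched below; note that in this class rScale[0] ends up holding the largest observed rating and rScale[1] the smallest, so the argument order of the project's own helpers may differ.

def min_max_normalize(value, low, high):
    # map a rating from [low, high] onto [0, 1]
    if high == low:
        return 0.0
    return (float(value) - low) / (high - low)

def min_max_denormalize(value, low, high):
    # inverse mapping from [0, 1] back onto the original rating scale
    return float(value) * (high - low) + low
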
Code example #11
File: data.py Project: zhangshengnan1993/RecQ
class ratingDAO(object):
    'data access control'

    def __init__(self, config):
        self.config = config
        self.ratingConfig = LineConfig(config['ratings.setup'])
        self.evaluation = LineConfig(config['evaluation'])
        self.user = {}
        self.item = {}
        self.timestamp = {}
        self.ratingMatrix = None
        self.trainingMatrix = None
        self.validationMatrix = None
        self.testSet_u = None  # used to store the test set by hierarchy user:[item,rating]
        self.testSet_i = None  # used to store the test set by hierarchy item:[user,rating]
        self.rScale = [-9999999, 999999]
        if self.evaluation.contains('-testSet'):
            #specify testSet
            self.trainingMatrix = self.loadRatings(config['ratings'])
            self.testSet_u, self.testSet_i = self.loadRatings(
                self.evaluation['-testSet'], True)

        else:  #cross validation and leave-one-out
            self.ratingMatrix = self.loadRatings(config['ratings'])

    def loadRatings(self, file, bTest=False):
        with open(file) as f:
            ratings = f.readlines()
        #skip the header line
        if self.ratingConfig.contains('-header'):
            ratings = ratings[1:]
        #set delimiter
        delimiter = ' '
        if self.ratingConfig.contains('-d'):
            delimiter = self.ratingConfig['-d']
        #order of the columns
        order = self.ratingConfig['-columns'].strip().split()
        #split data
        userList = []
        u_i_r = {}
        i_u_r = {}
        triple = []
        for line in ratings:
            items = line.strip().split(delimiter)
            userId = items[int(order[0])]
            itemId = items[int(order[1])]
            rating = items[int(order[2])]
            if float(rating) > self.rScale[0]:
                self.rScale[0] = float(rating)
            if float(rating) < self.rScale[1]:
                self.rScale[1] = float(rating)
            #order the user
            if not self.user.has_key(userId):
                self.user[userId] = len(self.user)
            #order the item
            if not self.item.has_key(itemId):
                self.item[itemId] = len(self.item)
            if not u_i_r.has_key(userId):
                u_i_r[userId] = []
                userList.append(userId)
            u_i_r[userId].append([itemId, float(rating)])
            if not i_u_r.has_key(itemId):
                i_u_r[itemId] = []
            i_u_r[itemId].append([userId, float(rating)])
            triple.append(
                [self.user[userId], self.item[itemId],
                 float(rating)])

        if not bTest:
            #construct the sparse matrix
            # data=[]
            # indices=[]
            # indptr=[]
            # offset = 0
            # for uid in userList:
            #     uRating = [r[1] for r in u_i_r[uid]]
            #     uColunms = [self.item[r[0]] for r in u_i_r[uid]]
            #     data += uRating
            #     indices += uColunms
            #     indptr .append(offset)
            #     offset += len(uRating)
            # indptr.append(offset)
            # return sparseMatrix.SparseMatrix(data, indices, indptr)
            return new_sparseMatrix.SparseMatrix(
                triple, (len(self.user), len(self.item)))
        else:
            # return testSet
            return u_i_r, i_u_r

    def row(self, u):
        return self.trainingMatrix.row(self.user[u])

    def col(self, c):
        return self.trainingMatrix.col(self.item[c])

    def rating(self, u, c):
        return self.trainingMatrix.elem(self.user[u], self.item[c])

    def ratingScale(self):
        return (self.rScale[0], self.rScale[1])
Code example #12
class Record(object):
    'data access control'

    def __init__(self, config, trainingSet, testSet):
        self.config = config
        self.recordConfig = LineConfig(config['record.setup'])
        self.evalConfig = LineConfig(config['evaluation.setup'])
        self.name2id = defaultdict(dict)
        self.id2name = defaultdict(dict)
        self.listened = {}
        self.listened['artist'] = defaultdict(dict)
        self.listened['track'] = defaultdict(dict)
        self.listened['album'] = defaultdict(dict)
        self.artist2Album = defaultdict(
            dict)  #key:artist id, value:{album id1:1, album id2:1 ...}
        self.album2Track = defaultdict(dict)  #
        self.artist2Track = defaultdict(dict)  #
        self.userRecord = defaultdict(
            list)  #user data in training set. form: {user:[record1,record2]}
        self.testSet = defaultdict(
            dict
        )  #user data in test set. form: {user:{recommenedObject1:1,recommendedObject:1}}
        self.recordCount = 0
        self.columns = {}
        labels = self.recordConfig['-columns'].split(',')
        for col in labels:
            label = col.split(':')
            self.columns[label[0]] = int(label[1])
        if self.evalConfig.contains('-byTime'):
            trainingSet, testSet = self.splitDataByTime(trainingSet)
        self.preprocess(trainingSet, testSet)

    def splitDataByTime(self, dataset):
        trainingSet = []
        testSet = []
        ratio = float(self.evalConfig['-byTime'])
        records = defaultdict(list)
        for event in dataset:
            records[event['user']].append(event)
        for user in records:
            orderedList = sorted(records[user], key=lambda d: d['time'])
            training = orderedList[0:int(len(orderedList) * (1 - ratio))]
            test = orderedList[int(len(orderedList) * (1 - ratio)):]
            trainingSet += training
            testSet += test
        return trainingSet, testSet

    def preprocess(self, trainingSet, testSet):
        for entry in trainingSet:
            self.recordCount += 1
            for key in entry:
                if key != 'time':
                    if not self.name2id[key].has_key(entry[key]):
                        self.name2id[key][entry[key]] = len(self.name2id[key])
                        self.id2name[key][len(self.id2name[key])] = entry[key]

                if key == 'user':
                    self.userRecord[entry['user']].append(entry)
                    if entry.has_key('artist'):
                        if not self.listened['artist'][
                                entry['artist']].has_key(entry[key]):
                            self.listened['artist'][entry['artist']][
                                entry[key]] = 0
                        else:
                            self.listened['artist'][entry['artist']][
                                entry[key]] += 1
                    if entry.has_key('album'):
                        if not self.listened['album'][entry['album']].has_key(
                                entry[key]):
                            self.listened['album'][entry['album']][
                                entry[key]] = 0
                        else:
                            self.listened['album'][entry['album']][
                                entry[key]] += 1
                    if entry.has_key('track'):
                        if not self.listened['track'][entry['track']].has_key(
                                entry[key]):
                            self.listened['track'][entry['track']][
                                entry[key]] = 0
                        else:
                            self.listened['track'][entry['track']][
                                entry[key]] += 1
                if key == 'artist' and entry.has_key('album'):
                    self.artist2Album[entry[key]][entry['album']] = 1
                if key == 'album' and entry.has_key('track'):
                    self.album2Track[entry[key]][entry['track']] = 1
                if key == 'artist' and entry.has_key('track'):
                    self.artist2Track[entry[key]][entry['track']] = 1

        recType = self.evalConfig['-target']
        for entry in testSet:
            for key in entry:
                if key != 'time':
                    if not self.name2id[key].has_key(entry[key]):
                        self.name2id[key][entry[key]] = len(self.name2id[key])
                        self.id2name[key][len(self.id2name[key])] = entry[key]
                if key == 'user':
                    if entry.has_key(recType):
                        self.testSet[entry['user']][entry[recType]] = 1

    def printTrainingSize(self):
        if self.name2id.has_key('user'):
            print 'user count:', len(self.name2id['user'])
        if self.name2id.has_key('artist'):
            print 'artist count:', len(self.name2id['artist'])
        if self.name2id.has_key('album'):
            print 'album count:', len(self.name2id['album'])
        if self.name2id.has_key('track'):
            print 'track count:', len(self.name2id['track'])
        print 'Training set size:', self.recordCount

    def getId(self, obj, t):
        if self.name2id[t].has_key(obj):
            return self.name2id[t][obj]
        else:
            print 'No ' + t + ' ' + obj + ' exists!'
            exit(-1)

    def getSize(self, t):
        return len(self.name2id[t])
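The splitDataByTime method above keeps the oldest (1 - ratio) share of each user's events for training and holds out the newest share for testing. A minimal standalone sketch of the same per-user chronological split, with illustrative names and a toy record layout (not code from the project itself):

from collections import defaultdict

def split_by_time(events, test_ratio=0.2):
    # events: list of dicts that contain at least a 'user' and a 'time' field
    by_user = defaultdict(list)
    for event in events:
        by_user[event['user']].append(event)
    training, test = [], []
    for user, records in by_user.items():
        records.sort(key=lambda d: d['time'])
        cut = int(len(records) * (1 - test_ratio))
        training += records[:cut]
        test += records[cut:]
    return training, test

# toy usage: the most recent listens of each user end up in the test split
events = [{'user': 'u1', 'track': 't1', 'time': 1},
          {'user': 'u1', 'track': 't2', 'time': 2},
          {'user': 'u1', 'track': 't3', 'time': 3}]
training, test = split_by_time(events, test_ratio=0.34)
print(training, test)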
Code example #13
class Recommender(object):
    def __init__(self, conf, trainingSet=None, testSet=None, fold='[1]'):
        self.config = conf
        self.dao = None
        self.isSaveModel = False
        self.ranking = None
        self.isLoadModel = False
        self.output = None
        self.isOutput = True
        self.dao = RatingDAO(self.config, trainingSet, testSet)
        self.foldInfo = fold
        self.measure = []

    def readConfiguration(self):
        self.algorName = self.config['recommender']
        self.output = LineConfig(self.config['output.setup'])
        self.isOutput = self.output.isMainOn()
        self.ranking = LineConfig(self.config['item.ranking'])

    def printAlgorConfig(self):
        "show algorithm's configuration"
        print 'Algorithm:', self.config['recommender']
        print 'Ratings dataset:', abspath(self.config['ratings'])
        if LineConfig(self.config['evaluation.setup']).contains('-testSet'):
            print 'Test set:', abspath(
                LineConfig(
                    self.config['evaluation.setup']).getOption('-testSet'))
        #print 'Count of the users in training set: ',len()
        print 'Training set size: (user count: %d, item count: %d, record count: %d)' % (
            self.dao.trainingSize())
        print 'Test set size: (user count: %d, item count: %d, record count: %d)' % (
            self.dao.testSize())
        print '=' * 80

    def initModel(self):
        pass

    def buildModel(self):
        'build the model (for model-based algorithms )'
        pass

    def saveModel(self):
        pass

    def loadModel(self):
        pass

    def predict(self, u, i):
        pass

    def predictForRanking(self, u):
        pass

    def checkRatingBoundary(self, prediction):
        if prediction > self.dao.rScale[-1]:
            return self.dao.rScale[-1]
        elif prediction < self.dao.rScale[0]:
            return self.dao.rScale[0]
        else:
            return round(prediction, 3)

    def evalRatings(self):
        res = []  #used to contain the text of the result
        res.append('userId  itemId  original  prediction\n')
        #predict
        for ind, entry in enumerate(self.dao.testData):
            user, item, rating = entry

            #predict
            prediction = self.predict(user, item)
            #denormalize
            prediction = denormalize(prediction, self.dao.rScale[-1],
                                     self.dao.rScale[0])
            #####################################
            pred = self.checkRatingBoundary(prediction)
            # add prediction in order to measure
            self.dao.testData[ind].append(pred)
            res.append(user + ' ' + item + ' ' + str(rating) + ' ' +
                       str(pred) + '\n')
        currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time()))
        #output prediction result
        if self.isOutput:
            outDir = self.output['-dir']
            fileName = self.config[
                'recommender'] + '@' + currentTime + '-rating-predictions' + self.foldInfo + '.txt'
            FileIO.writeFile(outDir, fileName, res)
            print 'The Result has been output to ', abspath(outDir), '.'
        #output evaluation result
        outDir = self.output['-dir']
        fileName = self.config[
            'recommender'] + '@' + currentTime + '-measure' + self.foldInfo + '.txt'
        self.measure = Measure.ratingMeasure(self.dao.testData)
        FileIO.writeFile(outDir, fileName, self.measure)

    def evalRanking(self):
        res = []  # used to contain the text of the result
        N = 0
        threshold = 0
        bThres = False
        bTopN = False
        if self.ranking.contains('-topN'):
            bTopN = True
            N = int(self.ranking['-topN'])
            if N > 100 or N < 0:
                print 'N can not be larger than 100! It has been reassigned with 100'
                N = 100
        elif self.ranking.contains('-threshold'):
            threshold = float(self.ranking['-threshold'])
            bThres = True
        else:
            print 'No correct evaluation metric is specified!'
            exit(-1)

        res.append(
            'userId: recommendations in (itemId, ranking score) pairs, * means the item matches.\n'
        )
        # predict
        recList = {}
        userN = {}
        userCount = len(self.dao.testSet_u)
        for i, user in enumerate(self.dao.testSet_u):
            itemSet = {}
            line = user + ':'

            for item in self.dao.item:
                # predict
                prediction = self.predict(user, item)
                # denormalize

                prediction = denormalize(prediction, self.dao.rScale[-1],
                                         self.dao.rScale[0])

                #prediction = self.checkRatingBoundary(prediction)
                #pred = self.checkRatingBoundary(prediction)
                #####################################
                # add prediction in order to measure
                if bThres:
                    if prediction > threshold:
                        itemSet[item] = prediction
                else:
                    itemSet[item] = prediction

            ratedList, ratingList = self.dao.userRated(user)
            for item in ratedList:
                del itemSet[self.dao.id2item[item]]
            itemSet = sorted(itemSet.iteritems(),
                             key=lambda d: d[1],
                             reverse=True)
            if self.ranking.contains('-topN'):
                recList[user] = itemSet[0:N]
            elif self.ranking.contains('-threshold'):
                recList[user] = itemSet[:]
                userN[user] = len(itemSet)

            if i % 100 == 0:
                print self.algorName, self.foldInfo, 'progress:' + str(
                    i) + '/' + str(userCount)
            for item in recList[user]:
                line += ' (' + item[0] + ',' + str(item[1]) + ')'
                if self.dao.testSet_u[user].has_key(item[0]):
                    line += '*'

            line += '\n'
            res.append(line)
        currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time()))
        # output prediction result
        if self.isOutput:
            fileName = ''
            outDir = self.output['-dir']
            if self.ranking.contains('-topN'):
                fileName = self.config[
                    'recommender'] + '@' + currentTime + '-top-' + str(
                        N) + 'items' + self.foldInfo + '.txt'
            elif self.ranking.contains('-threshold'):
                fileName = self.config[
                    'recommender'] + '@' + currentTime + '-threshold-' + str(
                        threshold) + self.foldInfo + '.txt'
            FileIO.writeFile(outDir, fileName, res)
            print 'The Result has been output to ', abspath(outDir), '.'
        #output evaluation result
        outDir = self.output['-dir']
        fileName = self.config[
            'recommender'] + '@' + currentTime + '-measure' + self.foldInfo + '.txt'
        if self.ranking.contains('-topN'):
            self.measure = Measure.rankingMeasure(self.dao.testSet_u, recList,
                                                  N)
        elif self.ranking.contains('-threshold'):
            origin = self.dao.testSet_u.copy()
            for user in origin:
                temp = {}
                for item in origin[user]:
                    if origin[user][item] >= threshold:
                        temp[item] = threshold
                origin[user] = temp
            self.measure = Measure.rankingMeasure_threshold(
                origin, recList, userN)
        FileIO.writeFile(outDir, fileName, self.measure)

    def execute(self):
        self.readConfiguration()
        if self.foldInfo == '[1]':
            self.printAlgorConfig()
        #load model from disk or build model
        if self.isLoadModel:
            print 'Loading model %s...' % (self.foldInfo)
            self.loadModel()
        else:
            print 'Initializing model %s...' % (self.foldInfo)
            self.initModel()
            print 'Building Model %s...' % (self.foldInfo)
            self.buildModel()

        #predict the ratings or item ranking
        print 'Predicting %s...' % (self.foldInfo)
        if self.ranking.isMainOn():
            self.evalRanking()
        else:
            self.evalRatings()

        #save model
        if self.isSaveModel:
            print 'Saving model %s...' % (self.foldInfo)
            self.saveModel()

        return self.measure

    def performance(self):
        #res = []  # used to contain the text of the result
        #res.append('userId  itemId  original  prediction\n')
        # predict
        res = []
        for ind, entry in enumerate(self.dao.testData):
            user, item, rating = entry

            # predict
            prediction = self.predict(user, item)
            # denormalize
            prediction = denormalize(prediction, self.dao.rScale[-1],
                                     self.dao.rScale[0])
            #####################################
            pred = self.checkRatingBoundary(prediction)
            # add prediction in order to measure
            res.append([user, item, rating, pred])
            #res.append(user + ' ' + item + ' ' + str(rating) + ' ' + str(pred) + '\n')
        #currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time()))
        # output prediction result
        # if self.isOutput:
        #     outDir = self.output['-dir']
        #     fileName = self.config['recommender'] + '@' + currentTime + '-rating-predictions' + self.foldInfo + '.txt'
        #     FileIO.writeFile(outDir, fileName, res)
        #     print 'The Result has been output to ', abspath(outDir), '.'
        # output evaluation result
        # outDir = self.output['-dir']
        # fileName = self.config['recommender'] + '@' + currentTime + '-measure' + self.foldInfo + '.txt'
        self.measure = Measure.ratingMeasure(res)
        return self.measure
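Both evalRatings and performance above call a denormalize helper whose definition does not appear in this listing. The sketch below is only an assumption about what such a min-max rescaling utility typically does; the argument order mirrors the calls above (prediction, upper bound of the rating scale, lower bound):

def denormalize(prediction, max_val, min_val):
    # map a prediction from [0, 1] back onto the rating scale [min_val, max_val] (assumed behaviour)
    return min_val + prediction * (max_val - min_val)

def normalize(rating, max_val, min_val):
    # inverse mapping, useful when a model is trained on [0, 1] targets (assumed behaviour)
    return (rating - min_val) / float(max_val - min_val)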
Code example #14
File: recommender.py Project: yuyu2223/Yue
class Recommender(object):
    def __init__(self, conf, trainingSet=None, testSet=None, fold='[1]'):
        self.config = conf
        self.isSaveModel = False
        self.isLoadModel = False
        self.isOutput = True
        self.data = Record(self.config, trainingSet, testSet)
        self.foldInfo = fold
        self.evalConfig = LineConfig(self.config['evaluation.setup'])
        if self.evalConfig.contains('-target'):
            self.recType = self.evalConfig['-target']
        else:
            self.recType = 'track'
        if LineConfig(self.config['evaluation.setup']).contains('-cold'):
            #evaluation on cold-start users
            threshold = int(
                LineConfig(self.config['evaluation.setup'])['-cold'])
            removedUser = {}
            for user in self.data.testSet:
                if self.data.userRecord.has_key(user) and len(
                        self.data.userRecord[user]) > threshold:
                    removedUser[user] = 1
            for user in removedUser:
                del self.data.testSet[user]

    def readConfiguration(self):
        self.algorName = self.config['recommender']
        self.output = LineConfig(self.config['output.setup'])
        self.isOutput = self.output.isMainOn()
        self.ranking = LineConfig(self.config['item.ranking'])

    def printAlgorConfig(self):
        "show algorithm's configuration"
        print 'Algorithm:', self.config['recommender']
        print 'Training set:', abspath(self.config['record'])
        if LineConfig(self.config['evaluation.setup']).contains('-testSet'):
            print 'Test set:', abspath(
                LineConfig(
                    self.config['evaluation.setup']).getOption('-testSet'))
        #print 'Count of the users in training set: ',len()
        self.data.printTrainingSize()
        print '=' * 80

    def initModel(self):
        pass

    def buildModel(self):
        'build the model (for model-based algorithms )'
        pass

    def saveModel(self):
        pass

    def loadModel(self):
        pass

    def predict(self, user):
        return []

    def evalRanking(self):
        res = []  # used to contain the text of the result
        N = 0
        threshold = 0

        N = int(self.ranking['-topN'])
        if N > 100 or N < 0:
            print 'N can not be larger than 100! It has been reassigned with 10'
            N = 10

        res.append(
            'userId: recommendations in (itemId, ranking score) pairs, * means the item matches.\n'
        )
        # predict
        recList = {}
        userCount = len(self.data.testSet)
        rawRes = {}
        for i, user in enumerate(self.data.testSet):
            itemSet = {}
            line = user + ':'
            predictedItems = self.predict(user)

            recList[user] = predictedItems

            if i % 100 == 0:
                print self.algorName, self.foldInfo, 'progress:' + str(
                    i) + '/' + str(userCount)
            for item in recList[user]:
                if self.data.testSet[user].has_key(item):
                    line += '*'
                line += item + ','

            line += '\n'
            res.append(line)
        currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time()))
        # output prediction result
        if self.isOutput:
            fileName = ''
            outDir = self.output['-dir']
            if self.ranking.contains('-topN'):
                fileName = self.config[
                    'recommender'] + '@' + currentTime + '-top-' + str(
                        N) + 'items' + self.foldInfo + '.txt'
            elif self.ranking.contains('-threshold'):
                fileName = self.config[
                    'recommender'] + '@' + currentTime + '-threshold-' + str(
                        threshold) + self.foldInfo + '.txt'
            FileIO.writeFile(outDir, fileName, res)
            print 'The result has been output to ', abspath(outDir), '.'
        # output evaluation result
        outDir = self.output['-dir']
        fileName = self.config[
            'recommender'] + '@' + currentTime + '-measure' + self.foldInfo + '.txt'
        if self.ranking.contains('-topN'):
            self.measure = Measure.rankingMeasure(self.data.testSet, recList,
                                                  rawRes, N)

        FileIO.writeFile(outDir, fileName, self.measure)
        print 'The result of %s %s:\n%s' % (self.algorName, self.foldInfo,
                                            ''.join(self.measure))

    def execute(self):
        self.readConfiguration()
        if self.foldInfo == '[1]':
            self.printAlgorConfig()
        #load model from disk or build model
        if self.isLoadModel:
            print 'Loading model %s...' % (self.foldInfo)
            self.loadModel()
        else:
            print 'Initializing model %s...' % (self.foldInfo)
            self.initModel()
            print 'Building Model %s...' % (self.foldInfo)
            self.buildModel()

        #predict the ratings or item ranking
        print 'Predicting %s...' % (self.foldInfo)
        self.evalRanking()
        #save model
        if self.isSaveModel:
            print 'Saving model %s...' % (self.foldInfo)
            self.saveModel()

        return self.measure
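Measure.rankingMeasure is used above but not reproduced in this listing. As an illustration only, not the project's Measure module, a hit-based precision@N over the same testSet / recList shapes could look like this:

def precision_at_n(test_set, rec_list, n):
    # test_set: {user: {item: 1, ...}}, rec_list: {user: [item1, item2, ...]}
    hits, total = 0, 0
    for user, items in rec_list.items():
        truth = test_set.get(user, {})
        hits += sum(1 for item in items[:n] if item in truth)
        total += n
    return hits / float(total) if total else 0.0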
Code example #15
File: recommender.py Project: zkalan/Yue
class Recommender(object):
    def __init__(self, conf, trainingSet=None, testSet=None, fold='[1]'):
        self.config = conf
        self.isSaveModel = False
        self.isLoadModel = False
        self.isOutput = True
        self.data = Record(self.config, trainingSet, testSet)
        self.foldInfo = fold
        self.evalConfig = LineConfig(self.config['evaluation.setup'])
        if self.evalConfig.contains('-target'):
            self.recType = self.evalConfig['-target']
        else:
            self.recType = 'track'
        if LineConfig(self.config['evaluation.setup']).contains('-cold'):
            #evaluation on cold-start users
            threshold = int(
                LineConfig(self.config['evaluation.setup'])['-cold'])
            removedUser = []
            removedTrack = defaultdict(list)
            #for user in self.data.testSet:
            #    if user in self.data.userRecord and len(self.data.userRecord[user])>threshold:
            #        removedUser.append(user)
            for user in self.data.testSet:
                if user in self.data.userRecord:
                    for item in self.data.testSet[user]:
                        if len(self.data.trackRecord[item]) > threshold:
                            removedTrack[user].append(item)
            for user in removedTrack:
                for item in removedTrack[user]:
                    del self.data.testSet[user][item]
                if len(self.data.testSet[user]) == 0:
                    del self.data.testSet[user]
            #for user in removedUser:
            #    del self.data.testSet[user]

        if LineConfig(self.config['evaluation.setup']).contains('-sample'):
            userList = list(self.data.testSet.keys())
            removedUser = userList[:int(len(userList) * 0.9)]
            for user in removedUser:
                del self.data.testSet[user]

    def readConfiguration(self):
        self.algorName = self.config['recommender']
        self.output = LineConfig(self.config['output.setup'])
        self.isOutput = self.output.isMainOn()
        self.ranking = LineConfig(self.config['item.ranking'])

    def printAlgorConfig(self):
        "show algorithm's configuration"
        print('Algorithm:', self.config['recommender'])
        print('Training set:', abspath(self.config['record']))
        if LineConfig(self.config['evaluation.setup']).contains('-testSet'):
            print(
                'Test set:',
                abspath(
                    LineConfig(self.config['evaluation.setup']).getOption(
                        '-testSet')))
        #print 'Count of the users in training set: ',len()
        self.data.printTrainingSize()
        print('=' * 80)

    def initModel(self):
        pass

    def buildModel(self):
        'build the model (for model-based algorithms )'
        pass

    def saveModel(self):
        pass

    def loadModel(self):
        pass

    def predict(self, user):
        return []

    def evalRanking(self):
        res = []  # used to contain the text of the result
        N = 0
        threshold = 0
        top = self.ranking['-topN'].split(',')
        top = [int(num) for num in top]
        N = int(top[-1])
        if N > 100 or N < 0:
            print(
                'N can not be larger than 100! It has been reassigned with 10')
            N = 10

        res.append(
            'userId: recommendations in (itemId, ranking score) pairs, * means the item matches, $ means the item is in the popular-track list\n'
        )
        # predict
        recList = {}
        userCount = len(self.data.testSet)

        for i, user in enumerate(self.data.testSet):

            num_pop = 0

            line = user + ':'
            if user in self.data.userRecord:
                predictedItems = self.predict(user)
            else:
                predictedItems = ['0'] * N
            predicted = {}
            for k, item in enumerate(predictedItems):
                predicted[item] = k
            for item in self.data.userRecord[user]:
                if item[self.recType] in predicted:
                    del predicted[item[self.recType]]
            predicted = sorted(predicted.items(), key=lambda d: d[1])
            predictedItems = [item[0] for item in predicted]
            recList[user] = predictedItems[:N]
            #print('user', user, 'the recList:', type(self.data.testSet[user]))

            if i % 100 == 0:
                print(self.algorName, self.foldInfo,
                      'progress:' + str(i) + '/' + str(userCount))
            for item in recList[user]:
                if item in self.data.testSet[user]:
                    line += '*'
                if item in self.data.PopTrack:
                    num_pop += 1
                    line += '$'
                line += item + ','

            line += '\n'
            res.append(line)
        currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time()))
        # output prediction result
        if self.isOutput:
            fileName = ''
            outDir = self.output['-dir']
            if self.ranking.contains('-topN'):
                fileName = self.config['recommender'] + '@' + currentTime + '-top-' + self.ranking['-topN']\
                           + 'items' + self.foldInfo + '.txt'
            FileIO.writeFile(outDir, fileName, res)
            print('The result has been output to ', abspath(outDir), '.')
        # output evaluation result
        outDir = self.output['-dir']
        fileName = self.config[
            'recommender'] + '@' + currentTime + '-measure' + self.foldInfo + '.txt'

        self.measure = Measure.rankingMeasure(self.data.testSet, recList, top,
                                              self.data.getSize(self.recType))

        FileIO.writeFile(outDir, fileName, self.measure)
        print('The result of %s %s:\n%s' %
              (self.algorName, self.foldInfo, ''.join(self.measure)))

    def execute(self):
        self.readConfiguration()
        if self.foldInfo == '[1]':
            self.printAlgorConfig()
        #load model from disk or build model
        if self.isLoadModel:
            print('Loading model %s...' % (self.foldInfo))
            self.loadModel()
        else:
            print('Initializing model %s...' % (self.foldInfo))
            self.initModel()
            print('Building Model %s...' % (self.foldInfo))
            self.buildModel()

        #predict the ratings or item ranking
        print('Predicting %s...' % (self.foldInfo))
        self.evalRanking()
        #save model
        if self.isSaveModel:
            print('Saving model %s...' % (self.foldInfo))
            self.saveModel()

        return self.measure
Code example #16
File: RecQ.py Project: nicoleljc1227/RecQ
class RecQ(object):
    def __init__(self,config):
        self.trainingData = []  # training data
        self.testData = []  # testData
        self.measure = []
        self.config =config
        self.ratingConfig = LineConfig(config['ratings.setup'])
        if self.config.contains('evaluation.setup'):
            self.evaluation = LineConfig(config['evaluation.setup'])
            if self.evaluation.contains('-testSet'):
                #specify testSet
                self.__loadDataSet(config['ratings'])
                self.__loadDataSet(self.evaluation['-testSet'],bTest=True)
            elif self.evaluation.contains('-ap'):
                #auto partition
                self.__loadDataSet(config['ratings'])
                self.trainingData,self.testData = DataSplit.\
                    dataSplit(self.trainingData,test_ratio=float(self.evaluation['-ap']))
            elif self.evaluation.contains('-cv'):
                #cross validation
                self.__loadDataSet(config['ratings'])
                #self.trainingData,self.testData = DataSplit.crossValidation(self.trainingData,int(self.evaluation['-cv']))
            else:
                print 'Evaluation is not well configured!'
                exit(-1)

    def __loadDataSet(self, file, bTest=False):
        if not bTest:
            print 'loading training data...'
        else:
            print 'loading test data...'
        with open(file) as f:
            ratings = f.readlines()
        # ignore the headline
        if self.ratingConfig.contains('-header'):
            ratings = ratings[1:]
        # order of the columns
        order = self.ratingConfig['-columns'].strip().split()

        for lineNo, line in enumerate(ratings):
            items = split(' |,|\t', line.strip())
            if len(order) < 3:
                print 'The rating file is not in a correct format. Error: Line num %d' % lineNo
                exit(-1)
            try:
                userId = items[int(order[0])]
                itemId = items[int(order[1])]
                rating = items[int(order[2])]
            except ValueError:
                print 'Error! Have you added the option -header to the rating.setup?'
                exit(-1)
            if not bTest:
                self.trainingData.append([userId, itemId, float(rating)])
            else:
                self.testData.append([userId, itemId, float(rating)])

    def execute(self):
        exec ('from algorithm.rating.' + self.config['recommender'] + ' import ' + self.config['recommender'])
        if self.evaluation.contains('-cv'):
            i = 1
            for train,test in DataSplit.crossValidation(self.trainingData,int(self.evaluation['-cv'])):
                fold = '['+str(i)+']'
                recommender = self.config['recommender']+ "(self.config,train,test,fold)"
                measure = eval(recommender).execute()
                self.measure.append(measure)
                i+=1
            res = []
            for i in range(len(self.measure[0])):
                measure = self.measure[0][i].split(':')[0]
                total = 0
                for j in range(len(self.measure)):
                    total += float(self.measure[j][i].split(':')[1])
                res.append(measure+':'+str(total/len(self.measure))+'\n')
            outDir = LineConfig(self.config['output.setup'])['-dir']
            fileName = self.config['recommender'] +'@'+str(int(self.evaluation['-cv']))+'-fold-cv' + '.txt'
            FileIO.writeFile(outDir,fileName,res)


        else:
            recommender = self.config['recommender']+'(self.config,self.trainingData,self.testData)'
            eval(recommender).execute()
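DataSplit.crossValidation is called above but its implementation is not part of this listing. A simple k-fold generator with the same call shape, offered as a sketch rather than the project's actual code:

def cross_validation(data, k=5):
    # yield (train, test) pairs; fold i takes every k-th record starting at offset i as test data
    for i in range(k):
        test = [record for j, record in enumerate(data) if j % k == i]
        train = [record for j, record in enumerate(data) if j % k != i]
        yield train, test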
Code example #17
class SDLib(object):
    def __init__(self, config):
        self.trainingData = []  # training data
        self.testData = []  # testData
        self.relation = []
        self.measure = []
        self.config = config
        self.ratingConfig = LineConfig(config['ratings.setup'])
        self.labels = FileIO.loadLabels(config['label'])

        if self.config.contains('evaluation.setup'):
            self.evaluation = LineConfig(config['evaluation.setup'])

            if self.evaluation.contains('-testSet'):
                #specify testSet
                self.trainingData = FileIO.loadDataSet(config,
                                                       config['ratings'])
                self.testData = FileIO.loadDataSet(config,
                                                   self.evaluation['-testSet'],
                                                   bTest=True)

            elif self.evaluation.contains('-ap'):
                #auto partition
                self.trainingData = FileIO.loadDataSet(config,
                                                       config['ratings'])
                self.trainingData,self.testData = DataSplit.\
                    dataSplit(self.trainingData,test_ratio=float(self.evaluation['-ap']))

            elif self.evaluation.contains('-cv'):
                #cross validation
                self.trainingData = FileIO.loadDataSet(config,
                                                       config['ratings'])
                #self.trainingData,self.testData = DataSplit.crossValidation(self.trainingData,int(self.evaluation['-cv']))

        else:
            print('Evaluation is not well configured!')
            exit(-1)

        if config.contains('social'):
            self.socialConfig = LineConfig(self.config['social.setup'])
            self.relation = FileIO.loadRelationship(config,
                                                    self.config['social'])
        print('preprocessing...')

    def execute(self):
        #import the algorithm module
        importStr = 'from method.' + self.config[
            'methodName'] + ' import ' + self.config['methodName']
        exec(importStr)
        if self.evaluation.contains('-cv'):
            k = int(self.evaluation['-cv'])
            if k <= 1 or k > 10:
                k = 3
            #create the manager used for communication between processes
            manager = Manager()
            m = manager.dict()
            i = 1
            tasks = []
            for train, test in DataSplit.crossValidation(self.trainingData, k):
                fold = '[' + str(i) + ']'
                if self.config.contains('social'):
                    method = self.config[
                        'methodName'] + "(self.config,train,test,self.labels,self.relation,fold)"
                else:
                    method = self.config[
                        'methodName'] + "(self.config,train,test,self.labels,fold)"
            #create the process
                p = Process(target=run, args=(m, eval(method), i))
                tasks.append(p)
                i += 1
            #start the processes
            for p in tasks:
                p.start()
            #wait until all processes are completed
            for p in tasks:
                p.join()
            #compute the mean error of k-fold cross validation
            self.measure = [dict(m)[i] for i in range(1, k + 1)]
            res = []
            pattern = re.compile(r'(\d+\.\d+)')
            countPattern = re.compile(r'\d+\n')
            labelPattern = re.compile(r'\s\d{1}[^\.|\n|\d]')
            labels = re.findall(labelPattern, self.measure[0])
            values = np.array([0] * 9, dtype=float)
            count = np.array([0, 0, 0], dtype=int)
            for report in self.measure:
                values += np.array(re.findall(pattern, report), dtype=float)
                count += np.array(re.findall(countPattern, report), dtype=int)
            values /= k
            values = np.around(values, decimals=4)
            res.append('             precision  recall  f1-score  support\n\n')
            res.append('         ' + labels[0] + '  ' +
                       '    '.join(np.array(values[0:3], dtype=str).tolist()) +
                       '   ' + str(count[0]) + '\n')
            res.append('         ' + labels[1] + '  ' +
                       '    '.join(np.array(values[3:6], dtype=str).tolist()) +
                       '   ' + str(count[1]) + '\n\n')
            res.append('  avg/total   ' +
                       '    '.join(np.array(values[6:9], dtype=str).tolist()) +
                       '   ' + str(count[2]) + '\n')
            print('Total:')
            print(''.join(res))
            # for line in lines[1:]:
            #
            # measure = self.measure[0][i].split(':')[0]
            # total = 0
            # for j in range(k):
            #     total += float(self.measure[j][i].split(':')[1])
            # res.append(measure+':'+str(total/k)+'\n')
            #output result
            currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time()))
            outDir = LineConfig(self.config['output.setup'])['-dir']
            fileName = self.config[
                'methodName'] + '@' + currentTime + '-' + str(
                    k) + '-fold-cv' + '.txt'
            FileIO.writeFile(outDir, fileName, res)
            print('The results have been output to ' +
                  abspath(LineConfig(self.config['output.setup'])['-dir']) +
                  '\n')
        else:
            if self.config.contains('social'):
                method = self.config[
                    'methodName'] + '(self.config,self.trainingData,self.testData,self.labels,self.relation)'
            else:
                method = self.config[
                    'methodName'] + '(self.config,self.trainingData,self.testData,self.labels)'
            result = eval(method).execute()
            return result
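Each fold above is executed via Process(target=run, args=(m, eval(method), i)), but the run helper itself is not shown in this listing. Assuming it only has to store each fold's report under its fold index in the shared manager dict (which is how the results are later collected), it could be as small as:

def run(measure_dict, method, fold_index):
    # execute one fold and record its result so the parent process can average the folds
    measure_dict[fold_index] = method.execute()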
Code example #18
class Record(object):
    'data access control'
    def __init__(self,config,trainingSet,testSet):
        self.config = config
        self.recordConfig = LineConfig(config['record.setup'])
        self.evalConfig = LineConfig(config['evaluation.setup'])
        self.name2id = defaultdict(dict)
        self.id2name = defaultdict(dict)
        self.listened = {}
        self.listened['artist']=defaultdict(dict)
        self.listened['track']=defaultdict(dict)
        self.listened['album']=defaultdict(dict)
        self.artist2Album = defaultdict(dict) #key:artist id, value:{album id1:1, album id2:1 ...}
        self.album2Track = defaultdict(dict) #
        self.artist2Track = defaultdict(dict) #
        self.Track2artist = defaultdict(dict) #
        self.Track2album = defaultdict(dict) #
        self.userRecord = defaultdict(list) #user data in training set. form: {user:[record1,record2]}
        self.trackRecord = defaultdict(list) # track data in training set. form: {track:[record1, record2]}
        self.testSet = defaultdict(dict) #user data in test set. form: {user:{recommendedObject1:1, recommendedObject2:1}}
        self.recordCount = 0
        self.columns = {}
        self.globalMean = 0
        self.userMeans = {} #used to store the mean values of users' listen times
        self.trackListen = {}

        self.trainingData = trainingSet
        self.PopTrack = {}

        labels = self.recordConfig['-columns'].split(',')
        for col in labels:
            label = col.split(':')
            self.columns[label[0]] = int(label[1])
        if self.evalConfig.contains('-byTime'):
            trainingSet,testSet = self.splitDataByTime(trainingSet)

        self.preprocess(trainingSet, testSet)

        # the listen statistics only exist after preprocess has populated userRecord and listened
        self.computeUserMean()
        self.globalAverage()
        self.computePop(trainingSet)
    
    def globalAverage(self):
        total = sum(self.userMeans.values())
        if total==0:
            self.globalMean = 0
        else:
            self.globalMean = total/len(self.userMeans)

    def computeUserMean(self):
        for user in self.userRecord:
            userSum = 0
            for record in self.userRecord[user]:
                # total listen count, over all users, of each track this user has played
                userSum += sum(self.listened['track'][record['track']].values())
            self.userMeans[user] = userSum / float(len(self.userRecord[user]))
        
    ''' 
    def splitDataByTime(self,dataset):
        trainingSet = []
        testSet = []
        listened = {}
        ratio = float(self.evalConfig['-byTime'])
        records = defaultdict(list)
        for event in dataset:
            records[event['user']].append(event)
            if event['user'] not in listened:
                listened[event['user']] = 1
            else:
                listened[event['user']] += 1
        orderlist = sorted(listened.items(), key=lambda item:item[1], reverse=True)
        dellist = orderlist[:int(len(orderlist)*ratio)]
        for i in range(len(dellist)):
            if dellist[i][0] in records:
                del records[dellist[i][0]]

        #print('The amount of data after deletion:', len(records))

        for user in records:
            orderedList = sorted(records[user],key=lambda d:d['time'])
            training = orderedList[0:int(len(orderedList)*(1-ratio))]
            test = orderedList[int(len(orderedList)*(1-ratio)):]
            trainingSet += training
            testSet += test

        #print ('the type1 :', type(trainingSet), type(testSet))
        #file_train = 'trainset.txt'
        #file_test = 'testset.txt'
        #trainf = open(file_train, 'wb')
        #testf = open(file_test, 'wb')
        #pickle.dump(trainingSet, trainf, 2)
        #pickle.dump(testSet, testf, 2)
        #trainf.close()
        #testf.close()
        return trainingSet,testSet
    '''
    def splitDataByTime(self,dataset):
        trainingSet = []
        testSet = []
        ratio = float(self.evalConfig['-byTime'])
        records = defaultdict(list)
        for event in dataset:
            records[event['user']].append(event)

        for user in records:
            orderedList = sorted(records[user],key=lambda d:d['time'])
            training = orderedList[0:int(len(orderedList)*(1-ratio))]
            test = orderedList[int(len(orderedList)*(1-ratio)):]
            trainingSet += training
            testSet += test

        return trainingSet,testSet

    def computePop(self, dataset):
        print('computePop...')
        for event in dataset:
            # popularity of a track = its total listen count over all users
            total = sum(self.listened['track'][event['track']].values())
            if total > 0:
                self.PopTrack[event['track']] = total

        print('computePop is finished...')
        print('PopTrack', len(self.PopTrack))
        

    def preprocess(self,trainingSet,testSet):
        for entry in trainingSet:
            self.recordCount+=1
            for key in entry:
                if key!='time':
                    if entry[key] not in self.name2id[key]:
                        self.name2id[key][entry[key]] = len(self.name2id[key])
                        self.id2name[key][len(self.id2name[key])] = entry[key]

                if key=='user':
                    self.userRecord[entry['user']].append(entry)
                    if 'artist' in entry:
                        if entry[key] not in self.listened['artist'][entry['artist']]:
                            self.listened['artist'][entry['artist']][entry[key]] = 1
                        else:
                            self.listened['artist'][entry['artist']][entry[key]] += 1
                    if  'album' in entry:
                        if entry[key] not in self.listened['album'][entry['album']]:
                            self.listened['album'][entry['album']][entry[key]] = 1
                        else:
                            self.listened['album'][entry['album']][entry[key]] += 1
                    if 'track' in entry:
                        if entry[key] not in self.listened['track'][entry['track']]:
                            self.listened['track'][entry['track']][entry[key]] = 1
                        else:
                            self.listened['track'][entry['track']][entry[key]] += 1
                
                if key == 'artist' and 'album' in entry:
                    self.artist2Album[entry[key]][entry['album']] = 1

                if key == 'album' and 'track' in entry:
                    self.album2Track[entry[key]] = self.name2id['track'][entry['track']]
                    self.Track2album[entry['track']] = self.name2id[key][entry[key]]
                
                if key == 'artist' and 'track' in entry:
                    self.artist2Track[entry[key]] = self.name2id['track'][entry['track']]
                    self.Track2artist[entry['track']] = self.name2id[key][entry[key]]
                
                if key == 'track':
                    self.trackRecord[entry['track']].append(entry)



        recType = self.evalConfig['-target']
        for entry in testSet:
            for key in entry:
                if key != 'time':
                    if entry[key] not in self.name2id[key]:
                        self.name2id[key][entry[key]] = len(self.name2id[key])
                        self.id2name[key][len(self.id2name[key])] = entry[key]
                if key=='user':
                    if recType in entry:
                        if entry[recType] not in self.testSet[entry['user']]:
                            self.testSet[entry['user']][entry[recType]] = 1
                        else:
                            self.testSet[entry['user']][entry[recType]] += 1

        #remove items appearing in the training set from the test set
        for item in self.listened[recType]:
            for user in self.listened[recType][item]:
                try:
                    del self.testSet[user][item]
                except KeyError:
                    pass
                if user in self.testSet and len(self.testSet[user])==0:
                    del self.testSet[user]
        


    def printTrainingSize(self):
        if 'user' in self.name2id:
            print ('user count:',len(self.name2id['user']))
        if 'artist' in self.name2id:
            print ('artist count:',len(self.name2id['artist']))
        if 'album' in self.name2id:
            print ('album count:',len(self.name2id['album']))
        if 'track' in self.name2id:
            print ('track count:', len(self.name2id['track']))
        print ('Training set size:',self.recordCount)


    def getId(self,obj,t):
        if obj in self.name2id[t]:
            return self.name2id[t][obj]
        else:
            print ('No '+t+' '+obj+' exists!')
            exit(-1)

    def getSize(self,t):
        return len(self.name2id[t])

    def contains(self, obj, t):
        'whether the object obj of type t appears in the training set'
        if obj in self.name2id[t]:
            return True
        else:
            return False
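For reference, the Record class above reads its column layout from record.setup ('-columns' with comma-separated name:index pairs) and its split strategy from evaluation.setup ('-target', '-byTime'). Based purely on how those options are parsed here, the option strings handed to LineConfig presumably look something like the following illustrative values; the exact configuration syntax is an assumption, since no config file appears in this listing:

    -columns user:0,artist:1,album:2,track:3,time:4
    -target track -byTime 0.2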
Code example #19
class SocialDAO(object):
    def __init__(self, conf):
        self.config = conf
        self.socialConfig = LineConfig(self.config['social.setup'])
        self.user = {}  #used to store the order of users
        self.triple = []
        self.followees = {}
        self.followers = {}
        self.trustMatrix = self.loadRelationship(self.config['social'])

    def loadRelationship(self, filePath):
        print 'load social data...'
        triple = []
        with open(filePath) as f:
            relations = f.readlines()
            # ignore the headline
        if self.socialConfig.contains('-header'):
            relations = relations[1:]
        # order of the columns
        order = self.socialConfig['-columns'].strip().split()
        if len(order) < 2:
            print 'The social file is not in a correct format.'
        for lineNo, line in enumerate(relations):
            items = split(' |,|\t', line.strip())
            if len(order) < 2:
                print 'The social file is not in a correct format. Error: Line num %d' % lineNo
                exit(-1)
            userId1 = items[int(order[0])]
            userId2 = items[int(order[1])]
            if len(order) < 3:
                weight = 1
            else:
                weight = float(items[int(order[2])])
            #add relations to dict
            if not self.followees.has_key(userId1):
                self.followees[userId1] = {}
            self.followees[userId1][userId2] = weight
            if not self.followers.has_key(userId2):
                self.followers[userId2] = {}
            self.followers[userId2][userId1] = weight
            # order the user
            if not self.user.has_key(userId1):
                self.user[userId1] = len(self.user)
            if not self.user.has_key(userId2):
                self.user[userId2] = len(self.user)
            self.triple.append([userId1, userId2, weight])
            triple.append([self.user[userId1], self.user[userId2], weight])
        return new_sparseMatrix.SparseMatrix(triple)

    def row(self, u):
        #return user u's followees
        return self.trustMatrix.row(self.user[u])

    def col(self, u):
        #return user u's followers
        return self.trustMatrix.col(self.user[u])

    def weight(self, u1, u2):
        if self.followees.has_key(u1) and self.followees[u1].has_key(u2):
            return self.followees[u1][u2]
        else:
            return 0

    def trustSize(self):
        return self.trustMatrix.size

    def getFollowers(self, u):
        if self.followers.has_key(u):
            return self.followers[u]
        else:
            return {}

    def getFollowees(self, u):
        if self.followees.has_key(u):
            return self.followees[u]
        else:
            return {}

    def hasFollowee(self, u1, u2):
        if self.followees.has_key(u1):
            if self.followees[u1].has_key(u2):
                return True
            else:
                return False
        return False

    def hasFollower(self, u1, u2):
        if self.followers.has_key(u1):
            if self.followers[u1].has_key(u2):
                return True
            else:
                return False
        return False
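new_sparseMatrix.SparseMatrix is not included in this listing; the comments above only state that row(u) returns u's followees and col(u) returns u's followers. A dict-of-dicts stand-in with those two lookups, sketched under that assumption rather than copied from the project:

class SimpleTrustMatrix(object):
    def __init__(self, triples):
        # triples: [[userIdx1, userIdx2, weight], ...] meaning userIdx1 follows userIdx2
        self.rows = {}
        self.cols = {}
        for u1, u2, w in triples:
            self.rows.setdefault(u1, {})[u2] = w
            self.cols.setdefault(u2, {})[u1] = w

    def row(self, u):
        # followees of u
        return self.rows.get(u, {})

    def col(self, u):
        # followers of u
        return self.cols.get(u, {})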
Code example #20
File: SDLib.py Project: CoderWZW/SDLib
class SDLib(object):
    def __init__(self,config):
        self.trainingData = []  # training data
        self.testData = []  # testData
        self.relation = []
        self.measure = []
        self.config =config
        self.ratingConfig = LineConfig(config['ratings.setup'])
        self.labels = FileIO.loadLabels(config['label'])

        if self.config.contains('evaluation.setup'):
            self.evaluation = LineConfig(config['evaluation.setup'])
            
            if self.evaluation.contains('-testSet'):
                #specify testSet
                self.trainingData = FileIO.loadDataSet(config, config['ratings'])
                self.testData = FileIO.loadDataSet(config, self.evaluation['-testSet'], bTest=True)

            elif self.evaluation.contains('-ap'):
                #auto partition
                self.trainingData = FileIO.loadDataSet(config,config['ratings'])
                self.trainingData,self.testData = DataSplit.\
                    dataSplit(self.trainingData,test_ratio=float(self.evaluation['-ap']))

            elif self.evaluation.contains('-cv'):
                #cross validation
                self.trainingData = FileIO.loadDataSet(config, config['ratings'])
                #self.trainingData,self.testData = DataSplit.crossValidation(self.trainingData,int(self.evaluation['-cv']))

        else:
            print 'Evaluation is not well configured!'
            exit(-1)

        if config.contains('social'):
            self.socialConfig = LineConfig(self.config['social.setup'])
            self.relation = FileIO.loadRelationship(config,self.config['social'])
        print 'preprocessing...'


    def execute(self):
        #import the algorithm module
        importStr = 'from method.' + self.config['methodName'] + ' import ' + self.config['methodName']
        exec (importStr)
        if self.evaluation.contains('-cv'):
            k = int(self.evaluation['-cv'])
            if k <= 1 or k > 10:
                k = 3
            #create the manager used for communication between processes
            manager = Manager()
            m = manager.dict()
            i = 1
            tasks = []
            for train,test in DataSplit.crossValidation(self.trainingData,k):
                fold = '['+str(i)+']'
                if self.config.contains('social'):
                    method = self.config['methodName'] + "(self.config,train,test,self.labels,self.relation,fold)"
                else:
                    method = self.config['methodName'] + "(self.config,train,test,self.labels,fold)"
               #create the process
                p = Process(target=run,args=(m,eval(method),i))
                tasks.append(p)
                i+=1
            #start the processes
            for p in tasks:
                p.start()
            #wait until all processes are completed
            for p in tasks:
                p.join()
            #compute the mean error of k-fold cross validation
            self.measure = [dict(m)[i] for i in range(1,k+1)]
            res = []
            pattern = re.compile('(\d+\.\d+)')
            countPattern = re.compile('\d+\\n')
            labelPattern = re.compile('\s\d{1}[^\.|\n|\d]')
            labels = re.findall(labelPattern, self.measure[0])
            values = np.array([0]*9,dtype=float)
            count = np.array([0,0,0],dtype=int)
            for report in self.measure:
                values += np.array(re.findall(pattern,report),dtype=float)
                count+=np.array(re.findall(countPattern,report),dtype=int)
            values/=k
            values=np.around(values,decimals=4)
            res.append('             precision  recall  f1-score  support\n\n')
            res.append('         '+labels[0]+'  '+'    '.join(np.array(values[0:3],dtype=str).tolist())+'   '+str(count[0])+'\n')
            res.append('         '+labels[1]+'  '+'    '.join(np.array(values[3:6],dtype=str).tolist())+'   '+str(count[1])+'\n\n')
            res.append('  avg/total   ' + '    '.join(np.array(values[6:9], dtype=str).tolist()) + '   ' + str(count[2]) + '\n')
            print 'Total:'
            print ''.join(res)
                # for line in lines[1:]:
                #
                # measure = self.measure[0][i].split(':')[0]
                # total = 0
                # for j in range(k):
                #     total += float(self.measure[j][i].split(':')[1])
                # res.append(measure+':'+str(total/k)+'\n')
            #output result
            currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time()))
            outDir = LineConfig(self.config['output.setup'])['-dir']
            fileName = self.config['methodName'] +'@'+currentTime+'-'+str(k)+'-fold-cv' + '.txt'
            FileIO.writeFile(outDir,fileName,res)
            print 'The results have been output to '+abspath(LineConfig(self.config['output.setup'])['-dir'])+'\n'
        else:
            if self.config.contains('social'):
                method = self.config['methodName'] + '(self.config,self.trainingData,self.testData,self.labels,self.relation)'
            else:
                method = self.config['methodName'] + '(self.config,self.trainingData,self.testData,self.labels)'
            eval(method).execute()
Code example #21
File: RecQ.py Project: nonva/RecQ
class RecQ(object):
    def __init__(self, config):
        self.trainingData = []  # training data
        self.testData = []  # testData
        self.relation = []
        self.measure = []
        self.config = config
        self.ratingConfig = LineConfig(config['ratings.setup'])

        if self.config.contains('evaluation.setup'):
            self.evaluation = LineConfig(config['evaluation.setup'])
            if self.evaluation.contains('-testSet'):
                #specify testSet
                self.trainingData = FileIO.loadDataSet(config,
                                                       config['ratings'])
                self.testData = FileIO.loadDataSet(config,
                                                   self.evaluation['-testSet'],
                                                   bTest=True)
            elif self.evaluation.contains('-ap'):
                #auto partition
                self.trainingData = FileIO.loadDataSet(config,
                                                       config['ratings'])
                self.trainingData,self.testData = DataSplit.\
                    dataSplit(self.trainingData,test_ratio=float(self.evaluation['-ap']))
            elif self.evaluation.contains('-cv'):
                #cross validation
                self.trainingData = FileIO.loadDataSet(config,
                                                       config['ratings'])
                #self.trainingData,self.testData = DataSplit.crossValidation(self.trainingData,int(self.evaluation['-cv']))

        else:
            print 'Evaluation is not well configured!'
            exit(-1)

        if config.contains('social'):
            self.socialConfig = LineConfig(self.config['social.setup'])
            self.relation = FileIO.loadRelationship(config,
                                                    self.config['social'])

        print 'preprocessing...'

    def execute(self):
        #import the algorithm module
        importStr = 'from algorithm.rating.' + self.config[
            'recommender'] + ' import ' + self.config['recommender']
        exec(importStr)
        if self.evaluation.contains('-cv'):
            k = int(self.evaluation['-cv'])
            if k <= 1 or k > 10:
                k = 3
            #create the manager used for communication between processes
            manager = Manager()
            m = manager.dict()
            i = 1
            tasks = []
            for train, test in DataSplit.crossValidation(self.trainingData, k):
                fold = '[' + str(i) + ']'
                if self.config.contains('social'):
                    recommender = self.config[
                        'recommender'] + "(self.config,train,test,self.relation,fold)"
                else:
                    recommender = self.config[
                        'recommender'] + "(self.config,train,test,fold)"
            #create the process
                p = Process(target=run, args=(m, eval(recommender), i))
                tasks.append(p)
                i += 1
            #start the processes
            for p in tasks:
                p.start()
            #wait until all processes are completed
            for p in tasks:
                p.join()
            #compute the mean error of k-fold cross validation
            self.measure = [dict(m)[i] for i in range(1, k + 1)]
            res = []
            for i in range(len(self.measure[0])):
                measure = self.measure[0][i].split(':')[0]
                total = 0
                for j in range(k):
                    total += float(self.measure[j][i].split(':')[1])
                res.append(measure + ':' + str(total / k) + '\n')
            #output result
            outDir = LineConfig(self.config['output.setup'])['-dir']
            fileName = self.config['recommender'] + '@' + str(
                k) + '-fold-cv' + '.txt'
            FileIO.writeFile(outDir, fileName, res)

        else:
            if self.config.contains('social'):
                recommender = self.config[
                    'recommender'] + '(self.config,self.trainingData,self.testData,self.relation)'
            else:
                recommender = self.config[
                    'recommender'] + '(self.config,self.trainingData,self.testData)'
            eval(recommender).execute()
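The '-ap' (auto partition) branches in the listings above delegate to DataSplit.dataSplit(trainingData, test_ratio=...). That helper is not reproduced here; below is a hedged sketch of a random hold-out split with the same signature and return order (training first, test second):

import random

def data_split(data, test_ratio=0.2, seed=None):
    # randomly hold out a test_ratio share of the records as the test set
    rng = random.Random(seed)
    shuffled = list(data)
    rng.shuffle(shuffled)
    cut = int(len(shuffled) * test_ratio)
    return shuffled[cut:], shuffled[:cut]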