Exemple #1
0
def randomGenerateDataSet(ratingFile):
    """Randomly split the rows of ``ratingFile`` into a train and a test file.

    For every row a uniform number in [0, 1) is drawn; the row is written
    unchanged to 'RDTest.dat' when the draw is greater than 0.75 and to
    'RDTrain.dat' otherwise.

    Args:
        ratingFile: iterable of rows; each row is handed to ``write`` as is.
    """
    # NOTE(review): assumes utils.createWriteFile returns a file-like object
    # exposing write() and close() -- confirm against utils.
    trainSet = utils.createWriteFile('RDTrain.dat')
    try:
        testSet = utils.createWriteFile('RDTest.dat')
        try:
            # Seed once: re-seeding for every row is wasted work and does not
            # make the draws any more random.
            random.seed()
            for row in ratingFile:
                if random.random() > 0.75:
                    testSet.write(row)
                else:
                    trainSet.write(row)
        finally:
            # Close explicitly so buffered rows are flushed to disk.
            testSet.close()
    finally:
        trainSet.close()
Exemple #2
0
def splitDatasetWithUsers(splitRatio, ratingFile, dataset, savePrefix):
    """Split ratings into train/test files, keeping each user's rows together.

    Rows are grouped by user ID, the users are shuffled, and the first
    ``splitRatio`` fraction of the shuffled users has all of its rows written
    to ``savePrefix + 'splitedUserTrain.dat'``; the remaining users go to
    ``savePrefix + 'splitedUserTest.dat'``.

    Three lines are printed: the number of users, then
    "<train users> <test users>", then the number of users that have at most
    10 rows.

    Args:
        splitRatio: fraction of users assigned to the train file.
        ratingFile: iterable of raw rating rows.
        dataset: key selecting the row parser ('eachMovie6', 'eachMovie' or
            'movieLens').
        savePrefix: string prepended to both output file names.

    Raises:
        KeyError: if ``dataset`` is not one of the supported keys.
    """
    functions = {
        'eachMovie6': convertStringToRatingInfoForEachMovie,
        'eachMovie': convertStringToRatingInfoForEachMovie,
        'movieLens': convertStringToRatingInfo
    }
    parseRow = functions[dataset]

    # NOTE(review): `separator` is not a parameter of this function; it has to
    # be defined at module level for the call below to work -- confirm.
    dataList = {}
    for row in ratingFile:
        userID, itemID, rating, weight, timestamp = parseRow(row, separator)
        dataList.setdefault(int(userID), []).append(row)

    # list(...) so the shuffle also works where dict views are not lists.
    userIDList = list(dataList)
    random.shuffle(userIDList)

    totalLength = len(userIDList)
    splitedIndex = totalLength * splitRatio

    trainSet = utils.createWriteFile(savePrefix + 'splitedUserTrain.dat')
    try:
        testSet = utils.createWriteFile(savePrefix + 'splitedUserTest.dat')
        try:
            count = 0   # users written to the train file
            ccount = 0  # users with at most 10 rows
            print(totalLength)
            for position, userID in enumerate(userIDList):
                rows = dataList[userID]
                if len(rows) <= 10:
                    ccount += 1
                # Split on the position in the shuffled list. Comparing the
                # user ID itself against the split index would ignore the
                # shuffle and depend on how IDs happen to be numbered.
                if position < splitedIndex:
                    target = trainSet
                    count += 1
                else:
                    target = testSet
                for row in rows:
                    target.write(row)
        finally:
            # Close explicitly so buffered rows are flushed to disk.
            testSet.close()
    finally:
        trainSet.close()

    # Single-argument print calls produce the same text on Python 2 and 3.
    print("%d %d" % (count, totalLength - count))
    print(ccount)
Exemple #3
0
def splitDataWithACertainTime(timestampArray, ratingDict, separator,
                              splitRatio):
    """Split ratings into 'train.dat' / 'test.dat' by position in a sequence.

    The keys in ``timestampArray`` are visited in the given order; the first
    ``splitRatio`` fraction of the ratings is written to 'train.dat' and the
    rest to 'test.dat'. For the test part the function counts ratings whose
    user, item, or both never appear in the train part and prints
    "<test ratings> <with new user> <with new item> <new user and new item>".

    Args:
        timestampArray: sequence of keys into ``ratingDict``.
            NOTE(review): presumably sorted chronologically by the caller --
            confirm; this function does not sort it.
        ratingDict: maps each key to its raw rating row.
        separator: passed through to ``convertStringToRatingInfo``.
        splitRatio: fraction of the ratings assigned to the train file.
    """
    oldUsers = set()
    oldItems = set()
    ratings = 0
    ratingsWithNewUser = 0
    ratingsWithNewItem = 0
    newToNew = 0

    # Use the splitRatio parameter (the name `split` was never defined here),
    # and compare with `<` so exactly the first splitRatio share of the rows
    # lands in the train file for a 0-based position.
    splitPoint = splitRatio * len(timestampArray)

    trainFile = utils.createWriteFile('train.dat')
    try:
        testFile = utils.createWriteFile('test.dat')
        try:
            for position, key in enumerate(timestampArray):
                string = ratingDict[key]
                userID, itemID, rating, timestamp = convertStringToRatingInfo(
                    string, separator)
                if position < splitPoint:
                    oldUsers.add(userID)
                    oldItems.add(itemID)
                    trainFile.write(string)
                    continue

                ratings += 1
                isNewUser = userID not in oldUsers
                isNewItem = itemID not in oldItems
                if isNewUser:
                    ratingsWithNewUser += 1
                if isNewItem:
                    ratingsWithNewItem += 1
                    if isNewUser:
                        newToNew += 1
                testFile.write(string)
        finally:
            # Close explicitly so buffered rows are flushed to disk.
            testFile.close()
    finally:
        trainFile.close()

    # Single-argument print call: same text on Python 2 and 3.
    print("%d %d %d %d" % (ratings, ratingsWithNewUser, ratingsWithNewItem,
                           newToNew))
Exemple #4
0
import utils

if __name__ == "__main__":
    inputFile = 'inputQ3'
    inputFile = 'C-small-2-attempt0.in'
    inputFile = 'C-large.in'
    #inputFile = "D-small-attempt0.in"
    #inputFile = "C-small-attempt0.in"
    #inputFile = "A-large.in.txt"
    #inputFile = "inputQ3"
    outputFile = "outputQ3"
    inputData = utils.createReadFile(inputFile)
    outputData = utils.createWriteFile(outputFile)
    cases = inputData.next()
    cases = cases.strip()
    print cases
    for index in range(1, int(cases) + 1):
        print "case ", index
        outputString = "Case #" + str(index) + ": "

        rowData = inputData.next()
        rowData = rowData.strip()
        strs = rowData.split(' ')
        N = int(strs[0])
        K = int(strs[1])
        array = {}
        large = N
        array[large] = 1
        i = 0
        while i < K:
            selectN = large