def profileLearnModel(self):
    """Profile a full run of learnModel on a sparse synthetic dataset."""
    X, U, V = DatasetUtils.syntheticDataset1(u=0.01, m=1000, n=2000)

    u = 0.2
    w = 1 - u
    eps = 10**-6
    alpha = 0.5

    maxLocalAuc = MaxLocalAUC(self.k, w, alpha=alpha, eps=eps, stochastic=True)
    # Configure the learner from a parameter table rather than one attribute
    # assignment per line; the effect is identical.
    settings = {
        "maxNormU": 10,
        "maxNormV": 10,
        "maxIterations": 100,
        "initialAlg": "rand",
        "rate": "constant",
        "parallelSGD": True,
        "numProcesses": 8,
        "numAucSamples": 10,
        "numRowSamples": 30,
        "scaleAlpha": False,
        "loss": "hinge",
        "validationUsers": 0.0,
    }
    for attrName, attrValue in settings.items():
        setattr(maxLocalAuc, attrName, attrValue)
    print(maxLocalAuc)

    # The eval string references the locals `maxLocalAuc` and `X` above.
    ProfileUtils.profile('maxLocalAuc.learnModel(X)', globals(), locals())
def testParallelLearnModel(self):
    """Smoke test: parallelLearnModel should run to completion on a small dataset."""
    numpy.random.seed(21)

    k = 5
    shape = (500, 200)
    X = SparseUtils.generateSparseBinaryMatrix(shape, k, csarray=True)

    from wallhack.rankingexp.DatasetUtils import DatasetUtils
    # The synthetic dataset replaces the random binary matrix built above.
    X, U, V = DatasetUtils.syntheticDataset1()

    u = 0.1
    learner = MaxLocalAUC(k, 1 - u, alpha=1.0, eps=0.05, stochastic=True)
    learner.maxIterations = 3
    learner.recordStep = 1
    learner.rate = "optimal"
    learner.t0 = 2.0
    learner.validationUsers = 0.0
    learner.numProcesses = 4

    # Allow this process to be scheduled on every CPU.
    os.system('taskset -p 0xffffffff %d' % os.getpid())
    print(X.nnz / learner.numAucSamples)

    U, V = learner.parallelLearnModel(X)
def __init__(self):
    """Seed the RNG and build the low-rank dataset shared by the test methods.

    Fixes: removed the unused locals ``m`` and ``n`` (leftovers from a
    commented-out ``generateSparseBinaryMatrix`` call) and discarded the
    unused factor matrices instead of binding them to names.
    """
    numpy.random.seed(21)
    self.k = 8
    # Only the sparse data matrix is stored; the factors are not needed here.
    self.X, _, _ = DatasetUtils.syntheticDataset1(u=0.2, sd=0.2)
def computeRProfile(self):
    """Profile repeated calls to SparseUtilsCython.computeR on a wide dataset."""
    X, U, V = DatasetUtils.syntheticDataset1(m=1000, n=20000)
    w = 0.9
    indsPerRow = 50
    numRuns = 1000

    def run():
        # Repeat the call so the profiler measures aggregate cost.
        completed = 0
        while completed < numRuns:
            SparseUtilsCython.computeR(U, V, w, indsPerRow)
            completed += 1

    # The eval string references the local function `run` defined above.
    ProfileUtils.profile('run()', globals(), locals())
def profileRestrictOmega(self):
    """Profile restrictOmega when keeping a random subset of 500 columns."""
    X, U, V = DatasetUtils.syntheticDataset1(u=0.01, m=1000, n=2000)
    m, n = X.shape
    indPtr, colInds = SparseUtils.getOmegaListPtr(X)
    sampledCols = numpy.random.choice(n, 500, replace=False)

    def run():
        for _ in range(100):
            subsetIndPtr, subsetColInds = restrictOmega(indPtr, colInds, sampledCols)

    # The eval string references the local function `run` defined above.
    ProfileUtils.profile('run()', globals(), locals())
def profileLearnModel2(self):
    """Profile stochastic learnModel on a large, very sparse synthetic dataset.

    Fixes: removed the unused local ``rho``, deleted the dead commented-out
    evaluation code, and stopped unpacking learnModel's results since none
    of them were read.
    """
    X, U, V = DatasetUtils.syntheticDataset1(u=0.001, m=10000, n=1000)

    u = 0.2
    w = 1 - u
    eps = 10**-6
    alpha = 0.5
    k = self.k

    maxLocalAuc = MaxLocalAUC(k, w, alpha=alpha, eps=eps, stochastic=True)
    maxLocalAuc.numRowSamples = 2
    maxLocalAuc.numAucSamples = 10
    maxLocalAuc.maxIterations = 1
    maxLocalAuc.numRecordAucSamples = 100
    maxLocalAuc.recordStep = 10
    maxLocalAuc.initialAlg = "rand"
    maxLocalAuc.rate = "optimal"

    trainTestX = Sampling.shuffleSplitRows(X, maxLocalAuc.folds, 5)
    trainX, testX = trainTestX[0]

    def run():
        # NOTE(review): the second positional argument was True in the
        # original call; assumed to be a verbosity/recording flag — confirm
        # against learnModel's signature.
        maxLocalAuc.learnModel(trainX, True)

    ProfileUtils.profile('run()', globals(), locals())
def testParallelLearnModel(self):
    """Run parallelLearnModel end-to-end on a small problem to check it completes."""
    numpy.random.seed(21)
    m, n, k = 500, 200, 5
    X = SparseUtils.generateSparseBinaryMatrix((m, n), k, csarray=True)

    from wallhack.rankingexp.DatasetUtils import DatasetUtils
    # Overwrite X with the standard synthetic dataset.
    X, U, V = DatasetUtils.syntheticDataset1()

    w = 1 - 0.1
    maxLocalAuc = MaxLocalAUC(k, w, alpha=1.0, eps=0.05, stochastic=True)
    config = dict(maxIterations=3, recordStep=1, rate="optimal",
                  t0=2.0, validationUsers=0.0, numProcesses=4)
    for name, value in config.items():
        setattr(maxLocalAuc, name, value)

    # Let the OS schedule this process on any CPU.
    os.system('taskset -p 0xffffffff %d' % os.getpid())
    print(X.nnz / maxLocalAuc.numAucSamples)

    U, V = maxLocalAuc.parallelLearnModel(X)
learners = [("SoftImpute", softImpute), ("WRMF", wrmf), ("KNN", knn), ("MLAUC", maxLocalAuc), ("SLIM", slim)] #Figure out the correct learner for tempLearnerName, tempLearner in learners: if args.alg == tempLearnerName: learnerName = tempLearnerName learner = tempLearner if "learner" not in globals(): raise ValueError("Learner not found: " + learnerName) os.system('taskset -p 0xffffffff %d' % os.getpid()) for dataset in datasets: X = DatasetUtils.mendeley2(minNnzRows=0, dataset=dataset) outputFilename = resultsDir + "Results_" + learnerName + "_" + dataset + ".npz" similaritiesFileName = resultsDir + "Recommendations_" + learnerName + "_" + dataset + ".csv" fileLock = FileLock(outputFilename) if not (fileLock.isLocked() or fileLock.fileExists()) or overwrite: fileLock.lock() logging.debug(learner) try: #Do some recommendation if type(learner) == IterativeSoftImpute: trainX = X.toScipyCsc() trainIterator = iter([trainX])
Util.setupScript()

"""
Script to see if model selection is the same on a subset of rows or elements. We use bounds on the rows of U and V.
"""

# Dataset name comes from the command line, defaulting to the synthetic one.
if len(sys.argv) > 1:
    dataset = sys.argv[1]
else:
    dataset = "synthetic"

saveResults = True
prefix = "Regularisation5"
outputFile = PathDefaults.getOutputDir() + "ranking/" + prefix + dataset.title() + "Results.npz"
X = DatasetUtils.getDataset(dataset)

# Single train/test split with 5 held-out items per row.
testSize = 5
trainTestXs = Sampling.shuffleSplitRows(X, 1, testSize)
trainX, testX = trainTestXs[0]

logging.debug("Number of non-zero elements: " + str((trainX.nnz, testX.nnz)))

# Learner configuration: k2 factors, stochastic optimisation.
u = 0.1
w = 1-u
k2 = 64
eps = 10**-6
maxLocalAuc = MaxLocalAUC(k2, w, eps=eps, stochastic=True)
maxLocalAuc.alpha = 0.1
# Grid of learning rates 2^0 .. 2^-4 for model selection.
maxLocalAuc.alphas = 2.0**-numpy.arange(0, 5, 1)
# NOTE(review): script continues past this fragment; these locals are read later.
maxLocalAuc.folds = 1
# Script setup: log to stdout, fixed RNG seed, compact numpy printing.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
numpy.random.seed(21)
numpy.set_printoptions(precision=4, suppress=True, linewidth=150)
#numpy.seterr(all="raise")

# Dataset name comes from the command line, defaulting to the synthetic one.
if len(sys.argv) > 1:
    dataset = sys.argv[1]
else:
    dataset = "synthetic"

saveResults = True
prefix = "LossROC"
outputFile = PathDefaults.getOutputDir() + "ranking/" + prefix + dataset.title() + "Results.npz"
# Load at most 20000 non-zeros of the chosen dataset.
X = DatasetUtils.getDataset(dataset, nnz=20000)

m, n = X.shape
u = 0.1
w = 1-u

# 5-fold splits with 5 held-out items per row.
testSize = 5
folds = 5
trainTestXs = Sampling.shuffleSplitRows(X, folds, testSize)

numRecordAucSamples = 200

# Second parameter set (k2, u2, w2) used further down the script.
k2 = 8
u2 = 0.5
w2 = 1-u2
eps = 10**-4
# NOTE(review): `default=dataParser.dataset` reads an attribute off the parser
# object, not the parsed namespace — confirm a default was attached to
# dataParser earlier in the file.
dataParser.add_argument("--dataset", type=str, help="The dataset to use: either Doc or Keyword (default: %(default)s)", default=dataParser.dataset)
devNull, remainingArgs = dataParser.parse_known_args(namespace=dataArgs)
if dataArgs.help:
    # Print combined help (data args + algorithm args) and quit.
    helpParser = argparse.ArgumentParser(description="", add_help=False, parents=[dataParser, RankingExpHelper.newAlgoParser(defaultAlgoArgs)])
    helpParser.print_help()
    exit()

# print args #
logging.info("Data params:")
keys = list(vars(dataArgs).keys())
keys.sort()
for key in keys:
    logging.info(" " + str(key) + ": " + str(dataArgs.__getattribute__(key)))

logging.info("Creating the exp-runner")

#Load/create the dataset - sample at most a million nnzs
X = DatasetUtils.mendeley(dataset=dataArgs.dataset)
numpy.random.seed(21)
X, userInds = Sampling.sampleUsers2(X, 10**6, prune=True)
m, n = X.shape

# Output directory name encodes the dataset variant.
dataArgs.extendedDirName = ""
dataArgs.extendedDirName += "MendeleyCoauthors" + dataParser.dataset

rankingExpHelper = RankingExpHelper(remainingArgs, defaultAlgoArgs, dataArgs.extendedDirName)
rankingExpHelper.printAlgoArgs()
rankingExpHelper.runExperiment(X)
import numpy
import logging
import sys
import argparse
from wallhack.rankingexp.RankingExpHelper import RankingExpHelper
from wallhack.rankingexp.DatasetUtils import DatasetUtils
from sandbox.util.Util import Util

Util.setupScript()

#Create a low rank matrix
X = DatasetUtils.syntheticDataset2()
m, n = X.shape
u = 0.1
w = 1-u

# Arguments related to the dataset
dataArgs = argparse.Namespace()

# Arguments related to the algorithm
defaultAlgoArgs = argparse.Namespace()
defaultAlgoArgs.u = u
#defaultAlgoArgs.validationUsers = 0.0
defaultAlgoArgs.ks = numpy.array([8])

# data args parser #
dataParser = argparse.ArgumentParser(description="", add_help=False)
dataParser.add_argument("-h", "--help", action="store_true", help="show this help message and exit")
devNull, remainingArgs = dataParser.parse_known_args(namespace=dataArgs)
if dataArgs.help:
    # Print combined help (data args + algorithm args).
    helpParser = argparse.ArgumentParser(description="", add_help=False, parents=[dataParser, RankingExpHelper.newAlgoParser(defaultAlgoArgs)])
    helpParser.print_help()
    # NOTE(review): fragment ends here; presumably followed by exit() as in
    # the sibling scripts — continuation not shown.
import numpy
import logging
import sys
import argparse
from wallhack.rankingexp.RankingExpHelper import RankingExpHelper
from wallhack.rankingexp.DatasetUtils import DatasetUtils
from sandbox.util.Util import Util

Util.setupScript()

#Create a low rank matrix
X, U, V = DatasetUtils.syntheticDataset1(u=0.2, sd=0.2)
m, n = X.shape
u = 0.1
w = 1-u

# Arguments related to the dataset
dataArgs = argparse.Namespace()

# Arguments related to the algorithm
defaultAlgoArgs = argparse.Namespace()
# Target 5 relevant items per user (fraction of the n items).
defaultAlgoArgs.u = 5/float(n)
#defaultAlgoArgs.validationUsers = 0.0
defaultAlgoArgs.ks = numpy.array([8])

# data args parser #
dataParser = argparse.ArgumentParser(description="", add_help=False)
dataParser.add_argument("-h", "--help", action="store_true", help="show this help message and exit")
devNull, remainingArgs = dataParser.parse_known_args(namespace=dataArgs)
if dataArgs.help:
    # NOTE(review): fragment ends mid-block; print_help()/exit() presumably
    # follow as in the sibling scripts — continuation not shown.
    helpParser = argparse.ArgumentParser(description="", add_help=False, parents=[dataParser, RankingExpHelper.newAlgoParser(defaultAlgoArgs)])
# Default algorithm settings for this experiment.
defaultAlgoArgs.numRowSamples = 15
defaultAlgoArgs.parallelSGD = True
defaultAlgoArgs.recordFolds = 1
defaultAlgoArgs.validationUsers = 0.0

# data args parser #
dataParser = argparse.ArgumentParser(description="", add_help=False)
dataParser.add_argument("-h", "--help", action="store_true", help="show this help message and exit")
devNull, remainingArgs = dataParser.parse_known_args(namespace=dataArgs)
if dataArgs.help:
    # Print combined help (data args + algorithm args) and quit.
    helpParser = argparse.ArgumentParser(description="", add_help=False, parents=[dataParser, RankingExpHelper.newAlgoParser(defaultAlgoArgs)])
    helpParser.print_help()
    exit()

#Create/load a low rank matrix
X = DatasetUtils.bookCrossing(minNnzRows=10)
(m, n) = X.shape

dataArgs.extendedDirName = ""
dataArgs.extendedDirName += "BookCrossing"

# print args #
logging.info("Running on " + dataArgs.extendedDirName)
logging.info("Data params:")
keys = list(vars(dataArgs).keys())
keys.sort()
for key in keys:
    logging.info(" " + str(key) + ": " + str(dataArgs.__getattribute__(key)))

logging.info("Creating the exp-runner")
from sandbox.util.Util import Util
Util.setupScript()

"""
Script to see if there is an advantage of having independent learning rates alphaU and alphaV
"""

# Dataset name comes from the command line, defaulting to movielens.
if len(sys.argv) > 1:
    dataset = sys.argv[1]
else:
    dataset = "movielens"

saveResults = False
prefix = "LearningRate2"
outputFile = PathDefaults.getOutputDir() + "ranking/" + prefix + dataset.title() + "Results.npz"
X = DatasetUtils.getDataset(dataset)
m, n = X.shape

# Learner configuration: 64 factors, target ~5 relevant items per user.
k2 = 64
u2 = 5/float(n)
w2 = 1-u2
eps = 10**-8
# NOTE(review): `lmbda` is not used in this fragment (the constructor gets
# lmbdaU/lmbdaV literals); it may be read later in the script — confirm.
lmbda = 0.01
maxLocalAuc = MaxLocalAUC(k2, w2, eps=eps, lmbdaU=0.1, lmbdaV=0.1, stochastic=True)
maxLocalAuc.alpha = 0.5
# Learning-rate grid 2^-2, 2^-4, 2^-6, 2^-8.
maxLocalAuc.alphas = 2.0**-numpy.arange(2, 9, 2)
maxLocalAuc.beta = 2
maxLocalAuc.bound = False
maxLocalAuc.delta = 0.1
maxLocalAuc.eta = 20
maxLocalAuc.folds = 2
# Default algorithm settings for this experiment.
defaultAlgoArgs.numRowSamples = 15
defaultAlgoArgs.parallelSGD = True
defaultAlgoArgs.recordFolds = 1
defaultAlgoArgs.validationUsers = 0.0

# data args parser #
dataParser = argparse.ArgumentParser(description="", add_help=False)
dataParser.add_argument("-h", "--help", action="store_true", help="show this help message and exit")
devNull, remainingArgs = dataParser.parse_known_args(namespace=dataArgs)
if dataArgs.help:
    # Print combined help (data args + algorithm args) and quit.
    helpParser = argparse.ArgumentParser(description="", add_help=False, parents=[dataParser, RankingExpHelper.newAlgoParser(defaultAlgoArgs)])
    helpParser.print_help()
    exit()

#Create/load a low rank matrix
X = DatasetUtils.epinions(minNnzRows=10)
(m, n) = X.shape
#For the moment, use a subsample
#modelSelectSamples = 2*10**5
#X, userInds = Sampling.sampleUsers2(X, modelSelectSamples, prune=True)

dataArgs.extendedDirName = ""
dataArgs.extendedDirName += "Epinions"

# print args #
logging.info("Running on " + dataArgs.extendedDirName)
logging.info("Data params:")
keys = list(vars(dataArgs).keys())
keys.sort()
# NOTE(review): fragment ends mid-loop; the body (per-key logging as in the
# sibling scripts) is not shown.
for key in keys:
dataArgs = argparse.Namespace() # Arguments related to the algorithm defaultAlgoArgs = argparse.Namespace() # data args parser # dataParser = argparse.ArgumentParser(description="", add_help=False) dataParser.add_argument("-h", "--help", action="store_true", help="show this help message and exit") devNull, remainingArgs = dataParser.parse_known_args(namespace=dataArgs) if dataArgs.help: helpParser = argparse.ArgumentParser(description="", add_help=False, parents=[dataParser, RankingExpHelper.newAlgoParser(defaultAlgoArgs)]) helpParser.print_help() exit() #Load/create the dataset X = DatasetUtils.movieLens() (m, n) = X.shape defaultAlgoArgs.u = 0.1 defaultAlgoArgs.ks = numpy.array([32, 64, 128]) dataArgs.extendedDirName = "" dataArgs.extendedDirName += "MovieLens" # print args # logging.info("Running on " + dataArgs.extendedDirName) logging.info("Data params:") keys = list(vars(dataArgs).keys()) keys.sort() for key in keys: logging.info(" " + str(key) + ": " + str(dataArgs.__getattribute__(key)))
import sys
import logging  # bug fix: `logging` was used below but never imported
import numpy
import matplotlib
import powerlaw
matplotlib.use("GTK3Agg")
import matplotlib.pyplot as plt
from wallhack.rankingexp.DatasetUtils import DatasetUtils

"""
Do some basic analysis on the recommendation datasets.
"""

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

# Pick exactly one dataset; the alternatives are kept as toggles.
#X, U, V = DatasetUtils.syntheticDataset1()
#X = DatasetUtils.syntheticDataset2()
X = DatasetUtils.movieLens(quantile=100)
#X = DatasetUtils.flixster(quantile=100)
#X = DatasetUtils.mendeley(quantile=50)
print(X.shape)

m, n = X.shape
# Per-user (axis 1) and per-item (axis 0) interaction counts.
userCounts = X.sum(1)
itemCounts = X.sum(0)

# Fit a discrete power law to the item-popularity distribution.
results = powerlaw.Fit(itemCounts, discrete=True, xmax=n)
print(results.power_law.alpha)
print(results.power_law.xmin)
print(results.power_law.xmax)

u = 5
# Default algorithm settings for this experiment.
defaultAlgoArgs.numRowSamples = 15
defaultAlgoArgs.parallelSGD = True
defaultAlgoArgs.recordFolds = 1
defaultAlgoArgs.validationUsers = 0.0

# data args parser #
dataParser = argparse.ArgumentParser(description="", add_help=False)
dataParser.add_argument("-h", "--help", action="store_true", help="show this help message and exit")
devNull, remainingArgs = dataParser.parse_known_args(namespace=dataArgs)
if dataArgs.help:
    # Print combined help (data args + algorithm args) and quit.
    helpParser = argparse.ArgumentParser(description="", add_help=False, parents=[dataParser, RankingExpHelper.newAlgoParser(defaultAlgoArgs)])
    helpParser.print_help()
    exit()

#Create/load a low rank matrix
X = DatasetUtils.flixster()
(m, n) = X.shape
#For the moment, use a subsample
#modelSelectSamples = 2*10**5
#X, userInds = Sampling.sampleUsers2(X, modelSelectSamples, prune=True)

dataArgs.extendedDirName = ""
dataArgs.extendedDirName += "Flixster"

# print args #
logging.info("Running on " + dataArgs.extendedDirName)
logging.info("Data params:")
keys = list(vars(dataArgs).keys())
keys.sort()
# NOTE(review): fragment ends mid-loop; the body (per-key logging as in the
# sibling scripts) is not shown.
for key in keys: