def learnParameters(self, corpus: list, N: int):
    """
    Wrapper function to learn the parameters (lambda1 and lambda2) in interpolated smoothing. The function
    first creates K NGrams with the train folds of the corpus, then optimizes the lambdas with respect to the
    test folds of the corpus, depending on the given N.

    PARAMETERS
    ----------
    corpus : list
        Train corpus used to optimize the lambda parameters.
    N : int
        N in N-Gram.
    """
    if N <= 1:
        return
    K = 10
    nGrams = []
    kFoldCrossValidation = KFoldCrossValidation(corpus, K, 0)
    for i in range(K):
        nGrams.append(NGram(N, kFoldCrossValidation.getTrainFold(i)))
        for j in range(2, N + 1):
            nGrams[i].calculateNGramProbabilitiesSimpleLevel(self.__simpleSmoothing, j)
        nGrams[i].calculateNGramProbabilitiesSimpleLevel(self.__simpleSmoothing, 1)
    if N == 2:
        self.__lambda1 = self.__learnBestLambda(nGrams, kFoldCrossValidation, 0.1)
    elif N == 3:
        self.__lambda1, self.__lambda2 = self.__learnBestLambdas(nGrams, kFoldCrossValidation, 0.1, 0.1)
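# A minimal usage sketch for the wrapper above, assuming it is a method of an
# InterpolatedSmoothing class with a no-argument constructor (the construction
# details are an assumption; only learnParameters is shown in this section).
# The corpus is a list of tokenized sentences; 20 toy sentences so that the
# internal 10-fold split has data in every fold.
corpus = [["a", "b", "c"]] * 20           # toy corpus of tokenized sentences
smoothing = InterpolatedSmoothing()       # hypothetical construction
smoothing.learnParameters(corpus, 3)      # N = 3: optimizes both lambda1 and lambda2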
def test_LargeSample2Fold(self):
    kFoldCrossValidation = KFoldCrossValidation(self.largeSample, 2, 1)
    for i in range(2):
        items = set()
        trainFold = kFoldCrossValidation.getTrainFold(i)
        testFold = kFoldCrossValidation.getTestFold(i)
        items.update(trainFold)
        items.update(testFold)
        self.assertEqual(500, len(testFold))
        self.assertEqual(500, len(trainFold))
        self.assertEqual(1000, len(items))
def execute(self, experiment: Experiment) -> ExperimentPerformance:
    """
    Execute K-fold cross-validation with a separate test set, using the given classifier on the given data
    set with the given parameters.

    PARAMETERS
    ----------
    experiment : Experiment
        Experiment to be run.

    RETURNS
    -------
    ExperimentPerformance
        An ExperimentPerformance instance.
    """
    result = ExperimentPerformance()
    instanceList = experiment.getDataSet().getInstanceList()
    partition = Partition(instanceList, 0.25, experiment.getParameter().getSeed(), True)
    crossValidation = KFoldCrossValidation(partition.get(1).getInstances(), self.K,
                                           experiment.getParameter().getSeed())
    self.runExperiment(experiment.getClassifier(), experiment.getParameter(), result, crossValidation,
                       partition.get(0))
    return result
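# A self-contained sketch of the split pattern used above: hold out 25% of the
# instances as a separate test set, then run K-fold cross-validation on the
# remaining 75%. Plain Python, no library calls; all names here are
# illustrative and not part of the API shown above.
import random

def separate_test_kfold(instances: list, k: int, seed: int):
    rng = random.Random(seed)
    shuffled = list(instances)
    rng.shuffle(shuffled)
    cut = len(shuffled) // 4                       # 25% held-out test set
    test_set, train_pool = shuffled[:cut], shuffled[cut:]
    fold_size = len(train_pool) // k               # K folds over the remaining 75%
    folds = [train_pool[i * fold_size:(i + 1) * fold_size] for i in range(k)]
    return test_set, folds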
def learnParameters(self, corpus: list, N: int):
    """
    Wrapper function to learn the parameter (delta) in additive smoothing. The function first creates K NGrams
    with the train folds of the corpus, then optimizes delta with respect to the test folds of the corpus.

    PARAMETERS
    ----------
    corpus : list
        Train corpus used to optimize the delta parameter.
    N : int
        N in N-Gram.
    """
    K = 10
    nGrams = []
    kFoldCrossValidation = KFoldCrossValidation(corpus, K, 0)
    for i in range(K):
        nGrams.append(NGram(N, kFoldCrossValidation.getTrainFold(i)))
    self.__delta = self.__learnBestDelta(nGrams, kFoldCrossValidation, 0.1)
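# For reference, the delta being learned above is the pseudo-count of additive
# smoothing: P(w | h) = (c(h, w) + delta) / (c(h) + delta * V), where V is the
# vocabulary size. A minimal standalone illustration (names are illustrative):
def additive_probability(count_hw: int, count_h: int, vocabulary_size: int, delta: float) -> float:
    # delta = 1 gives classic Laplace smoothing; smaller deltas shift less
    # probability mass toward unseen events
    return (count_hw + delta) / (count_h + delta * vocabulary_size)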
def __learnBestLambda(self, nGrams: list, kFoldCrossValidation: KFoldCrossValidation, lowerBound: float) -> float:
    """
    The algorithm tries to optimize the best lambda for a given corpus. The algorithm uses perplexity on the
    validation set as the optimization criterion.

    PARAMETERS
    ----------
    nGrams : list
        10 N-Grams learned for different folds of the corpus. nGrams[i] is the N-Gram trained with the i'th
        train fold of the corpus.
    kFoldCrossValidation : KFoldCrossValidation
        Cross-validation data used in training and testing the N-grams.
    lowerBound : float
        Initial lower bound for optimizing the best lambda.

    RETURNS
    -------
    float
        Best lambda optimized with k-fold cross-validation.
    """
    bestPrevious = -1
    upperBound = 0.999
    bestLambda = (lowerBound + upperBound) / 2
    numberOfParts = 5
    testFolds = []
    for i in range(10):
        testFolds.append(kFoldCrossValidation.getTestFold(i))
    while True:
        bestPerplexity = 1000000000
        value = lowerBound
        while value <= upperBound:
            perplexity = 0
            for i in range(10):
                nGrams[i].setLambda2(value)
                perplexity += nGrams[i].getPerplexity(testFolds[i])
            if perplexity < bestPerplexity:
                bestPerplexity = perplexity
                bestLambda = value
            value += (upperBound - lowerBound) / numberOfParts
        lowerBound = self.newLowerBound(bestLambda, lowerBound, upperBound, numberOfParts)
        upperBound = self.newUpperBound(bestLambda, lowerBound, upperBound, numberOfParts)
        if bestPrevious != -1:
            if math.fabs(bestPrevious - bestPerplexity) / bestPerplexity < 0.001:
                break
        bestPrevious = bestPerplexity
    return bestLambda
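# The search above is an iterative grid refinement: evaluate numberOfParts + 1
# points between the bounds, keep the best, shrink the bounds around it, and
# stop once perplexity changes by less than 0.1% between passes. A
# self-contained sketch of the same pattern for an arbitrary 1-D objective;
# the names are illustrative, and the explicit interval-shrinking step here
# plays the role of newLowerBound/newUpperBound above:
def refine_search(objective, lower: float, upper: float, parts: int = 5, tolerance: float = 0.001) -> float:
    best_prev = None
    best_x = (lower + upper) / 2
    while True:
        best_value = float("inf")
        step = (upper - lower) / parts
        x = lower
        while x <= upper:                      # evaluate the current grid
            value = objective(x)
            if value < best_value:
                best_value, best_x = value, x
            x += step
        # shrink the interval to one grid step around the current best point
        lower, upper = max(lower, best_x - step), min(upper, best_x + step)
        if best_prev is not None and abs(best_prev - best_value) <= tolerance * best_value:
            return best_x
        best_prev = best_value

# Example: minimize a convex toy objective over [0.1, 0.999].
best = refine_search(lambda x: (x - 0.37) ** 2, 0.1, 0.999)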
def execute(self, experiment: Experiment) -> Performance:
    """
    Execute a single K-fold cross-validation run with the given classifier on the given data set using the
    given parameters.

    PARAMETERS
    ----------
    experiment : Experiment
        Experiment to be run.

    RETURNS
    -------
    Performance
        A Performance instance.
    """
    crossValidation = KFoldCrossValidation(experiment.getDataSet().getInstances(), self.__K,
                                           experiment.getParameter().getSeed())
    return self.runExperiment(experiment.getClassifier(), experiment.getParameter(), crossValidation)
def execute(self, experiment: Experiment) -> ExperimentPerformance:
    """
    Execute the MxKFold run with the given classifier on the given data set using the given parameters.

    PARAMETERS
    ----------
    experiment : Experiment
        Experiment to be run.

    RETURNS
    -------
    ExperimentPerformance
        An ExperimentPerformance instance.
    """
    result = ExperimentPerformance()
    for j in range(self.M):
        crossValidation = KFoldCrossValidation(experiment.getDataSet().getInstances(), self.K,
                                               experiment.getParameter().getSeed())
        self.runExperiment(experiment.getClassifier(), experiment.getParameter(), result, crossValidation)
    return result
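# The loop above repeats K-fold cross-validation M times and pools all M * K
# fold results into a single ExperimentPerformance. A standalone sketch of
# that aggregation pattern; run_fold is an illustrative stand-in for one
# train/test round and is not part of the API shown above:
def m_by_k_mean(run_fold, m: int, k: int) -> float:
    scores = []
    for _ in range(m):                 # M repetitions of the K-fold loop
        for fold in range(k):          # K folds per repetition
            scores.append(run_fold(fold))
    return sum(scores) / len(scores)   # mean over all M * K fold scores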
def __learnBestLambdas(self, nGrams: list, kFoldCrossValidation: KFoldCrossValidation, lowerBound1: float,
                       lowerBound2: float) -> tuple:
    """
    The algorithm tries to optimize the best lambdas (lambda1, lambda2) for a given corpus. The algorithm
    uses perplexity on the validation set as the optimization criterion.

    PARAMETERS
    ----------
    nGrams : list
        10 N-Grams learned for different folds of the corpus. nGrams[i] is the N-Gram trained with the i'th
        train fold of the corpus.
    kFoldCrossValidation : KFoldCrossValidation
        Cross-validation data used in training and testing the N-grams.
    lowerBound1 : float
        Initial lower bound for optimizing the best lambda1.
    lowerBound2 : float
        Initial lower bound for optimizing the best lambda2.

    RETURNS
    -------
    tuple
        bestLambda1 and bestLambda2.
    """
    upperBound1 = 0.999
    upperBound2 = 0.999
    bestPrevious = -1
    bestLambda1 = (lowerBound1 + upperBound1) / 2
    bestLambda2 = (lowerBound2 + upperBound2) / 2
    numberOfParts = 5
    testFolds = []
    for i in range(10):
        testFolds.append(kFoldCrossValidation.getTestFold(i))
    while True:
        bestPerplexity = 1000000000
        value1 = lowerBound1
        while value1 <= upperBound1:
            value2 = lowerBound2
            while value2 <= upperBound2 and value1 + value2 < 1:
                perplexity = 0
                for i in range(10):
                    nGrams[i].setLambda3(value1, value2)
                    perplexity += nGrams[i].getPerplexity(testFolds[i])
                if perplexity < bestPerplexity:
                    bestPerplexity = perplexity
                    bestLambda1 = value1
                    bestLambda2 = value2
                # step value2 over its own interval (the original stepped by
                # (upperBound1 - lowerBound1), which desynchronizes the two
                # grids once the bounds start shrinking)
                value2 += (upperBound2 - lowerBound2) / numberOfParts
            value1 += (upperBound1 - lowerBound1) / numberOfParts
        lowerBound1 = self.newLowerBound(bestLambda1, lowerBound1, upperBound1, numberOfParts)
        upperBound1 = self.newUpperBound(bestLambda1, lowerBound1, upperBound1, numberOfParts)
        lowerBound2 = self.newLowerBound(bestLambda2, lowerBound2, upperBound2, numberOfParts)
        upperBound2 = self.newUpperBound(bestLambda2, lowerBound2, upperBound2, numberOfParts)
        if bestPrevious != -1:
            if math.fabs(bestPrevious - bestPerplexity) / bestPerplexity < 0.001:
                break
        bestPrevious = bestPerplexity
    return bestLambda1, bestLambda2
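# What the learned lambdas are used for: interpolated smoothing mixes the
# trigram, bigram, and unigram estimates, which is why the search above
# enforces value1 + value2 < 1 (the unigram weight 1 - lambda1 - lambda2
# must stay positive). A standalone illustration of the standard formula:
def interpolated_trigram(p3: float, p2: float, p1: float, lambda1: float, lambda2: float) -> float:
    # P(w | u, v) = lambda1 * P3(w | u, v) + lambda2 * P2(w | v) + (1 - lambda1 - lambda2) * P1(w)
    return lambda1 * p3 + lambda2 * p2 + (1 - lambda1 - lambda2) * p1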
def test_SmallSample2Fold(self):
    kFoldCrossValidation = KFoldCrossValidation(self.smallSample, 2, 1)
    expected3 = ["7", "9", "10", "8", "6"]
    self.assertEqual(expected3, kFoldCrossValidation.getTestFold(0))

def test_SmallSample5Fold(self):
    kFoldCrossValidation = KFoldCrossValidation(self.smallSample, 5, 1)
    expected2 = ["7", "9"]
    self.assertEqual(expected2, kFoldCrossValidation.getTestFold(0))

def test_SmallSample10Fold(self):
    kFoldCrossValidation = KFoldCrossValidation(self.smallSample, 10, 1)
    expected1 = ["7"]
    self.assertEqual(expected1, kFoldCrossValidation.getTestFold(0))