コード例 #1
0
    def learnParameters(self, corpus: list, N: int):
        """
        Learn the interpolation weights (lambda1 and, for trigrams, lambda2) by 10-fold cross-validation.

        One NGram model is built per train fold; for each model the simple-smoothing probabilities are computed for
        levels 2..N and then for level 1. The weights are then optimized against the matching test folds:
        lambda1 only when N is 2, lambda1 and lambda2 when N is 3. For N <= 1 the method returns immediately, and
        for any other N the models are built but no weight is assigned.

        PARAMETERS
        ----------
        corpus : list
            Train corpus used to optimize lambda parameters
        N : int
            N in N-Gram.
        """
        if N <= 1:
            return
        K = 10
        # Folds are created with a fixed seed of 0.
        folds = KFoldCrossValidation(corpus, K, 0)
        models = []
        for foldIndex in range(K):
            model = NGram(N, folds.getTrainFold(foldIndex))
            models.append(model)
            # Higher-order levels first (2..N), unigram level last.
            for level in list(range(2, N + 1)) + [1]:
                model.calculateNGramProbabilitiesSimpleLevel(
                    self.__simpleSmoothing, level)
        if N == 2:
            self.__lambda1 = self.__learnBestLambda(models, folds, 0.1)
        elif N == 3:
            bestPair = self.__learnBestLambdas(models, folds, 0.1, 0.1)
            (self.__lambda1, self.__lambda2) = bestPair
コード例 #2
0
 def test_LargeSample2Fold(self):
     """
     Check that 2-fold cross-validation (seed 1) over the large sample yields 500-item train and test folds whose
     union contains 1000 distinct items, for each of the two folds.
     """
     kFoldCrossValidation = KFoldCrossValidation(self.largeSample, 2, 1)
     for i in range(2):
         trainFold = kFoldCrossValidation.getTrainFold(i)
         testFold = kFoldCrossValidation.getTestFold(i)
         # The union of both folds must cover every item exactly once.
         items = set()
         items.update(trainFold)
         items.update(testFold)
         # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
         self.assertEqual(500, len(testFold))
         self.assertEqual(500, len(trainFold))
         self.assertEqual(1000, len(items))
コード例 #3
0
    def execute(self, experiment: Experiment) -> ExperimentPerformance:
        """
        Run K-fold cross-validation with a separate test set for the given experiment.

        The instance list of the experiment's data set is partitioned with ratio 0.25 using the experiment's seed.
        Part 1 of the partition is split into K folds for cross-validation, and part 0 is handed to runExperiment as
        the separate test set.

        PARAMETERS
        ----------
        experiment : Experiment
            Experiment to be run.

        RETURNS
        -------
        ExperimentPerformance
            An ExperimentPerformance instance filled in by runExperiment.
        """
        allInstances = experiment.getDataSet().getInstanceList()
        split = Partition(allInstances,
                          0.25,
                          experiment.getParameter().getSeed(),
                          True)
        # Part 1 of the partition feeds the cross-validation folds.
        foldSource = split.get(1).getInstances()
        folds = KFoldCrossValidation(foldSource,
                                     self.K,
                                     experiment.getParameter().getSeed())
        performance = ExperimentPerformance()
        # Part 0 of the partition is passed along as the separate test set.
        self.runExperiment(experiment.getClassifier(),
                           experiment.getParameter(),
                           performance,
                           folds,
                           split.get(0))
        return performance
コード例 #4
0
    def learnParameters(self, corpus: list, N: int):
        """
        Learn the additive-smoothing parameter (delta) by 10-fold cross-validation.

        One NGram model is built from each train fold of the corpus; delta is then optimized against the matching
        test folds, starting from a lower bound of 0.1, and stored on the instance.

        PARAMETERS
        ----------
        corpus : list
            Train corpus used to optimize delta parameter
        N : int
            N in N-Gram.
        """
        foldCount = 10
        # Folds are created with a fixed seed of 0.
        folds = KFoldCrossValidation(corpus, foldCount, 0)
        models = [NGram(N, folds.getTrainFold(foldIndex))
                  for foldIndex in range(foldCount)]
        self.__delta = self.__learnBestDelta(models, folds, 0.1)
コード例 #5
0
    def __learnBestLambda(self, nGrams: list,
                          kFoldCrossValidation: KFoldCrossValidation,
                          lowerBound: float) -> float:
        """
        Optimize a single interpolation weight (lambda) for a given corpus, using the summed perplexity over the
        test folds as the optimization criterion.

        The search evaluates evenly spaced candidates between the current lower and upper bounds, keeps the
        candidate with the lowest total perplexity, narrows the bounds around it, and repeats until the best
        perplexity changes by less than 0.1% relative to the previous round.

        PARAMETERS
        ----------
        nGrams : list
            N-Grams learned for different folds of the corpus. nGrams[i] is the N-Gram trained with i'th train fold
            of the corpus. One test fold is fetched per entry, so its length must not exceed the number of folds.
        kFoldCrossValidation : KFoldCrossvalidation
            Cross-validation data used in training and testing the N-grams.
        lowerBound : float
            Initial lower bound for optimizing the best lambda.

        RETURNS
        -------
        float
            Best lambda optimized with k-fold crossvalidation.
        """
        bestPrevious = -1
        upperBound = 0.999
        bestLambda = (lowerBound + upperBound) / 2
        numberOfParts = 5
        # Derive the fold count from the models instead of hard-coding 10.
        foldCount = len(nGrams)
        testFolds = [kFoldCrossValidation.getTestFold(i)
                     for i in range(foldCount)]
        while True:
            bestPerplexity = 1000000000
            value = lowerBound
            while value <= upperBound:
                # Total perplexity of this candidate over all folds.
                perplexity = 0
                for i in range(foldCount):
                    nGrams[i].setLambda2(value)
                    perplexity += nGrams[i].getPerplexity(testFolds[i])
                if perplexity < bestPerplexity:
                    bestPerplexity = perplexity
                    bestLambda = value
                value += (upperBound - lowerBound) / numberOfParts
            # Narrow the search interval around the best candidate.
            lowerBound = self.newLowerBound(bestLambda, lowerBound, upperBound,
                                            numberOfParts)
            # NOTE(review): newUpperBound receives the already-updated lowerBound; confirm this is intended.
            upperBound = self.newUpperBound(bestLambda, lowerBound, upperBound,
                                            numberOfParts)
            # Stop once the relative improvement drops below 0.1%.
            if bestPrevious != -1:
                if math.fabs(bestPrevious -
                             bestPerplexity) / bestPerplexity < 0.001:
                    break
            bestPrevious = bestPerplexity
        return bestLambda
コード例 #6
0
    def execute(self, experiment: Experiment) -> Performance:
        """
        Run a single K-fold cross-validation for the given experiment and return what runExperiment produces.

        The folds are built from the instances of the experiment's data set, using the configured K and the seed
        taken from the experiment's parameter.

        PARAMETERS
        ----------
        experiment : Experiment
            Experiment to be run.

        RETURNS
        -------
        Performance
            A Performance instance.
        """
        instances = experiment.getDataSet().getInstances()
        seed = experiment.getParameter().getSeed()
        folds = KFoldCrossValidation(instances, self.__K, seed)
        performance = self.runExperiment(experiment.getClassifier(),
                                         experiment.getParameter(),
                                         folds)
        return performance
コード例 #7
0
    def execute(self, experiment: Experiment) -> ExperimentPerformance:
        """
        Run M repetitions of K-fold cross-validation for the given experiment, accumulating every repetition into
        one ExperimentPerformance.

        PARAMETERS
        ----------
        experiment : Experiment
            Experiment to be run.

        RETURNS
        -------
        ExperimentPerformance
            An ExperimentPerformance instance shared by all M repetitions.
        """
        performance = ExperimentPerformance()
        repetition = 0
        while repetition < self.M:
            # NOTE(review): every repetition passes the same seed; whether the folds differ between repetitions
            # depends on KFoldCrossValidation, so confirm.
            instances = experiment.getDataSet().getInstances()
            seed = experiment.getParameter().getSeed()
            folds = KFoldCrossValidation(instances, self.K, seed)
            self.runExperiment(experiment.getClassifier(),
                               experiment.getParameter(),
                               performance,
                               folds)
            repetition += 1
        return performance
コード例 #8
0
    def __learnBestLambdas(self, nGrams: list,
                           kFoldCrossValidation: KFoldCrossValidation,
                           lowerBound1: float, lowerBound2: float) -> tuple:
        """
        Optimize the two interpolation weights (lambda1, lambda2) for a given corpus, using the summed perplexity
        over the test folds as the optimization criterion.

        The search evaluates a grid of (lambda1, lambda2) candidates inside the current bounds, restricted to pairs
        whose sum is below 1, keeps the pair with the lowest total perplexity, narrows both intervals around it, and
        repeats until the best perplexity changes by less than 0.1% relative to the previous round.

        PARAMETERS
        ----------
        nGrams : list
            N-Grams learned for different folds of the corpus. nGrams[i] is the N-Gram trained with i'th train fold
            of the corpus. One test fold is fetched per entry, so its length must not exceed the number of folds.
        kFoldCrossValidation : KFoldCrossValidation
            Cross-validation data used in training and testing the N-grams.
        lowerBound1 : float
            Initial lower bound for optimizing the best lambda1.
        lowerBound2 : float
            Initial lower bound for optimizing the best lambda2.

        RETURNS
        -------
        tuple
            bestLambda1 and bestLambda2
        """
        upperBound1 = 0.999
        upperBound2 = 0.999
        bestPrevious = -1
        bestLambda1 = (lowerBound1 + upperBound1) / 2
        bestLambda2 = (lowerBound2 + upperBound2) / 2
        numberOfParts = 5
        # Derive the fold count from the models instead of hard-coding 10.
        foldCount = len(nGrams)
        testFolds = [kFoldCrossValidation.getTestFold(i)
                     for i in range(foldCount)]
        while True:
            bestPerplexity = 1000000000
            value1 = lowerBound1
            while value1 <= upperBound1:
                value2 = lowerBound2
                while value2 <= upperBound2 and value1 + value2 < 1:
                    # Total perplexity of this candidate pair over all folds.
                    perplexity = 0
                    for i in range(foldCount):
                        nGrams[i].setLambda3(value1, value2)
                        perplexity += nGrams[i].getPerplexity(testFolds[i])
                    if perplexity < bestPerplexity:
                        bestPerplexity = perplexity
                        bestLambda1 = value1
                        bestLambda2 = value2
                    # Step lambda2 over its own interval (the step was previously taken from lambda1's bounds).
                    value2 += (upperBound2 - lowerBound2) / numberOfParts
                value1 += (upperBound1 - lowerBound1) / numberOfParts
            # Narrow both search intervals around the best pair.
            # NOTE(review): each newUpperBound call receives the already-updated lower bound; confirm this is
            # intended.
            lowerBound1 = self.newLowerBound(bestLambda1, lowerBound1,
                                             upperBound1, numberOfParts)
            upperBound1 = self.newUpperBound(bestLambda1, lowerBound1,
                                             upperBound1, numberOfParts)
            lowerBound2 = self.newLowerBound(bestLambda2, lowerBound2,
                                             upperBound2, numberOfParts)
            upperBound2 = self.newUpperBound(bestLambda2, lowerBound2,
                                             upperBound2, numberOfParts)
            # Stop once the relative improvement drops below 0.1%.
            if bestPrevious != -1:
                if math.fabs(bestPrevious -
                             bestPerplexity) / bestPerplexity < 0.001:
                    break
            bestPrevious = bestPerplexity
        return bestLambda1, bestLambda2
コード例 #9
0
 def test_SmallSample2Fold(self):
     """
     Check the first test fold of 2-fold cross-validation over the small sample with seed 1.
     """
     kFoldCrossValidation = KFoldCrossValidation(self.smallSample, 2, 1)
     expected3 = ["7", "9", "10", "8", "6"]
     # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
     self.assertEqual(expected3, kFoldCrossValidation.getTestFold(0))
コード例 #10
0
 def test_SmallSample5Fold(self):
     """
     Check the first test fold of 5-fold cross-validation over the small sample with seed 1.
     """
     kFoldCrossValidation = KFoldCrossValidation(self.smallSample, 5, 1)
     expected2 = ["7", "9"]
     # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
     self.assertEqual(expected2, kFoldCrossValidation.getTestFold(0))
コード例 #11
0
 def test_SmallSample10Fold(self):
     """
     Check the first test fold of 10-fold cross-validation over the small sample with seed 1.
     """
     kFoldCrossValidation = KFoldCrossValidation(self.smallSample, 10, 1)
     expected1 = ["7"]
     # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
     self.assertEqual(expected1, kFoldCrossValidation.getTestFold(0))