Example #1
    def calculateRMinusY(self, instance: Instance, inputVector: Vector,
                         weights: Matrix) -> Vector:
        """
        The calculateRMinusY method creates a new one-hot Vector for the class label of the given Instance, then it
        multiplies the given input Vector with the given weights Matrix. After normalizing the output, it returns the
        difference between the newly created Vector and the normalized output.

        PARAMETERS
        ----------
        instance : Instance
            Instance is used to get class labels.
        inputVector : Vector
            Input Vector to be multiplied with the weights Matrix.
        weights : Matrix
            Matrix of weights.

        RETURNS
        -------
        Vector
            Difference between newly created Vector and normalized output.
        """
        r = Vector()
        r.initAllZerosExceptOne(
            self.K, self.classLabels.index(instance.getClassLabel()), 1.0)
        o = weights.multiplyWithVectorFromRight(inputVector)
        y = self.normalizeOutput(o)
        return r.difference(y)
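In symbols, with r the one-hot Vector built for the instance's class label, W the weights Matrix and x the input Vector, the method returns

    r - y,    y = softmax(W x),    y_i = e^{(Wx)_i} / \sum_j e^{(Wx)_j},

assuming normalizeOutput is the softmax-style normalization shown in Example #11.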
Example #2
    def train(self, trainSet: InstanceList, parameters: Parameter):
        """
        Training algorithm for the linear discriminant analysis classifier (Introduction to Machine Learning, Alpaydin,
        2015).

        PARAMETERS
        ----------
        trainSet : InstanceList
            Training data given to the algorithm.
        parameters : Parameter
            Parameter of the Lda algorithm.
        """
        w0 = {}
        w = {}
        priorDistribution = trainSet.classDistribution()
        classLists = Partition(trainSet)
        covariance = Matrix(trainSet.get(0).continuousAttributeSize(), trainSet.get(0).continuousAttributeSize())
        for i in range(classLists.size()):
            averageVector = Vector(classLists.get(i).continuousAverage())
            classCovariance = classLists.get(i).covariance(averageVector)
            classCovariance.multiplyWithConstant(classLists.get(i).size() - 1)
            covariance.add(classCovariance)
        covariance.divideByConstant(trainSet.size() - classLists.size())
        covariance.inverse()
        for i in range(classLists.size()):
            Ci = classLists.get(i).getClassLabel()
            averageVector = Vector(classLists.get(i).continuousAverage())
            wi = covariance.multiplyWithVectorFromRight(averageVector)
            w[Ci] = wi
            w0i = -0.5 * wi.dotProduct(averageVector) + math.log(priorDistribution.getProbability(Ci))
            w0[Ci] = w0i
        self.model = LdaModel(priorDistribution, w, w0)
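In the notation of Alpaydin (2015), the quantities stored in w and w0 correspond to the usual linear discriminant

    g_i(x) = w_i^T x + w_{i0},    w_i = \Sigma^{-1} m_i,    w_{i0} = -1/2 * w_i^T m_i + log P(C_i),

where m_i is the mean vector of class C_i and \Sigma is the pooled within-class covariance built in the first loop: each class contributes (n_i - 1) times its covariance, the sum is divided by N - K, and the result is inverted.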
Example #3
    def viterbi(self, s: list) -> list:
        """
        viterbi calculates the most probable state sequence for a set of observed symbols.

        PARAMETERS
        ----------
        s : list
            A set of observed symbols.

        RETURNS
        -------
        list
            The most probable state sequence as a list.
        """
        result = []
        sequenceLength = len(s)
        gamma = Matrix(sequenceLength, self.stateCount * self.stateCount)
        phi = Matrix(sequenceLength, self.stateCount * self.stateCount)
        qs = Vector(sequenceLength, 0)
        emission1 = s[0]
        emission2 = s[1]
        for i in range(self.stateCount):
            for j in range(self.stateCount):
                observationLikelihood = self.states[i].getEmitProb(
                    emission1) * self.states[j].getEmitProb(emission2)
                gamma.setValue(
                    1, i * self.stateCount + j,
                    self.safeLog(self.__pi.getValue(i, j)) +
                    self.safeLog(observationLikelihood))
        for t in range(2, sequenceLength):
            emission = s[t]
            for j in range(self.stateCount * self.stateCount):
                current = self.__logOfColumn(j)
                previous = gamma.getRowVector(t - 1).skipVector(
                    self.stateCount, j // self.stateCount)
                current.addVector(previous)
                maxIndex = current.maxIndex()
                observationLikelihood = self.states[
                    j % self.stateCount].getEmitProb(emission)
                gamma.setValue(
                    t, j,
                    current.getValue(maxIndex) +
                    self.safeLog(observationLikelihood))
                phi.setValue(t, j,
                             maxIndex * self.stateCount + j // self.stateCount)
        qs.setValue(sequenceLength - 1,
                    gamma.getRowVector(sequenceLength - 1).maxIndex())
        result.insert(
            0, self.states[int(qs.getValue(sequenceLength - 1)) %
                           self.stateCount].getState())
        for i in range(sequenceLength - 2, 0, -1):
            qs.setValue(i, phi.getValue(i + 1, int(qs.getValue(i + 1))))
            result.insert(
                0,
                self.states[int(qs.getValue(i)) % self.stateCount].getState())
        result.insert(
            0, self.states[int(qs.getValue(1)) // self.stateCount].getState())
        return result
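Here gamma and phi are indexed over flattened state pairs (i, j) -> i * stateCount + j, so this is the Viterbi recursion of a second-order HMM written in log space. Roughly, for each time step t and pair index j,

    gamma_t(j) = max_i [ gamma_{t-1}(i) + log a_{i -> j} ] + log b_j(o_t),    phi_t(j) = argmax_i [ gamma_{t-1}(i) + log a_{i -> j} ],

where i ranges over the compatible predecessor pairs selected by skipVector, and the state sequence is recovered by backtracking through phi.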
Example #4
    def addVectorAttribute(self, vector: Vector):
        """
        Adds a Vector of continuous attributes.

        PARAMETERS
        ----------
        vector : Vector
            Vector that has the continuous attributes.
        """
        for i in range(vector.size()):
            self.__attributes.append(ContinuousAttribute(vector.getValue(i)))
Example #5
    def kMeansClustering(self, iteration: int, k: int) -> list:
        """
        The kMeansClustering method takes an integer iteration and k as inputs. K-means clustering aims to partition n
        observations into k clusters in which each observation belongs to the cluster with the nearest mean.

        PARAMETERS
        ----------
        iteration : int
            Number of k-means iterations to run.
        k : int
            Number of clusters.

        RETURNS
        -------
        list
            list result which holds the k-means clustered words.
        """
        result = []
        means = []
        vectorSize = self.words[0].getVector().size()
        for i in range(k):
            result.append([])
            v = Vector()
            v.initAllSame(vectorSize, 0)
            means.append(v)
        for i in range(len(self.words)):
            result[i % k].append(self.words[i])
            means[i % k].addVector(self.words[i].getVector())
        for i in range(k):
            means[i].divide(len(result[i]))
            means[i].divide(math.sqrt(means[i].dotProductWithSelf()))
        for i in range(iteration):
            for j in range(k):
                result[j].clear()
            for vectorizedWord in self.words:
                maxClusterDistance = means[0].dotProduct(
                    vectorizedWord.getVector())
                maxClusterIndex = 0
                for j in range(1, k):
                    clusterDistance = means[j].dotProduct(
                        vectorizedWord.getVector())
                    if clusterDistance > maxClusterDistance:
                        maxClusterDistance = clusterDistance
                        maxClusterIndex = j
                result[maxClusterIndex].append(vectorizedWord)
            for j in range(k):
                means[j].clear()
                for word in result[j]:
                    means[j].addVector(word.getVector())
                means[j].divide(len(result[j]))
                means[j].divide(math.sqrt(means[j].dotProductWithSelf()))
        return result
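Note that each mean is divided by its L2 norm (the square root of its dot product with itself), so the assignment step picks the cluster whose unit-length mean has the largest dot product with the word vector, which is equivalent to maximizing cosine similarity (a spherical variant of k-means):

    cluster(w) = argmax_j  \hat{\mu}_j \cdot v_w,    \hat{\mu}_j = \mu_j / ||\mu_j||.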
Example #6
    def sumOfRows(self) -> Vector:
        """
        The sumOfRows method creates a new result Vector and adds the result of the columnSum method for each
        corresponding index to the newly created result Vector.

        RETURNS
        -------
        Vector
            Vector that holds the column sums.
        """
        result = Vector()
        for i in range(self.__col):
            result.add(self.columnSum(i))
        return result
Example #7
    def train(self, trainSet: InstanceList, parameters: Parameter):
        """
        Training algorithm for the quadratic discriminant analysis classifier (Introduction to Machine Learning,
        Alpaydin, 2015).

        PARAMETERS
        ----------
        trainSet : InstanceList
            Training data given to the algorithm.
        parameters : Parameter
            Parameters of the QDA algorithm.
        """
        w0 = {}
        w = {}
        W = {}
        classLists = Partition(trainSet)
        priorDistribution = trainSet.classDistribution()
        for i in range(classLists.size()):
            Ci = classLists.get(i).getClassLabel()
            averageVector = Vector(classLists.get(i).continuousAverage())
            classCovariance = classLists.get(i).covariance(averageVector)
            determinant = classCovariance.determinant()
            classCovariance.inverse()
            Wi = deepcopy(classCovariance)
            Wi.multiplyWithConstant(-0.5)
            W[Ci] = Wi
            wi = classCovariance.multiplyWithVectorFromLeft(averageVector)
            w[Ci] = wi
            w0i = -0.5 * (wi.dotProduct(averageVector) + math.log(determinant)) + \
                math.log(priorDistribution.getProbability(Ci))
            w0[Ci] = w0i
        self.model = QdaModel(priorDistribution, W, w, w0)
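In the notation of Alpaydin (2015), the per-class quantities stored in W, w and w0 correspond to the quadratic discriminant

    g_i(x) = x^T W_i x + w_i^T x + w_{i0},    W_i = -1/2 * S_i^{-1},    w_i = S_i^{-1} m_i,
    w_{i0} = -1/2 * ( m_i^T S_i^{-1} m_i + log |S_i| ) + log P(C_i),

where S_i and m_i are the covariance matrix and mean vector of class C_i.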
Example #8
    def convertInstance(self, instance: Instance):
        """
        The convertInstance method takes an Instance as input and creates a Vector from its continuous attributes.
        After removing all attributes of the given instance, it adds a new ContinuousAttribute for the dot product of
        the attribute Vector with each eigenvector.

        PARAMETERS
        ----------
        instance : Instance
            Instance that will be converted to ContinuousAttribute by using eigenvectors.
        """
        attributes = Vector(instance.continuousAttributes())
        instance.removeAllAttributes()
        for eigenvector in self.__eigenvectors:
            instance.addAttribute(
                ContinuousAttribute(attributes.dotProduct(eigenvector)))
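Writing e_1, ..., e_K for the stored eigenvectors and x for the instance's continuous attribute vector, the converted instance carries one ContinuousAttribute per eigenvector,

    z_k = e_k^T x,    k = 1, ..., K,

i.e. the projection of the attribute vector onto each retained eigenvector.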
Example #9
    def addRowVector(self, rowNo: int, v: Vector):
        """
        The addRowVector method takes a row number and a Vector as inputs. It adds the values of the given Vector to
        the corresponding values at the given row of the values list. If the size of the Vector does not match the
        column count of the Matrix, it raises a MatrixColumnMismatch exception.

        PARAMETERS
        ----------
        rowNo : int
            integer input for row number.
        v : Vector
            Vector type input.
        """
        if self.__col != v.size():
            raise MatrixColumnMismatch
        for i in range(self.__col):
            self.__values[rowNo][i] += v.getValue(i)
Example #10
    def calculatePi(self, observations: list):
        """
        calculatePi calculates the prior probability vector (initial probabilities for each state) from a set of
        observations. For each observation, the function extracts the first state in that observation. Normalizing the
        counts of the states returns us the prior probabilities for each state.

        PARAMETERS
        ----------
        observations : list
            A set of observations used to calculate the prior probabilities.
        """
        self.__pi = Vector()
        self.__pi.initAllSame(self.stateCount, 0.0)
        for observation in observations:
            index = self.stateIndexes[observation[0]]
            self.__pi.addValue(index, 1.0)
        self.__pi.l1Normalize()
Example #11
    def normalizeOutput(self, o: Vector) -> Vector:
        """
        The normalizeOutput method takes an input Vector o, computes e^o for each element of o (capping elements at
        500 to avoid overflow), and sums the results. At the end, it divides each e^o by the summation.

        PARAMETERS
        ----------
        o : Vector
            Vector to normalize.

        RETURNS
        -------
        Vector
            Normalized vector.
        """
        total = 0.0
        values = []
        for i in range(o.size()):
            if o.getValue(i) > 500:
                total += math.exp(500)
            else:
                total += math.exp(o.getValue(i))
        for i in range(o.size()):
            if o.getValue(i) > 500:
                values.append(math.exp(500) / total)
            else:
                values.append(math.exp(o.getValue(i)) / total)
        return Vector(values)
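In other words, the method computes a numerically guarded softmax, with every component of o capped at 500 before exponentiation so that math.exp does not overflow:

    y_i = e^{min(o_i, 500)} / \sum_j e^{min(o_j, 500)}.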
Example #12
    def __logOfColumn(self, column: int) -> Vector:
        """
        logOfColumn calculates the logarithm of each value in a specific column in the transition probability matrix.

        PARAMETERS
        ----------
        column : int
            Column index of the transition probability matrix.

        RETURNS
        -------
        Vector
            A vector consisting of the logarithm of each value in the column in the transition probability matrix.
        """
        result = Vector()
        for i in range(self.stateCount):
            result.add(self.safeLog(self.transitionProbabilities.getValue(i, column)))
        return result
Example #13
 def setUp(self):
     self.small = Matrix(3, 3)
     for i in range(3):
         for j in range(3):
             self.small.setValue(i, j, 1.0)
     self.v = Vector(3, 1.0)
     self.large = Matrix(1000, 1000)
     for i in range(1000):
         for j in range(1000):
             self.large.setValue(i, j, 1.0)
     self.medium = Matrix(100, 100)
     for i in range(100):
         for j in range(100):
             self.medium.setValue(i, j, 1.0)
     self.V = Vector(1000, 1.0)
     self.vr = Vector(100, 1.0)
     self.random = Matrix(100, 100, 1, 10, 1)
     self.originalSum = self.random.sumOfElements()
     self.identity = Matrix(100)
Example #14
 def setUp(self):
     data2 = [8, 7, 6, 5, 4]
     self.smallVector1 = Vector(self.data1)
     self.smallVector2 = Vector(data2)
     largeData1 = []
     for i in range(1, 1001):
         largeData1.append(i)
     self.largeVector1 = Vector(largeData1)
     largeData2 = []
     for i in range(1, 1001):
         largeData2.append(1000 - i + 1)
     self.largeVector2 = Vector(largeData2)
Example #15
    def multiplyWithVectorFromRight(self, v: Vector) -> Vector:
        """
        The multiplyWithVectorFromRight method takes a Vector as input and creates a result Vector. For each row, it
        accumulates the products of the row's values with the corresponding values of the input Vector and appends the
        sum to the result Vector. If the size of the Vector does not match the column count of the Matrix, it raises a
        MatrixColumnMismatch exception.

        PARAMETERS
        ----------
        v : Vector
            Vector type input.

        RETURNS
        -------
        Vector
            Vector that holds the result.
        """
        if self.__col != v.size():
            raise MatrixColumnMismatch
        result = Vector()
        for i in range(self.__row):
            total = 0.0
            for j in range(self.__col):
                total += v.getValue(j) * self.__values[i][j]
            result.add(total)
        return result
Example #16
    def toVector(self) -> Vector:
        """
        The toVector method returns a Vector of continuous attributes and discrete indexed attributes.

        RETURNS
        -------
        Vector
            Vector of continuous attributes and discrete indexed attributes.
        """
        values = []
        for attribute in self.__attributes:
            values.extend(attribute.continuousAttributes())
        return Vector(values)
Example #17
 def train(self):
     """
     The train method creates an averageVector from the continuous attribute averages of the instance list and a
     covariance Matrix around that averageVector. It then finds the eigenvectors of that covariance matrix and removes
     the unnecessary ones.
     """
     averageVector = Vector(
         self.dataSet.getInstanceList().continuousAverage())
     covariance = self.dataSet.getInstanceList().covariance(averageVector)
     self.__eigenvectors = covariance.characteristics()
     if self.__numberOfDimensions != -1:
         self.__removeAllEigenvectorsExceptTheMostImportantK()
     else:
         self.__removeUnnecessaryEigenvectors()
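The eigenvectors extracted here come from the sample covariance of the data around its mean vector m; up to the 1/N versus 1/(N - 1) normalization chosen by the covariance method, that matrix is

    \Sigma \propto \sum_{t=1}^{N} (x^t - m)(x^t - m)^T.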
Example #18
    def getRowVector(self, row: int) -> Vector:
        """
        The getRowVector method returns the vector of values list at given row input.

        PARAMETERS
        ----------
        row : int
            Integer input for the row number.

        RETURNS
        -------
        Vector
            Vector of values list at given row input.
        """
        rowList = self.__values[row]
        rowVector = Vector(rowList)
        return rowVector
Example #19
    def viterbi(self, s: list) -> list:
        """
        viterbi calculates the most probable state sequence for a set of observed symbols.

        PARAMETERS
        ----------
        s : list
            A set of observed symbols.

        RETURNS
        -------
        list
            The most probable state sequence as a list.
        """
        result = []
        sequenceLength = len(s)
        gamma = Matrix(sequenceLength, self.stateCount)
        phi = Matrix(sequenceLength, self.stateCount)
        qs = Vector(sequenceLength, 0)
        emission = s[0]
        for i in range(self.stateCount):
            observationLikelihood = self.states[i].getEmitProb(emission)
            gamma.setValue(0, i, self.safeLog(self.__pi.getValue(i)) + self.safeLog(observationLikelihood))
        for t in range(1, sequenceLength):
            emission = s[t]
            for j in range(self.stateCount):
                tempArray = self.__logOfColumn(j)
                tempArray.addVector(gamma.getRowVector(t - 1))
                maxIndex = tempArray.maxIndex()
                observationLikelihood = self.states[j].getEmitProb(emission)
                gamma.setValue(t, j, tempArray.getValue(maxIndex) + self.safeLog(observationLikelihood))
                phi.setValue(t, j, maxIndex)
        qs.setValue(sequenceLength - 1, gamma.getRowVector(sequenceLength - 1).maxIndex())
        result.insert(0, self.states[int(qs.getValue(sequenceLength - 1))].getState())
        for i in range(sequenceLength - 2, -1, -1):
            qs.setValue(i, phi.getValue(i + 1, int(qs.getValue(i + 1))))
            result.insert(0, self.states[int(qs.getValue(i))].getState())
        return result
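This is the standard first-order Viterbi recursion in log space: gamma holds the best log score of any state sequence ending in a given state, and phi the corresponding backpointers,

    gamma_1(i) = log pi_i + log b_i(o_1),
    gamma_t(j) = max_i [ gamma_{t-1}(i) + log a_{ij} ] + log b_j(o_t),    phi_t(j) = argmax_i [ gamma_{t-1}(i) + log a_{ij} ],

with the most probable sequence recovered by backtracking through phi from the best final state.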
Example #20
    def __linearRegressionOnCountsOfCounts(self, countsOfCounts: list) -> list:
        """
        Given counts of counts, this function calculates the estimated counts of counts c* with Good-Turing smoothing.
        First, the algorithm filters the non-zero counts from the counts of counts array and constructs the c and r
        arrays. Then it constructs the Z_n array with Z_n = 2 c_n / (r_{n+1} - r_{n-1}). The algorithm then uses simple
        linear regression on the Z_n values to estimate w_1 and w_0, where log(N[i]) = w_1 * log(i) + w_0.

        PARAMETERS
        ----------
        countsOfCounts : list
            Counts of counts. countsOfCounts[1] is the number of words that occurred once in the corpus;
            countsOfCounts[i] is the number of words that occurred i times in the corpus.

        RETURNS
        -------
        list
            Estimated counts of counts array. N[1] is the estimated count for out-of-vocabulary words.
        """
        N = [0.0] * len(countsOfCounts)
        r = []
        c = []
        for i in range(1, len(countsOfCounts)):
            if countsOfCounts[i] != 0:
                r.append(i)
                c.append(countsOfCounts[i])
        A = Matrix(2, 2)
        y = Vector(2, 0)
        for i in range(len(r)):
            xt = math.log(r[i])
            if i == 0:
                rt = math.log(c[i])
            else:
                if i == len(r) - 1:
                    rt = math.log((1.0 * c[i]) / (r[i] - r[i - 1]))
                else:
                    rt = math.log((2.0 * c[i]) / (r[i + 1] - r[i - 1]))
            A.addValue(0, 0, 1.0)
            A.addValue(0, 1, xt)
            A.addValue(1, 0, xt)
            A.addValue(1, 1, xt * xt)
            y.addValue(0, rt)
            y.addValue(1, rt * xt)
        A.inverse()
        w = A.multiplyWithVectorFromRight(y)
        w0 = w.getValue(0)
        w1 = w.getValue(1)
        for i in range(1, len(countsOfCounts)):
            N[i] = math.exp(math.log(i) * w1 + w0)
        return N
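The loop over r accumulates the normal equations of the least-squares fit log Z_r = w_0 + w_1 * log r, with Z_r the averaged counts computed from c and r above: A collects n, \sum log r and \sum (log r)^2, while y collects \sum log Z_r and \sum log r * log Z_r. Solving w = A^{-1} y and exponentiating gives the smoothed counts

    N[i] = e^{w_1 log i + w_0} = i^{w_1} e^{w_0}.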
Example #21
    def calculateOneMinusHidden(self, hidden: Vector) -> Vector:
        """
        The calculateOneMinusHidden method takes a Vector as input. It creates a Vector of ones and returns the
        difference between that ones Vector and the given Vector.

        PARAMETERS
        ----------
        hidden : Vector
            Vector to find difference.

        RETURNS
        -------
        Vector
            Returns the difference between ones Vector and input Vector.
        """
        one = Vector()
        one.initAllSame(hidden.size(), 1.0)
        return one.difference(hidden)
Example #22
class MatrixTest(unittest.TestCase):
    def setUp(self):
        self.small = Matrix(3, 3)
        for i in range(3):
            for j in range(3):
                self.small.setValue(i, j, 1.0)
        self.v = Vector(3, 1.0)
        self.large = Matrix(1000, 1000)
        for i in range(1000):
            for j in range(1000):
                self.large.setValue(i, j, 1.0)
        self.medium = Matrix(100, 100)
        for i in range(100):
            for j in range(100):
                self.medium.setValue(i, j, 1.0)
        self.V = Vector(1000, 1.0)
        self.vr = Vector(100, 1.0)
        self.random = Matrix(100, 100, 1, 10, 1)
        self.originalSum = self.random.sumOfElements()
        self.identity = Matrix(100)

    def test_ColumnWiseNormalize(self):
        mClone = self.small.clone()
        mClone.columnWiseNormalize()
        self.assertEqual(3, mClone.sumOfElements())
        MClone = self.large.clone()
        MClone.columnWiseNormalize()
        self.assertAlmostEqual(1000, MClone.sumOfElements(), 3)
        self.identity.columnWiseNormalize()
        self.assertEqual(100, self.identity.sumOfElements())

    def test_MultiplyWithConstant(self):
        self.small.multiplyWithConstant(4)
        self.assertEqual(36, self.small.sumOfElements())
        self.small.divideByConstant(4)
        self.large.multiplyWithConstant(1.001)
        self.assertAlmostEqual(1001000, self.large.sumOfElements(), 3)
        self.large.divideByConstant(1.001)
        self.random.multiplyWithConstant(3.6)
        self.assertAlmostEqual(self.originalSum * 3.6,
                               self.random.sumOfElements(), 4)
        self.random.divideByConstant(3.6)

    def test_DivideByConstant(self):
        self.small.divideByConstant(4)
        self.assertEqual(2.25, self.small.sumOfElements())
        self.small.multiplyWithConstant(4)
        self.large.divideByConstant(10)
        self.assertAlmostEqual(100000, self.large.sumOfElements(), 3)
        self.large.multiplyWithConstant(10)
        self.random.divideByConstant(3.6)
        self.assertAlmostEqual(self.originalSum / 3.6,
                               self.random.sumOfElements(), 4)
        self.random.multiplyWithConstant(3.6)

    def test_Add(self):
        self.random.add(self.identity)
        self.assertAlmostEqual(self.originalSum + 100,
                               self.random.sumOfElements(), 4)
        self.random.subtract(self.identity)

    def test_AddVector(self):
        self.large.addRowVector(4, self.V)
        self.assertEqual(1001000, self.large.sumOfElements())
        self.V.multiply(-1.0)
        self.large.addRowVector(4, self.V)
        self.V.multiply(-1.0)

    def test_Subtract(self):
        self.random.subtract(self.identity)
        self.assertAlmostEqual(self.originalSum - 100,
                               self.random.sumOfElements(), 4)
        self.random.add(self.identity)

    def test_MultiplyWithVectorFromLeft(self):
        result = self.small.multiplyWithVectorFromLeft(self.v)
        self.assertEqual(9, result.sumOfElements())
        result = self.large.multiplyWithVectorFromLeft(self.V)
        self.assertEqual(1000000, result.sumOfElements())
        result = self.random.multiplyWithVectorFromLeft(self.vr)
        self.assertAlmostEqual(self.originalSum, result.sumOfElements(), 4)

    def test_MultiplyWithVectorFromRight(self):
        result = self.small.multiplyWithVectorFromRight(self.v)
        self.assertEqual(9, result.sumOfElements())
        result = self.large.multiplyWithVectorFromRight(self.V)
        self.assertEqual(1000000, result.sumOfElements())
        result = self.random.multiplyWithVectorFromRight(self.vr)
        self.assertAlmostEqual(self.originalSum, result.sumOfElements(), 4)

    def test_ColumnSum(self):
        self.assertEqual(3, self.small.columnSum(randrange(3)))
        self.assertEqual(1000, self.large.columnSum(randrange(1000)))
        self.assertEqual(1, self.identity.columnSum(randrange(100)))

    def test_SumOfRows(self):
        self.assertEqual(9, self.small.sumOfRows().sumOfElements())
        self.assertEqual(1000000, self.large.sumOfRows().sumOfElements())
        self.assertEqual(100, self.identity.sumOfRows().sumOfElements())
        self.assertAlmostEqual(self.originalSum,
                               self.random.sumOfRows().sumOfElements(), 3)

    def test_RowSum(self):
        self.assertEqual(3, self.small.rowSum(randrange(3)))
        self.assertEqual(1000, self.large.rowSum(randrange(1000)))
        self.assertEqual(1, self.identity.rowSum(randrange(100)))

    def test_Multiply(self):
        result = self.small.multiply(self.small)
        self.assertEqual(27, result.sumOfElements())
        result = self.medium.multiply(self.medium)
        self.assertEqual(1000000.0, result.sumOfElements())
        result = self.random.multiply(self.identity)
        self.assertEqual(self.originalSum, result.sumOfElements())
        result = self.identity.multiply(self.random)
        self.assertEqual(self.originalSum, result.sumOfElements())

    def test_ElementProduct(self):
        result = self.small.elementProduct(self.small)
        self.assertEqual(9, result.sumOfElements())
        result = self.large.elementProduct(self.large)
        self.assertEqual(1000000, result.sumOfElements())
        result = self.random.elementProduct(self.identity)
        self.assertEqual(result.trace(), result.sumOfElements())

    def test_SumOfElements(self):
        self.assertEqual(9, self.small.sumOfElements())
        self.assertEqual(1000000, self.large.sumOfElements())
        self.assertEqual(100, self.identity.sumOfElements())
        self.assertEqual(self.originalSum, self.random.sumOfElements())

    def test_Trace(self):
        self.assertEqual(3, self.small.trace())
        self.assertEqual(1000, self.large.trace())
        self.assertEqual(100, self.identity.trace())

    def test_Transpose(self):
        self.assertEqual(9, self.small.transpose().sumOfElements())
        self.assertEqual(1000000, self.large.transpose().sumOfElements())
        self.assertEqual(100, self.identity.transpose().sumOfElements())
        self.assertAlmostEqual(self.originalSum,
                               self.random.transpose().sumOfElements(), 3)

    def test_IsSymmetric(self):
        self.assertTrue(self.small.isSymmetric())
        self.assertTrue(self.large.isSymmetric())
        self.assertTrue(self.identity.isSymmetric())
        self.assertFalse(self.random.isSymmetric())

    def test_Determinant(self):
        self.assertEqual(0, self.small.determinant())
        self.assertEqual(0, self.large.determinant())
        self.assertEqual(1, self.identity.determinant())

    def test_Inverse(self):
        self.identity.inverse()
        self.assertEqual(100, self.identity.sumOfElements())
        self.random.inverse()
        self.random.inverse()
        self.assertAlmostEqual(self.originalSum, self.random.sumOfElements(),
                               5)

    def test_Characteristics(self):
        vectors = self.small.characteristics()
        self.assertEqual(2, len(vectors))
        vectors = self.identity.characteristics()
        self.assertEqual(100, len(vectors))
        vectors = self.medium.characteristics()
        self.assertEqual(46, len(vectors))
Example #23
class Hmm1(Hmm):

    __pi: Vector

    def __init__(self, states: set, observations: list, emittedSymbols: list):
        """
        A constructor of the Hmm1 class which takes a set of states, an array of observations (each of which is an
        array of states) and an array of emitted symbols (each of which is an array of symbols). The constructor
        calls its super method to calculate the emission probabilities for those states.

        PARAMETERS
        ----------
        states : set
            A Set of states, consisting of all possible states for this problem.
        observations : list
            An array of instances, where each instance consists of an array of states.
        emittedSymbols : list
            An array of instances, where each instance consists of an array of symbols.
        """
        super().__init__(states, observations, emittedSymbols)

    def calculatePi(self, observations: list):
        """
        calculatePi calculates the prior probability vector (initial probabilities for each state) from a set of
        observations. For each observation, the function extracts the first state in that observation. Normalizing the
        counts of the states returns us the prior probabilities for each state.

        PARAMETERS
        ----------
        observations : list
            A set of observations used to calculate the prior probabilities.
        """
        self.__pi = Vector()
        self.__pi.initAllSame(self.stateCount, 0.0)
        for observation in observations:
            index = self.stateIndexes[observation[0]]
            self.__pi.addValue(index, 1.0)
        self.__pi.l1Normalize()

    def calculateTransitionProbabilities(self, observations: list):
        """
        calculateTransitionProbabilities calculates the transition probabilities matrix from each state to another
        state. For each observation and for each transition in each observation, the function gets the states.
        Normalizing the counts of the pair of states returns us the transition probabilities.

        PARAMETERS
        ----------
        observations : list
            A set of observations used to calculate the transition probabilities.
        """
        self.transitionProbabilities = Matrix(self.stateCount, self.stateCount)
        for current in observations:
            for j in range(len(current) - 1):
                fromIndex = self.stateIndexes[current[j]]
                toIndex = self.stateIndexes[current[j + 1]]
                self.transitionProbabilities.increment(fromIndex, toIndex)
        self.transitionProbabilities.columnWiseNormalize()

    def __logOfColumn(self, column: int) -> Vector:
        """
        logOfColumn calculates the logarithm of each value in a specific column in the transition probability matrix.

        PARAMETERS
        ----------
        column : int
            Column index of the transition probability matrix.

        RETURNS
        -------
        Vector
            A vector consisting of the logarithm of each value in the column in the transition probability matrix.
        """
        result = Vector()
        for i in range(self.stateCount):
            result.add(self.safeLog(self.transitionProbabilities.getValue(i, column)))
        return result

    def viterbi(self, s: list) -> list:
        """
        viterbi calculates the most probable state sequence for a set of observed symbols.

        PARAMETERS
        ----------
        s : list
            A set of observed symbols.

        RETURNS
        -------
        list
            The most probable state sequence as a list.
        """
        result = []
        sequenceLength = len(s)
        gamma = Matrix(sequenceLength, self.stateCount)
        phi = Matrix(sequenceLength, self.stateCount)
        qs = Vector(sequenceLength, 0)
        emission = s[0]
        for i in range(self.stateCount):
            observationLikelihood = self.states[i].getEmitProb(emission)
            gamma.setValue(0, i, self.safeLog(self.__pi.getValue(i)) + self.safeLog(observationLikelihood))
        for t in range(1, sequenceLength):
            emission = s[t]
            for j in range(self.stateCount):
                tempArray = self.__logOfColumn(j)
                tempArray.addVector(gamma.getRowVector(t - 1))
                maxIndex = tempArray.maxIndex()
                observationLikelihood = self.states[j].getEmitProb(emission)
                gamma.setValue(t, j, tempArray.getValue(maxIndex) + self.safeLog(observationLikelihood))
                phi.setValue(t, j, maxIndex)
        qs.setValue(sequenceLength - 1, gamma.getRowVector(sequenceLength - 1).maxIndex())
        result.insert(0, self.states[int(qs.getValue(sequenceLength - 1))].getState())
        for i in range(sequenceLength - 2, -1, -1):
            qs.setValue(i, phi.getValue(i + 1, int(qs.getValue(i + 1))))
            result.insert(0, self.states[int(qs.getValue(i))].getState())
        return result
Example #24
 def test_Sigmoid(self):
     smallVector3 = Vector(self.data1)
     smallVector3.sigmoid()
     self.assertAlmostEqual(0.8807971, smallVector3.getValue(0), 6)
     self.assertAlmostEqual(0.9975274, smallVector3.getValue(4), 6)
Example #25
 def __trainSkipGram(self):
     """
     Main method for training the SkipGram version of Word2Vec algorithm.
     """
     iteration = Iteration(self.__corpus, self.__parameter)
     currentSentence = self.__corpus.getSentence(
         iteration.getSentenceIndex())
     outputs = Vector()
     outputs.initAllSame(self.__parameter.getLayerSize(), 0.0)
     outputUpdate = Vector()
     outputUpdate.initAllSame(self.__parameter.getLayerSize(), 0)
     self.__corpus.shuffleSentences(1)
     while iteration.getIterationCount(
     ) < self.__parameter.getNumberOfIterations():
         iteration.alphaUpdate()
         wordIndex = self.__vocabulary.getPosition(
             currentSentence.getWord(iteration.getSentencePosition()))
         currentWord = self.__vocabulary.getWord(wordIndex)
         outputs.clear()
         outputUpdate.clear()
         b = randrange(self.__parameter.getWindow())
         for a in range(b, self.__parameter.getWindow() * 2 + 1 - b):
             c = iteration.getSentencePosition(
             ) - self.__parameter.getWindow() + a
             if a != self.__parameter.getWindow(
             ) and currentSentence.safeIndex(c):
                 lastWordIndex = self.__vocabulary.getPosition(
                     currentSentence.getWord(c))
                 l1 = lastWordIndex
                 outputUpdate.clear()
                 if self.__parameter.isHierarchicalSoftMax():
                     for d in range(currentWord.getCodeLength()):
                         l2 = currentWord.getPoint(d)
                         f = self.__wordVectors.getRowVector(l1).dotProduct(
                             self.__wordVectorUpdate.getRowVector(l2))
                         if f <= -NeuralNetwork.MAX_EXP or f >= NeuralNetwork.MAX_EXP:
                             continue
                         else:
                             f = self.__expTable[int(
                                 (f + NeuralNetwork.MAX_EXP) *
                                 (NeuralNetwork.EXP_TABLE_SIZE //
                                  NeuralNetwork.MAX_EXP // 2))]
                         g = (1 - currentWord.getCode(d) -
                              f) * iteration.getAlpha()
                         outputUpdate.addVector(
                             self.__wordVectorUpdate.getRowVector(
                                 l2).product(g))
                         self.__wordVectorUpdate.addRowVector(
                             l2,
                             self.__wordVectors.getRowVector(l1).product(g))
                 else:
                     for d in range(
                             self.__parameter.getNegativeSamplingSize() +
                             1):
                         if d == 0:
                             target = wordIndex
                             label = 1
                         else:
                             target = self.__vocabulary.getTableValue(
                                 randrange(
                                     self.__vocabulary.getTableSize()))
                             if target == 0:
                                 target = randrange(
                                     self.__vocabulary.size() - 1) + 1
                             if target == wordIndex:
                                 continue
                             label = 0
                         l2 = target
                         f = self.__wordVectors.getRowVector(l1).dotProduct(
                             self.__wordVectorUpdate.getRowVector(l2))
                         g = self.__calculateG(f, iteration.getAlpha(),
                                               label)
                         outputUpdate.addVector(
                             self.__wordVectorUpdate.getRowVector(
                                 l2).product(g))
                         self.__wordVectorUpdate.addRowVector(
                             l2,
                             self.__wordVectors.getRowVector(l1).product(g))
                 self.__wordVectors.addRowVector(l1, outputUpdate)
         currentSentence = iteration.sentenceUpdate(currentSentence)
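For each (center word, context word) pair, both branches compute an update scale g and then apply the usual skip-gram step: g times the output row vector is accumulated into outputUpdate, and g times the input row vector is added to the output row, after which outputUpdate is added to the input row. In the hierarchical softmax branch, g = (1 - code_d - f) * alpha, where f is the exp-table approximation of the logistic function applied to the dot product of the two rows; in the negative sampling branch the scale comes from __calculateG, whose body is not shown in this snippet, so the conventional form g = (label - sigma(f)) * alpha is an assumption here rather than a statement about that method.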
Example #26
class VectorTest(unittest.TestCase):
    data1 = [2, 3, 4, 5, 6]

    def setUp(self):
        data2 = [8, 7, 6, 5, 4]
        self.smallVector1 = Vector(self.data1)
        self.smallVector2 = Vector(data2)
        largeData1 = []
        for i in range(1, 1001):
            largeData1.append(i)
        self.largeVector1 = Vector(largeData1)
        largeData2 = []
        for i in range(1, 1001):
            largeData2.append(1000 - i + 1)
        self.largeVector2 = Vector(largeData2)

    def test_Biased(self):
        biased = self.smallVector1.biased()
        self.assertEqual(1, biased.getValue(0))
        self.assertEqual(self.smallVector1.size() + 1, biased.size())

    def test_ElementAdd(self):
        self.smallVector1.add(7)
        self.assertEqual(7, self.smallVector1.getValue(5))
        self.assertEqual(6, self.smallVector1.size())
        self.smallVector1.remove(5)

    def test_Insert(self):
        self.smallVector1.insert(3, 6)
        self.assertEqual(6, self.smallVector1.getValue(3))
        self.assertEqual(6, self.smallVector1.size())
        self.smallVector1.remove(3)

    def test_Remove(self):
        self.smallVector1.remove(2)
        self.assertEqual(5, self.smallVector1.getValue(2))
        self.assertEqual(4, self.smallVector1.size())
        self.smallVector1.insert(2, 4)

    def test_SumOfElementsSmall(self):
        self.assertEqual(20, self.smallVector1.sumOfElements())
        self.assertEqual(30, self.smallVector2.sumOfElements())

    def test_SumOfElementsLarge(self):
        self.assertEqual(20, self.smallVector1.sumOfElements())
        self.assertEqual(30, self.smallVector2.sumOfElements())
        self.assertEqual(500500, self.largeVector1.sumOfElements())
        self.assertEqual(500500, self.largeVector2.sumOfElements())

    def test_MaxIndex(self):
        self.assertEqual(4, self.smallVector1.maxIndex())
        self.assertEqual(0, self.smallVector2.maxIndex())

    def test_Sigmoid(self):
        smallVector3 = Vector(self.data1)
        smallVector3.sigmoid()
        self.assertAlmostEqual(0.8807971, smallVector3.getValue(0), 6)
        self.assertAlmostEqual(0.9975274, smallVector3.getValue(4), 6)

    def test_SkipVectorSmall(self):
        smallVector3 = self.smallVector1.skipVector(2, 0)
        self.assertEqual(2, smallVector3.getValue(0))
        self.assertEqual(6, smallVector3.getValue(2))
        smallVector3 = self.smallVector1.skipVector(3, 1)
        self.assertEqual(3, smallVector3.getValue(0))
        self.assertEqual(6, smallVector3.getValue(1))

    def test_SkipVectorLarge(self):
        largeVector3 = self.largeVector1.skipVector(2, 0)
        self.assertEqual(250000, largeVector3.sumOfElements())
        largeVector3 = self.largeVector1.skipVector(5, 3)
        self.assertEqual(100300, largeVector3.sumOfElements())

    def test_VectorAddSmall(self):
        self.smallVector1.addVector(self.smallVector2)
        self.assertEqual(50, self.smallVector1.sumOfElements())
        self.smallVector1.subtract(self.smallVector2)

    def test_VectorAddLarge(self):
        self.largeVector1.addVector(self.largeVector2)
        self.assertEqual(1001000, self.largeVector1.sumOfElements())
        self.largeVector1.subtract(self.largeVector2)

    def test_SubtractSmall(self):
        self.smallVector1.subtract(self.smallVector2)
        self.assertEqual(-10, self.smallVector1.sumOfElements())
        self.smallVector1.addVector(self.smallVector2)

    def test_SubtractLarge(self):
        self.largeVector1.subtract(self.largeVector2)
        self.assertEqual(0, self.largeVector1.sumOfElements())
        self.largeVector1.addVector(self.largeVector2)

    def test_DifferenceSmall(self):
        smallVector3 = self.smallVector1.difference(self.smallVector2)
        self.assertEqual(-10, smallVector3.sumOfElements())

    def test_DifferenceLarge(self):
        largeVector3 = self.largeVector1.difference(self.largeVector2)
        self.assertEqual(0, largeVector3.sumOfElements())

    def test_DotProductWithVectorSmall(self):
        dotProduct = self.smallVector1.dotProduct(self.smallVector2)
        self.assertEqual(110, dotProduct)

    def test_DotProductWithVectorLarge(self):
        dotProduct = self.largeVector1.dotProduct(self.largeVector2)
        self.assertEqual(167167000, dotProduct)

    def test_DotProductWithItselfSmall(self):
        dotProduct = self.smallVector1.dotProductWithSelf()
        self.assertEqual(90, dotProduct)

    def test_DotProductWithItselfLarge(self):
        dotProduct = self.largeVector1.dotProductWithSelf()
        self.assertEqual(333833500, dotProduct)

    def test_ElementProductSmall(self):
        smallVector3 = self.smallVector1.elementProduct(self.smallVector2)
        self.assertEqual(110, smallVector3.sumOfElements())

    def test_ElementProductLarge(self):
        largeVector3 = self.largeVector1.elementProduct(self.largeVector2)
        self.assertEqual(167167000, largeVector3.sumOfElements())

    def test_Divide(self):
        self.smallVector1.divide(10.0)
        self.assertEqual(2, self.smallVector1.sumOfElements())
        self.smallVector1.multiply(10.0)

    def test_Multiply(self):
        self.smallVector1.multiply(10.0)
        self.assertEqual(200, self.smallVector1.sumOfElements())
        self.smallVector1.divide(10.0)

    def test_Product(self):
        smallVector3 = self.smallVector1.product(7.0)
        self.assertEqual(140, smallVector3.sumOfElements())

    def test_L1NormalizeSmall(self):
        self.smallVector1.l1Normalize()
        self.assertEqual(1.0, self.smallVector1.sumOfElements())
        self.smallVector1.multiply(20)

    def test_L1NormalizeLarge(self):
        self.largeVector1.l1Normalize()
        self.assertEqual(1.0, self.largeVector1.sumOfElements())
        self.largeVector1.multiply(500500)

    def test_L2NormSmall(self):
        norm = self.smallVector1.l2Norm()
        self.assertEqual(norm, math.sqrt(90))

    def test_L2NormLarge(self):
        norm = self.largeVector1.l2Norm()
        self.assertEqual(norm, math.sqrt(333833500))

    def test_cosineSimilaritySmall(self):
        similarity = self.smallVector1.cosineSimilarity(self.smallVector2)
        self.assertAlmostEqual(0.8411910, similarity, 6)

    def test_cosineSimilarityLarge(self):
        similarity = self.largeVector1.cosineSimilarity(self.largeVector2)
        self.assertAlmostEqual(0.5007497, similarity, 6)