Beispiel #1
0
    def kMeansClustering(self, iteration: int, k: int) -> list:
        """
        Partitions the words of this dictionary into k clusters with the
        k-means algorithm, using the dot product of unit-normalized mean
        vectors as the similarity measure.

        PARAMETERS
        ----------
        iteration : int
            Number of reassignment/re-estimation iterations to run.
        k : int
            Number of clusters.

        RETURNS
        -------
        list
            A list of k lists, each holding the words assigned to that cluster.
        """
        result = []
        means = []
        vectorSize = self.words[0].getVector().size()
        for i in range(k):
            result.append([])
            v = Vector()
            v.initAllSame(vectorSize, 0)
            means.append(v)
        # Round-robin initial assignment; accumulate each cluster's vector sum.
        for i in range(len(self.words)):
            result[i % k].append(self.words[i])
            # BUGFIX: add the word's VECTOR to the running sum. The original
            # code did means[i % k].add(self.words[i]).getVector(), which adds
            # the word object itself and discards the getVector() result.
            means[i % k].add(self.words[i].getVector())
        for i in range(k):
            means[i].divide(len(result[i]))
            # Scale to unit length so dotProduct behaves as cosine similarity.
            means[i].divide(math.sqrt(means[i].dotProductWithSelf()))
        for i in range(iteration):
            # Reassign every word to the cluster with the most similar mean.
            for j in range(k):
                result[j].clear()
            for vectorizedWord in self.words:
                maxClusterDistance = means[0].dotProduct(
                    vectorizedWord.getVector())
                maxClusterIndex = 0
                for j in range(1, k):
                    clusterDistance = means[j].dotProduct(
                        vectorizedWord.getVector())
                    if clusterDistance > maxClusterDistance:
                        maxClusterDistance = clusterDistance
                        maxClusterIndex = j
                result[maxClusterIndex].append(vectorizedWord)
            # Re-estimate and re-normalize each cluster mean.
            for j in range(k):
                means[j].clear()
                for word in result[j]:
                    means[j].add(word.getVector())
                # Guard: a cluster can become empty after reassignment; the
                # original divided by zero in that case.
                if len(result[j]) > 0:
                    means[j].divide(len(result[j]))
                    means[j].divide(math.sqrt(means[j].dotProductWithSelf()))
        return result
Beispiel #2
0
    def calculateOneMinusHidden(self, hidden: Vector) -> Vector:
        """
        Computes (1 - hidden) element-wise: builds a vector of ones with the
        same size as the given vector and subtracts the given vector from it.

        PARAMETERS
        ----------
        hidden : Vector
            Vector to subtract from the all-ones vector.

        RETURNS
        -------
        Vector
            The element-wise difference between an all-ones vector and hidden.
        """
        onesVector = Vector()
        onesVector.initAllSame(hidden.size(), 1.0)
        return onesVector.difference(hidden)
Beispiel #3
0
class Hmm1(Hmm):
    """
    First-order Hidden Markov Model: the probability of each state depends
    only on the immediately preceding state. Adds a prior (initial-state)
    probability vector on top of the base Hmm, and decodes observation
    sequences with the Viterbi algorithm in log space.
    """

    # Prior probability of starting in each state; filled in by calculatePi.
    __pi: Vector

    def __init__(self, states: set, observations: list, emittedSymbols: list):
        """
        A constructor of Hmm1 class which takes a Set of states, an array of observations (which also
        consists of an array of states) and an array of instances (which also consists of an array of emitted symbols).
        The constructor calls its super method to calculate the emission probabilities for those states.

        PARAMETERS
        ----------
        states : set
            A Set of states, consisting of all possible states for this problem.
        observations : list
            An array of instances, where each instance consists of an array of states.
        emittedSymbols : list
            An array of instances, where each instance consists of an array of symbols.
        """
        # NOTE(review): calculatePi/calculateTransitionProbabilities are
        # presumably invoked from the superclass constructor — confirm in Hmm.
        super().__init__(states, observations, emittedSymbols)

    def calculatePi(self, observations: list):
        """
        calculatePi calculates the prior probability vector (initial probabilities for each state) from a set of
        observations. For each observation, the function extracts the first state in that observation. Normalizing the
        counts of the states returns us the prior probabilities for each state.

        PARAMETERS
        ----------
        observations : list
            A set of observations used to calculate the prior probabilities.
        """
        self.__pi = Vector()
        self.__pi.initAllSame(self.stateCount, 0.0)
        for observation in observations:
            # Count how often each state occurs first in an observation.
            index = self.stateIndexes[observation[0]]
            self.__pi.addValue(index, 1.0)
        # Turn the counts into probabilities (divide by their sum).
        self.__pi.l1Normalize()

    def calculateTransitionProbabilities(self, observations: list):
        """
        calculateTransitionProbabilities calculates the transition probabilities matrix from each state to another
        state. For each observation and for each transition in each observation, the function gets the states.
        Normalizing the counts of the pair of states returns us the transition probabilities.

        PARAMETERS
        ----------
        observations : list
            A set of observations used to calculate the transition probabilities.
        """
        self.transitionProbabilities = Matrix(self.stateCount, self.stateCount)
        for current in observations:
            # Count every consecutive (state[j] -> state[j+1]) transition.
            for j in range(len(current) - 1):
                fromIndex = self.stateIndexes[current[j]]
                toIndex = self.stateIndexes[current[j + 1]]
                self.transitionProbabilities.increment(fromIndex, toIndex)
        # Normalize counts to probabilities — see Matrix.columnWiseNormalize
        # for the exact normalization axis.
        self.transitionProbabilities.columnWiseNormalize()

    def __logOfColumn(self, column: int) -> Vector:
        """
        logOfColumn calculates the logarithm of each value in a specific column in the transition probability matrix.

        PARAMETERS
        ----------
        column : int
            Column index of the transition probability matrix.

        RETURNS
        -------
        Vector
            A vector consisting of the logarithm of each value in the column in the transition probability matrix.
        """
        result = Vector()
        for i in range(self.stateCount):
            # safeLog guards against taking the log of zero-probability entries.
            result.add(self.safeLog(self.transitionProbabilities.getValue(i, column)))
        return result

    def viterbi(self, s: list) -> list:
        """
        viterbi calculates the most probable state sequence for a set of observed symbols.

        PARAMETERS
        ----------
        s : list
            A set of observed symbols.

        RETURNS
        -------
        list
            The most probable state sequence as an {@link ArrayList}.
        """
        result = []
        sequenceLength = len(s)
        # gamma[t][i]: log-probability of the best path ending in state i at
        # time t. phi[t][i]: backpointer to the best predecessor state.
        gamma = Matrix(sequenceLength, self.stateCount)
        phi = Matrix(sequenceLength, self.stateCount)
        # qs holds the decoded state index per time step; presumably
        # Vector(size, initialValue) — confirm against the Vector API.
        qs = Vector(sequenceLength, 0)
        emission = s[0]
        # Initialization: prior probability times emission probability, in logs.
        for i in range(self.stateCount):
            observationLikelihood = self.states[i].getEmitProb(emission)
            gamma.setValue(0, i, self.safeLog(self.__pi.getValue(i)) + self.safeLog(observationLikelihood))
        # Recursion: extend the best path to each state at each time step.
        for t in range(1, sequenceLength):
            emission = s[t]
            for j in range(self.stateCount):
                # log transition probs into state j, plus previous best scores.
                tempArray = self.__logOfColumn(j)
                tempArray.addVector(gamma.getRowVector(t - 1))
                maxIndex = tempArray.maxIndex()
                observationLikelihood = self.states[j].getEmitProb(emission)
                gamma.setValue(t, j, tempArray.getValue(maxIndex) + self.safeLog(observationLikelihood))
                phi.setValue(t, j, maxIndex)
        # Termination: pick the best final state, then follow backpointers.
        qs.setValue(sequenceLength - 1, gamma.getRowVector(sequenceLength - 1).maxIndex())
        result.insert(0, self.states[int(qs.getValue(sequenceLength - 1))].getState())
        for i in range(sequenceLength - 2, -1, -1):
            qs.setValue(i, phi.getValue(i + 1, int(qs.getValue(i + 1))))
            result.insert(0, self.states[int(qs.getValue(i))].getState())
        return result
 def __trainSkipGram(self):
     """
     Main method for training the SkipGram version of Word2Vec algorithm.

     For each position in each sentence, the words inside a (randomly
     shrunk) context window are used to update the word vectors, either via
     hierarchical softmax or via negative sampling, depending on the
     training parameters.
     """
     iteration = Iteration(self.__corpus, self.__parameter)
     currentSentence = self.__corpus.getSentence(
         iteration.getSentenceIndex())
     # Reusable accumulator vectors, sized to the embedding layer.
     outputs = Vector()
     outputs.initAllSame(self.__parameter.getLayerSize(), 0.0)
     outputUpdate = Vector()
     outputUpdate.initAllSame(self.__parameter.getLayerSize(), 0)
     self.__corpus.shuffleSentences(1)
     while iteration.getIterationCount(
     ) < self.__parameter.getNumberOfIterations():
         # Decay the learning rate as training progresses.
         iteration.alphaUpdate()
         wordIndex = self.__vocabulary.getPosition(
             currentSentence.getWord(iteration.getSentencePosition()))
         currentWord = self.__vocabulary.getWord(wordIndex)
         outputs.clear()
         outputUpdate.clear()
         # Randomly shrink the effective window (standard word2vec trick).
         b = randrange(self.__parameter.getWindow())
         for a in range(b, self.__parameter.getWindow() * 2 + 1 - b):
             # c: sentence position of the candidate context word.
             c = iteration.getSentencePosition(
             ) - self.__parameter.getWindow() + a
             # Skip the center word itself and out-of-bounds positions.
             if a != self.__parameter.getWindow(
             ) and currentSentence.safeIndex(c):
                 lastWordIndex = self.__vocabulary.getPosition(
                     currentSentence.getWord(c))
                 # l1: row of the input (context word) vector being trained.
                 l1 = lastWordIndex
                 outputUpdate.clear()
                 if self.__parameter.isHierarchicalSoftMax():
                     # Walk the Huffman code of the center word; each code
                     # bit is a binary decision at inner node l2.
                     for d in range(currentWord.getCodeLength()):
                         l2 = currentWord.getPoint(d)
                         f = self.__wordVectors.getRowVector(l1).dotProduct(
                             self.__wordVectorUpdate.getRowVector(l2))
                         # Skip saturated scores; otherwise look up the
                         # precomputed sigmoid value from the exp table.
                         if f <= -NeuralNetwork.MAX_EXP or f >= NeuralNetwork.MAX_EXP:
                             continue
                         else:
                             f = self.__expTable[int(
                                 (f + NeuralNetwork.MAX_EXP) *
                                 (NeuralNetwork.EXP_TABLE_SIZE //
                                  NeuralNetwork.MAX_EXP // 2))]
                         # g: gradient scaled by the current learning rate.
                         g = (1 - currentWord.getCode(d) -
                              f) * iteration.getAlpha()
                         outputUpdate.addVector(
                             self.__wordVectorUpdate.getRowVector(
                                 l2).product(g))
                         self.__wordVectorUpdate.addRowVector(
                             l2,
                             self.__wordVectors.getRowVector(l1).product(g))
                 else:
                     # Negative sampling: one positive target (d == 0) plus
                     # getNegativeSamplingSize() random negative targets.
                     for d in range(
                             self.__parameter.getNegativeSamplingSize() +
                             1):
                         if d == 0:
                             target = wordIndex
                             label = 1
                         else:
                             target = self.__vocabulary.getTableValue(
                                 randrange(
                                     self.__vocabulary.getTableSize()))
                             if target == 0:
                                 target = randrange(
                                     self.__vocabulary.size() - 1) + 1
                             # Never use the positive word as a negative.
                             if target == wordIndex:
                                 continue
                             label = 0
                         l2 = target
                         f = self.__wordVectors.getRowVector(l1).dotProduct(
                             self.__wordVectorUpdate.getRowVector(l2))
                         g = self.__calculateG(f, iteration.getAlpha(),
                                               label)
                         outputUpdate.addVector(
                             self.__wordVectorUpdate.getRowVector(
                                 l2).product(g))
                         self.__wordVectorUpdate.addRowVector(
                             l2,
                             self.__wordVectors.getRowVector(l1).product(g))
                 # Apply the accumulated update to the context word's vector.
                 self.__wordVectors.addRowVector(l1, outputUpdate)
         # Advance to the next sentence position / sentence / iteration.
         currentSentence = iteration.sentenceUpdate(currentSentence)