class NeuralNetwork:
    """
    Word2Vec trainer. Builds a vocabulary from a corpus and learns one vector per vocabulary entry, using either
    the CBow or the SkipGram training loop and, inside each, either hierarchical softmax or negative sampling
    (both choices are read from the parameter object).
    """
    # One row per vocabulary entry; rows are returned as the final word vectors by train().
    __wordVectors: Matrix
    # Second weight matrix of the same shape; rows are indexed by a tree point (hierarchical softmax) or by a
    # target word index (negative sampling).
    __wordVectorUpdate: Matrix
    __vocabulary: Vocabulary
    __parameter: WordToVecParameter
    __corpus: Corpus
    # Precomputed sigmoid values, see __prepareExpTable().
    __expTable: list

    EXP_TABLE_SIZE = 1000
    MAX_EXP = 6

    def __init__(self, corpus: Corpus, parameter: WordToVecParameter):
        """
        Constructor for the NeuralNetwork class. Stores the corpus and the parameters, builds the vocabulary from
        the corpus, creates the word vector matrix (constructed with the bounds -0.5 and 0.5) and the update matrix
        (constructed with sizes only), and prepares the exp table.

        PARAMETERS
        ----------
        corpus : Corpus
            Corpus used to train word vectors using Word2Vec algorithm.
        parameter : WordToVecParameter
            Parameters of the Word2Vec algorithm.
        """
        self.__vocabulary = Vocabulary(corpus)
        self.__parameter = parameter
        self.__corpus = corpus
        self.__wordVectors = Matrix(self.__vocabulary.size(),
                                    self.__parameter.getLayerSize(), -0.5, 0.5)
        self.__wordVectorUpdate = Matrix(self.__vocabulary.size(),
                                         self.__parameter.getLayerSize())
        self.__prepareExpTable()

    def __prepareExpTable(self):
        """
        Constructs the sigmoid lookup table. Entry i holds sigmoid(x) = exp(x) / (exp(x) + 1) for
        x = (2 * i / EXP_TABLE_SIZE - 1) * MAX_EXP, so the table spans [-MAX_EXP, MAX_EXP]. Every one of the
        EXP_TABLE_SIZE + 1 slots is filled, including the last one (x = MAX_EXP).
        """
        size = NeuralNetwork.EXP_TABLE_SIZE
        self.__expTable = [0.0] * (size + 1)
        for i in range(size + 1):
            e = math.exp((i / (size + 0.0) * 2 - 1) * NeuralNetwork.MAX_EXP)
            self.__expTable[i] = e / (e + 1)

    def __sigmoid(self, f: float) -> float:
        """
        Looks up the tabulated sigmoid for f. The caller must make sure that -MAX_EXP <= f <= MAX_EXP.

        PARAMETERS
        ----------
        f : float
            Value whose sigmoid is requested.

        RETURNS
        -------
        float
            Table entry for f.
        """
        # Integer division on purpose: 1000 // 6 // 2 = 83 slots per unit of f, so the largest index that can be
        # produced is (MAX_EXP + MAX_EXP) * 83 = 996, which is always inside the table.
        return self.__expTable[int(
            (f + NeuralNetwork.MAX_EXP) *
            (NeuralNetwork.EXP_TABLE_SIZE // NeuralNetwork.MAX_EXP // 2))]

    def train(self) -> VectorizedDictionary:
        """
        Main method for training the Word2Vec algorithm. Depending on the training parameter, CBow or SkipGram
        algorithm is applied. Afterwards every vocabulary word is paired with its row of the word vector matrix.

        RETURNS
        -------
        VectorizedDictionary
            Dictionary of word vectors.
        """
        result = VectorizedDictionary()
        if self.__parameter.isCbow():
            self.__trainCbow()
        else:
            self.__trainSkipGram()
        for i in range(self.__vocabulary.size()):
            result.addWord(
                VectorizedWord(
                    self.__vocabulary.getWord(i).getName(),
                    self.__wordVectors.getRowVector(i)))
        return result

    def __calculateG(self, f: float, alpha: float, label: float) -> float:
        """
        Calculates G value in the Word2Vec algorithm: (label - sigmoid(f)) * alpha, where the sigmoid is treated as
        exactly 1 above MAX_EXP and exactly 0 below -MAX_EXP, and is read from the exp table in between.

        PARAMETERS
        ----------
        f : float
            F value.
        alpha : float
            Learning rate alpha.
        label : float
            Label of the instance.

        RETURNS
        -------
        float
            Calculated G value.
        """
        if f > NeuralNetwork.MAX_EXP:
            return (label - 1) * alpha
        if f < -NeuralNetwork.MAX_EXP:
            return label * alpha
        return (label - self.__sigmoid(f)) * alpha

    def __contextWordIndices(self, sentence, position: int, b: int):
        """
        Yields, in sentence order, the vocabulary index of every context word of the word at the given position.
        The offsets run over range(b, 2 * window + 1 - b); the centre offset (a == window) and offsets that
        sentence.safeIndex() rejects are skipped.

        PARAMETERS
        ----------
        sentence
            Sentence currently being processed.
        position : int
            Position of the centre word inside the sentence.
        b : int
            Random shrink amount of the window for this centre word.
        """
        window = self.__parameter.getWindow()
        for a in range(b, window * 2 + 1 - b):
            c = position - window + a
            if a != window and sentence.safeIndex(c):
                yield self.__vocabulary.getPosition(sentence.getWord(c))

    def __hierarchicalSoftMaxStep(self, hidden: Vector, currentWord,
                                  alpha: float, outputUpdate: Vector):
        """
        Performs one hierarchical softmax update for a single hidden vector. For every code position of the
        current word it accumulates the correction for the hidden vector in outputUpdate and then adjusts the
        corresponding row of the update matrix.

        PARAMETERS
        ----------
        hidden : Vector
            Hidden layer vector (averaged context in CBow, context word vector in SkipGram).
        currentWord
            Vocabulary entry of the centre word; supplies code length, points and codes.
        alpha : float
            Current learning rate.
        outputUpdate : Vector
            Accumulator that receives the correction for the hidden vector (modified in place).
        """
        for d in range(currentWord.getCodeLength()):
            l2 = currentWord.getPoint(d)
            f = hidden.dotProduct(self.__wordVectorUpdate.getRowVector(l2))
            # Outside the tabulated range nothing is updated for this code position.
            if f <= -NeuralNetwork.MAX_EXP or f >= NeuralNetwork.MAX_EXP:
                continue
            g = (1 - currentWord.getCode(d) - self.__sigmoid(f)) * alpha
            # Order matters: the correction is computed from the row before that row itself is changed.
            outputUpdate.addVector(
                self.__wordVectorUpdate.getRowVector(l2).product(g))
            self.__wordVectorUpdate.addRowVector(l2, hidden.product(g))

    def __negativeSamplingStep(self, hidden: Vector, wordIndex: int,
                               alpha: float, outputUpdate: Vector):
        """
        Performs one negative sampling update for a single hidden vector: one positive example (the centre word,
        label 1) followed by getNegativeSamplingSize() sampled examples with label 0.

        PARAMETERS
        ----------
        hidden : Vector
            Hidden layer vector (averaged context in CBow, context word vector in SkipGram).
        wordIndex : int
            Vocabulary index of the centre word.
        alpha : float
            Current learning rate.
        outputUpdate : Vector
            Accumulator that receives the correction for the hidden vector (modified in place).
        """
        for d in range(self.__parameter.getNegativeSamplingSize() + 1):
            if d == 0:
                target = wordIndex
                label = 1
            else:
                target = self.__vocabulary.getTableValue(
                    randrange(self.__vocabulary.getTableSize()))
                # Index 0 is never used as a negative sample; redraw uniformly from 1 .. size - 1 instead.
                if target == 0:
                    target = randrange(self.__vocabulary.size() - 1) + 1
                # A sample equal to the centre word is dropped, not redrawn.
                if target == wordIndex:
                    continue
                label = 0
            f = hidden.dotProduct(
                self.__wordVectorUpdate.getRowVector(target))
            g = self.__calculateG(f, alpha, label)
            # Order matters: the correction is computed from the row before that row itself is changed.
            outputUpdate.addVector(
                self.__wordVectorUpdate.getRowVector(target).product(g))
            self.__wordVectorUpdate.addRowVector(target, hidden.product(g))

    def __trainCbow(self):
        """
        Main method for training the CBow version of Word2Vec algorithm. For every centre word the vectors of its
        context words are averaged, one hierarchical softmax or negative sampling step is run on that average, and
        the resulting correction is added to the vector of every context word.
        """
        iteration = Iteration(self.__corpus, self.__parameter)
        currentSentence = self.__corpus.getSentence(
            iteration.getSentenceIndex())
        outputs = Vector()
        outputs.initAllSame(self.__parameter.getLayerSize(), 0.0)
        outputUpdate = Vector()
        outputUpdate.initAllSame(self.__parameter.getLayerSize(), 0)
        self.__corpus.shuffleSentences(1)
        while iteration.getIterationCount(
        ) < self.__parameter.getNumberOfIterations():
            iteration.alphaUpdate()
            position = iteration.getSentencePosition()
            wordIndex = self.__vocabulary.getPosition(
                currentSentence.getWord(position))
            currentWord = self.__vocabulary.getWord(wordIndex)
            outputs.clear()
            outputUpdate.clear()
            b = randrange(self.__parameter.getWindow())
            cw = 0
            for lastWordIndex in self.__contextWordIndices(
                    currentSentence, position, b):
                outputs.addVector(
                    self.__wordVectors.getRowVector(lastWordIndex))
                cw = cw + 1
            if cw > 0:
                outputs.divide(cw)
                if self.__parameter.isHierarchicalSoftMax():
                    self.__hierarchicalSoftMaxStep(
                        outputs, currentWord, iteration.getAlpha(),
                        outputUpdate)
                else:
                    self.__negativeSamplingStep(
                        outputs, wordIndex, iteration.getAlpha(),
                        outputUpdate)
                # The same correction is applied to every word that contributed to the average.
                for lastWordIndex in self.__contextWordIndices(
                        currentSentence, position, b):
                    self.__wordVectors.addRowVector(
                        lastWordIndex, outputUpdate)
            # NOTE(review): sentenceUpdate presumably advances the position / sentence and the iteration
            # count - confirm against Iteration.
            currentSentence = iteration.sentenceUpdate(currentSentence)

    def __trainSkipGram(self):
        """
        Main method for training the SkipGram version of Word2Vec algorithm. For every (centre word, context word)
        pair one hierarchical softmax or negative sampling step is run on the context word's vector, and the
        resulting correction is added to that vector.
        """
        iteration = Iteration(self.__corpus, self.__parameter)
        currentSentence = self.__corpus.getSentence(
            iteration.getSentenceIndex())
        outputUpdate = Vector()
        outputUpdate.initAllSame(self.__parameter.getLayerSize(), 0)
        self.__corpus.shuffleSentences(1)
        while iteration.getIterationCount(
        ) < self.__parameter.getNumberOfIterations():
            iteration.alphaUpdate()
            position = iteration.getSentencePosition()
            wordIndex = self.__vocabulary.getPosition(
                currentSentence.getWord(position))
            currentWord = self.__vocabulary.getWord(wordIndex)
            outputUpdate.clear()
            b = randrange(self.__parameter.getWindow())
            for l1 in self.__contextWordIndices(currentSentence, position,
                                                b):
                outputUpdate.clear()
                # Row l1 of the word vector matrix is not modified until after the step below.
                hidden = self.__wordVectors.getRowVector(l1)
                if self.__parameter.isHierarchicalSoftMax():
                    self.__hierarchicalSoftMaxStep(
                        hidden, currentWord, iteration.getAlpha(),
                        outputUpdate)
                else:
                    self.__negativeSamplingStep(
                        hidden, wordIndex, iteration.getAlpha(),
                        outputUpdate)
                self.__wordVectors.addRowVector(l1, outputUpdate)
            # NOTE(review): sentenceUpdate presumably advances the position / sentence and the iteration
            # count - confirm against Iteration.
            currentSentence = iteration.sentenceUpdate(currentSentence)
# Example #2
# 0
class MatrixTest(unittest.TestCase):
    """
    Unit tests for Matrix. Most checks compare the sum of all elements (or the trace / a row or column sum) of a
    result with a value known in advance for all-ones, identity and random fixtures.
    """

    def _allOnes(self, size: int) -> Matrix:
        """Returns a size x size matrix whose every element has been set to 1.0."""
        matrix = Matrix(size, size)
        for i in range(size):
            for j in range(size):
                matrix.setValue(i, j, 1.0)
        return matrix

    def setUp(self):
        """
        Builds the fixtures. unittest runs this before every test method, so each test starts from fresh objects:
        all-ones matrices of size 3, 100 and 1000, vectors of matching length, one random 100 x 100 matrix together
        with its element sum, and a matrix built from the single argument 100 that the tests treat as identity.
        """
        self.small = self._allOnes(3)
        self.v = Vector(3, 1.0)
        self.large = self._allOnes(1000)
        self.medium = self._allOnes(100)
        self.V = Vector(1000, 1.0)
        self.vr = Vector(100, 1.0)
        # NOTE(review): arguments are presumably (rows, columns, min, max, seed) - confirm against Matrix.
        self.random = Matrix(100, 100, 1, 10, 1)
        self.originalSum = self.random.sumOfElements()
        self.identity = Matrix(100)

    def test_ColumnWiseNormalize(self):
        """After column-wise normalisation the element sum equals the number of columns."""
        mClone = self.small.clone()
        mClone.columnWiseNormalize()
        self.assertEqual(3, mClone.sumOfElements())
        MClone = self.large.clone()
        MClone.columnWiseNormalize()
        self.assertAlmostEqual(1000, MClone.sumOfElements(), 3)
        self.identity.columnWiseNormalize()
        self.assertEqual(100, self.identity.sumOfElements())

    def test_MultiplyWithConstant(self):
        """Multiplying by a constant scales the element sum by that constant."""
        self.small.multiplyWithConstant(4)
        self.assertEqual(36, self.small.sumOfElements())
        self.small.divideByConstant(4)
        self.large.multiplyWithConstant(1.001)
        self.assertAlmostEqual(1001000, self.large.sumOfElements(), 3)
        self.large.divideByConstant(1.001)
        self.random.multiplyWithConstant(3.6)
        self.assertAlmostEqual(self.originalSum * 3.6,
                               self.random.sumOfElements(), 4)
        self.random.divideByConstant(3.6)

    def test_DivideByConstant(self):
        """Dividing by a constant divides the element sum by that constant."""
        self.small.divideByConstant(4)
        self.assertEqual(2.25, self.small.sumOfElements())
        self.small.multiplyWithConstant(4)
        self.large.divideByConstant(10)
        self.assertAlmostEqual(100000, self.large.sumOfElements(), 3)
        self.large.multiplyWithConstant(10)
        self.random.divideByConstant(3.6)
        self.assertAlmostEqual(self.originalSum / 3.6,
                               self.random.sumOfElements(), 4)
        self.random.multiplyWithConstant(3.6)

    def test_Add(self):
        """Adding the identity fixture raises the element sum by 100."""
        self.random.add(self.identity)
        self.assertAlmostEqual(self.originalSum + 100,
                               self.random.sumOfElements(), 4)
        self.random.subtract(self.identity)

    def test_AddVector(self):
        """Adding a vector of 1000 ones to one row raises the element sum by 1000."""
        self.large.addRowVector(4, self.V)
        # The third positional argument of assertEqual is the failure message, not a tolerance, so the
        # comparison is written as the exact equality it always was.
        self.assertEqual(1001000, self.large.sumOfElements())
        self.V.multiply(-1.0)
        self.large.addRowVector(4, self.V)
        self.V.multiply(-1.0)

    def test_Subtract(self):
        """Subtracting the identity fixture lowers the element sum by 100."""
        self.random.subtract(self.identity)
        self.assertAlmostEqual(self.originalSum - 100,
                               self.random.sumOfElements(), 4)
        self.random.add(self.identity)

    def test_MultiplyWithVectorFromLeft(self):
        """Multiplying with an all-ones vector from the left keeps the total equal to the matrix element sum."""
        result = self.small.multiplyWithVectorFromLeft(self.v)
        self.assertEqual(9, result.sumOfElements())
        result = self.large.multiplyWithVectorFromLeft(self.V)
        self.assertEqual(1000000, result.sumOfElements())
        result = self.random.multiplyWithVectorFromLeft(self.vr)
        self.assertAlmostEqual(self.originalSum, result.sumOfElements(), 4)

    def test_MultiplyWithVectorFromRight(self):
        """Multiplying with an all-ones vector from the right keeps the total equal to the matrix element sum."""
        result = self.small.multiplyWithVectorFromRight(self.v)
        self.assertEqual(9, result.sumOfElements())
        result = self.large.multiplyWithVectorFromRight(self.V)
        self.assertEqual(1000000, result.sumOfElements())
        result = self.random.multiplyWithVectorFromRight(self.vr)
        self.assertAlmostEqual(self.originalSum, result.sumOfElements(), 4)

    def test_ColumnSum(self):
        """Every column of the fixtures has the same sum, so a randomly chosen column is checked."""
        self.assertEqual(3, self.small.columnSum(randrange(3)))
        self.assertEqual(1000, self.large.columnSum(randrange(1000)))
        self.assertEqual(1, self.identity.columnSum(randrange(100)))

    def test_SumOfRows(self):
        """The elements of sumOfRows() add up to the element sum of the matrix."""
        self.assertEqual(9, self.small.sumOfRows().sumOfElements())
        self.assertEqual(1000000, self.large.sumOfRows().sumOfElements())
        self.assertEqual(100, self.identity.sumOfRows().sumOfElements())
        self.assertAlmostEqual(self.originalSum,
                               self.random.sumOfRows().sumOfElements(), 3)

    def test_RowSum(self):
        """Every row of the fixtures has the same sum, so a randomly chosen row is checked."""
        self.assertEqual(3, self.small.rowSum(randrange(3)))
        self.assertEqual(1000, self.large.rowSum(randrange(1000)))
        self.assertEqual(1, self.identity.rowSum(randrange(100)))

    def test_Multiply(self):
        """Checks matrix products of all-ones matrices and products with the identity fixture on either side."""
        result = self.small.multiply(self.small)
        self.assertEqual(27, result.sumOfElements())
        result = self.medium.multiply(self.medium)
        self.assertEqual(1000000.0, result.sumOfElements())
        result = self.random.multiply(self.identity)
        self.assertEqual(self.originalSum, result.sumOfElements())
        result = self.identity.multiply(self.random)
        self.assertEqual(self.originalSum, result.sumOfElements())

    def test_ElementProduct(self):
        """Element-wise product with the identity fixture leaves only the diagonal, so sum equals trace."""
        result = self.small.elementProduct(self.small)
        self.assertEqual(9, result.sumOfElements())
        result = self.large.elementProduct(self.large)
        self.assertEqual(1000000, result.sumOfElements())
        result = self.random.elementProduct(self.identity)
        self.assertEqual(result.trace(), result.sumOfElements())

    def test_SumOfElements(self):
        """Checks sumOfElements() on every fixture."""
        self.assertEqual(9, self.small.sumOfElements())
        self.assertEqual(1000000, self.large.sumOfElements())
        self.assertEqual(100, self.identity.sumOfElements())
        self.assertEqual(self.originalSum, self.random.sumOfElements())

    def test_Trace(self):
        """Checks trace() on the all-ones and identity fixtures."""
        self.assertEqual(3, self.small.trace())
        self.assertEqual(1000, self.large.trace())
        self.assertEqual(100, self.identity.trace())

    def test_Transpose(self):
        """Transposing does not change the element sum."""
        self.assertEqual(9, self.small.transpose().sumOfElements())
        self.assertEqual(1000000, self.large.transpose().sumOfElements())
        self.assertEqual(100, self.identity.transpose().sumOfElements())
        self.assertAlmostEqual(self.originalSum,
                               self.random.transpose().sumOfElements(), 3)

    def test_IsSymmetric(self):
        """All-ones and identity fixtures are symmetric; the random fixture is expected not to be."""
        self.assertTrue(self.small.isSymmetric())
        self.assertTrue(self.large.isSymmetric())
        self.assertTrue(self.identity.isSymmetric())
        self.assertFalse(self.random.isSymmetric())

    def test_Determinant(self):
        """All-ones matrices have determinant 0; the identity fixture has determinant 1."""
        self.assertEqual(0, self.small.determinant())
        self.assertEqual(0, self.large.determinant())
        self.assertEqual(1, self.identity.determinant())

    def test_Inverse(self):
        """Inverting the identity keeps its sum; inverting the random fixture twice restores its sum."""
        self.identity.inverse()
        self.assertEqual(100, self.identity.sumOfElements())
        self.random.inverse()
        self.random.inverse()
        self.assertAlmostEqual(self.originalSum, self.random.sumOfElements(),
                               5)

    def test_Characteristics(self):
        """
        Checks how many items characteristics() returns for three fixtures.
        NOTE(review): the items are presumably eigenvectors - confirm against Matrix.
        """
        vectors = self.small.characteristics()
        self.assertEqual(2, len(vectors))
        vectors = self.identity.characteristics()
        self.assertEqual(100, len(vectors))
        vectors = self.medium.characteristics()
        self.assertEqual(46, len(vectors))