Exemple #1
0
    def convertInstance(self, instance: Instance):
        """
        Replaces the attributes of the given Instance with its projections onto the stored eigenvectors. The
        continuous attributes are first collected into a Vector, then every attribute is removed from the instance
        and one ContinuousAttribute per eigenvector is added, holding the dot product of the collected Vector
        with that eigenvector.

        PARAMETERS
        ----------
        instance : Instance
            Instance whose attributes are replaced, in place, by the eigenvector projections.
        """
        # Snapshot the continuous values before the instance is emptied.
        continuousValues = Vector(instance.continuousAttributes())
        instance.removeAllAttributes()
        for basisVector in self.__eigenvectors:
            projection = continuousValues.dotProduct(basisVector)
            instance.addAttribute(ContinuousAttribute(projection))
 def __trainCbow(self):
     """
     Main method for training the CBow version of Word2Vec algorithm.

     For every sentence position visited by the Iteration object, the rows of __wordVectors that belong to the
     context words around the current word are summed and divided by their count. That combined vector is used to
     update rows of __wordVectorUpdate, either through hierarchical softmax or through negative sampling
     (selected by the parameter object), and the accumulated correction is finally added to the __wordVectors row
     of every context word. The loop ends when the iteration count reaches the configured number of iterations.
     """
     iteration = Iteration(self.__corpus, self.__parameter)
     # NOTE(review): the first sentence is fetched before the corpus is shuffled below - confirm this is intended.
     currentSentence = self.__corpus.getSentence(
         iteration.getSentenceIndex())
     # outputs: combined context vector; outputUpdate: correction accumulated for the context word rows.
     outputs = Vector()
     outputs.initAllSame(self.__parameter.getLayerSize(), 0.0)
     outputUpdate = Vector()
     outputUpdate.initAllSame(self.__parameter.getLayerSize(), 0)
     # NOTE(review): the argument 1 is presumably a fixed shuffle seed - confirm against shuffleSentences.
     self.__corpus.shuffleSentences(1)
     while iteration.getIterationCount(
     ) < self.__parameter.getNumberOfIterations():
         iteration.alphaUpdate()
         wordIndex = self.__vocabulary.getPosition(
             currentSentence.getWord(iteration.getSentencePosition()))
         currentWord = self.__vocabulary.getWord(wordIndex)
         # Presumably resets both accumulators for the new position - TODO confirm Vector.clear() semantics.
         outputs.clear()
         outputUpdate.clear()
         # b shrinks the context window by the same random amount on both sides for this position.
         b = randrange(self.__parameter.getWindow())
         # cw counts the context words that actually contributed to outputs.
         cw = 0
         for a in range(b, self.__parameter.getWindow() * 2 + 1 - b):
             # c is the sentence position of the candidate context word.
             c = iteration.getSentencePosition(
             ) - self.__parameter.getWindow() + a
             # a == window corresponds to the current word itself, which is excluded from its own context.
             # safeIndex presumably checks that c lies inside the sentence - TODO confirm.
             if a != self.__parameter.getWindow(
             ) and currentSentence.safeIndex(c):
                 lastWordIndex = self.__vocabulary.getPosition(
                     currentSentence.getWord(c))
                 outputs.addVector(
                     self.__wordVectors.getRowVector(lastWordIndex))
                 cw = cw + 1
         # Nothing is updated when no context word was found.
         if cw > 0:
             # Turn the sum of context vectors into their mean.
             outputs.divide(cw)
             if self.__parameter.isHierarchicalSoftMax():
                 # One update per position of the current word's code; getPoint(d) selects the output row.
                 for d in range(currentWord.getCodeLength()):
                     l2 = currentWord.getPoint(d)
                     f = outputs.dotProduct(
                         self.__wordVectorUpdate.getRowVector(l2))
                     # Values outside (-MAX_EXP, MAX_EXP) are skipped; the rest are mapped to a table index.
                     # NOTE(review): __expTable is presumably a precomputed sigmoid table - confirm.
                     if f <= -NeuralNetwork.MAX_EXP or f >= NeuralNetwork.MAX_EXP:
                         continue
                     else:
                         f = self.__expTable[int(
                             (f + NeuralNetwork.MAX_EXP) *
                             (NeuralNetwork.EXP_TABLE_SIZE //
                              NeuralNetwork.MAX_EXP // 2))]
                     # g is the error term scaled by the current learning rate.
                     g = (1 - currentWord.getCode(d) -
                          f) * iteration.getAlpha()
                     # Order matters: the correction is accumulated from the output row BEFORE that row is
                     # itself updated.
                     outputUpdate.addVector(
                         self.__wordVectorUpdate.getRowVector(l2).product(
                             g))
                     self.__wordVectorUpdate.addRowVector(
                         l2, outputs.product(g))
             else:
                 # Negative sampling: d == 0 is the current word with label 1, the remaining draws get label 0.
                 for d in range(self.__parameter.getNegativeSamplingSize() +
                                1):
                     if d == 0:
                         target = wordIndex
                         label = 1
                     else:
                         target = self.__vocabulary.getTableValue(
                             randrange(self.__vocabulary.getTableSize()))
                         # A drawn index of 0 is replaced by a random index in [1, vocabulary size - 1].
                         if target == 0:
                             target = randrange(self.__vocabulary.size() -
                                                1) + 1
                         # The current word must not be used as its own negative sample.
                         if target == wordIndex:
                             continue
                         label = 0
                     l2 = target
                     f = outputs.dotProduct(
                         self.__wordVectorUpdate.getRowVector(l2))
                     g = self.__calculateG(f, iteration.getAlpha(), label)
                     # Same ordering as above: accumulate from the row first, then update the row.
                     outputUpdate.addVector(
                         self.__wordVectorUpdate.getRowVector(l2).product(
                             g))
                     self.__wordVectorUpdate.addRowVector(
                         l2, outputs.product(g))
             # Apply the accumulated correction to every context word that was part of the window
             # (same b, so exactly the same positions as in the first loop).
             for a in range(b, self.__parameter.getWindow() * 2 + 1 - b):
                 c = iteration.getSentencePosition(
                 ) - self.__parameter.getWindow() + a
                 if a != self.__parameter.getWindow(
                 ) and currentSentence.safeIndex(c):
                     lastWordIndex = self.__vocabulary.getPosition(
                         currentSentence.getWord(c))
                     self.__wordVectors.addRowVector(
                         lastWordIndex, outputUpdate)
         # Presumably advances to the next position / sentence and returns the sentence to use - TODO confirm.
         currentSentence = iteration.sentenceUpdate(currentSentence)
Exemple #3
0
class VectorTest(unittest.TestCase):
    """
    Unit tests for the Vector class.

    Fixtures: two 5-element vectors ([2..6] and [8..4]) and two 1000-element vectors (1..1000 ascending and
    descending). setUp runs before every test method, so each test starts from freshly built vectors and does
    not need to undo the in-place modifications it performs.

    Results that go through floating-point division or square roots are compared with assertAlmostEqual
    instead of exact equality, because the last bits of such results depend on the order of operations.
    """

    # Shared, read-only source data; always copied before being handed to a Vector.
    data1 = [2, 3, 4, 5, 6]

    def setUp(self):
        """Build fresh small and large vectors for every test."""
        data2 = [8, 7, 6, 5, 4]
        # Copy the class-level list so that an in-place operation on the vector can never leak into other tests.
        self.smallVector1 = Vector(list(self.data1))
        self.smallVector2 = Vector(data2)
        self.largeVector1 = Vector(list(range(1, 1001)))
        self.largeVector2 = Vector(list(range(1000, 0, -1)))

    def test_Biased(self):
        biased = self.smallVector1.biased()
        self.assertEqual(1, biased.getValue(0))
        self.assertEqual(self.smallVector1.size() + 1, biased.size())

    def test_ElementAdd(self):
        self.smallVector1.add(7)
        self.assertEqual(7, self.smallVector1.getValue(5))
        self.assertEqual(6, self.smallVector1.size())

    def test_Insert(self):
        self.smallVector1.insert(3, 6)
        self.assertEqual(6, self.smallVector1.getValue(3))
        self.assertEqual(6, self.smallVector1.size())

    def test_Remove(self):
        self.smallVector1.remove(2)
        self.assertEqual(5, self.smallVector1.getValue(2))
        self.assertEqual(4, self.smallVector1.size())

    def test_SumOfElementsSmall(self):
        self.assertEqual(20, self.smallVector1.sumOfElements())
        self.assertEqual(30, self.smallVector2.sumOfElements())

    def test_SumOfElementsLarge(self):
        self.assertEqual(500500, self.largeVector1.sumOfElements())
        self.assertEqual(500500, self.largeVector2.sumOfElements())

    def test_MaxIndex(self):
        self.assertEqual(4, self.smallVector1.maxIndex())
        self.assertEqual(0, self.smallVector2.maxIndex())

    def test_Sigmoid(self):
        smallVector3 = Vector(list(self.data1))
        smallVector3.sigmoid()
        self.assertAlmostEqual(0.8807971, smallVector3.getValue(0), 6)
        self.assertAlmostEqual(0.9975274, smallVector3.getValue(4), 6)

    def test_SkipVectorSmall(self):
        smallVector3 = self.smallVector1.skipVector(2, 0)
        self.assertEqual(2, smallVector3.getValue(0))
        self.assertEqual(6, smallVector3.getValue(2))
        smallVector3 = self.smallVector1.skipVector(3, 1)
        self.assertEqual(3, smallVector3.getValue(0))
        self.assertEqual(6, smallVector3.getValue(1))

    def test_SkipVectorLarge(self):
        largeVector3 = self.largeVector1.skipVector(2, 0)
        self.assertEqual(250000, largeVector3.sumOfElements())
        largeVector3 = self.largeVector1.skipVector(5, 3)
        self.assertEqual(100300, largeVector3.sumOfElements())

    def test_VectorAddSmall(self):
        self.smallVector1.addVector(self.smallVector2)
        self.assertEqual(50, self.smallVector1.sumOfElements())

    def test_VectorAddLarge(self):
        self.largeVector1.addVector(self.largeVector2)
        self.assertEqual(1001000, self.largeVector1.sumOfElements())

    def test_SubtractSmall(self):
        self.smallVector1.subtract(self.smallVector2)
        self.assertEqual(-10, self.smallVector1.sumOfElements())

    def test_SubtractLarge(self):
        self.largeVector1.subtract(self.largeVector2)
        self.assertEqual(0, self.largeVector1.sumOfElements())

    def test_DifferenceSmall(self):
        smallVector3 = self.smallVector1.difference(self.smallVector2)
        self.assertEqual(-10, smallVector3.sumOfElements())

    def test_DifferenceLarge(self):
        largeVector3 = self.largeVector1.difference(self.largeVector2)
        self.assertEqual(0, largeVector3.sumOfElements())

    def test_DotProductWithVectorSmall(self):
        dotProduct = self.smallVector1.dotProduct(self.smallVector2)
        self.assertEqual(110, dotProduct)

    def test_DotProductWithVectorLarge(self):
        dotProduct = self.largeVector1.dotProduct(self.largeVector2)
        self.assertEqual(167167000, dotProduct)

    def test_DotProductWithItselfSmall(self):
        dotProduct = self.smallVector1.dotProductWithSelf()
        self.assertEqual(90, dotProduct)

    def test_DotProductWithItselfLarge(self):
        dotProduct = self.largeVector1.dotProductWithSelf()
        self.assertEqual(333833500, dotProduct)

    def test_ElementProductSmall(self):
        smallVector3 = self.smallVector1.elementProduct(self.smallVector2)
        self.assertEqual(110, smallVector3.sumOfElements())

    def test_ElementProductLarge(self):
        largeVector3 = self.largeVector1.elementProduct(self.largeVector2)
        self.assertEqual(167167000, largeVector3.sumOfElements())

    def test_Divide(self):
        self.smallVector1.divide(10.0)
        # 0.2 + 0.3 + ... is not guaranteed to be exactly 2 in binary floating point.
        self.assertAlmostEqual(2, self.smallVector1.sumOfElements(), 6)

    def test_Multiply(self):
        self.smallVector1.multiply(10.0)
        self.assertEqual(200, self.smallVector1.sumOfElements())

    def test_Product(self):
        smallVector3 = self.smallVector1.product(7.0)
        self.assertEqual(140, smallVector3.sumOfElements())

    def test_L1NormalizeSmall(self):
        self.smallVector1.l1Normalize()
        # The normalized elements are fractions; their sum equals 1 only up to rounding.
        self.assertAlmostEqual(1.0, self.smallVector1.sumOfElements(), 6)

    def test_L1NormalizeLarge(self):
        self.largeVector1.l1Normalize()
        self.assertAlmostEqual(1.0, self.largeVector1.sumOfElements(), 6)

    def test_L2NormSmall(self):
        norm = self.smallVector1.l2Norm()
        self.assertAlmostEqual(math.sqrt(90), norm, 6)

    def test_L2NormLarge(self):
        norm = self.largeVector1.l2Norm()
        self.assertAlmostEqual(math.sqrt(333833500), norm, 6)

    def test_cosineSimilaritySmall(self):
        similarity = self.smallVector1.cosineSimilarity(self.smallVector2)
        self.assertAlmostEqual(0.8411910, similarity, 6)

    def test_cosineSimilarityLarge(self):
        similarity = self.largeVector1.cosineSimilarity(self.largeVector2)
        self.assertAlmostEqual(0.5007497, similarity, 6)