def __trainSkipGram(self): """ Main method for training the SkipGram version of Word2Vec algorithm. """ iteration = Iteration(self.__corpus, self.__parameter) currentSentence = self.__corpus.getSentence( iteration.getSentenceIndex()) outputs = Vector() outputs.initAllSame(self.__parameter.getLayerSize(), 0.0) outputUpdate = Vector() outputUpdate.initAllSame(self.__parameter.getLayerSize(), 0) self.__corpus.shuffleSentences(1) while iteration.getIterationCount( ) < self.__parameter.getNumberOfIterations(): iteration.alphaUpdate() wordIndex = self.__vocabulary.getPosition( currentSentence.getWord(iteration.getSentencePosition())) currentWord = self.__vocabulary.getWord(wordIndex) outputs.clear() outputUpdate.clear() b = randrange(self.__parameter.getWindow()) for a in range(b, self.__parameter.getWindow() * 2 + 1 - b): c = iteration.getSentencePosition( ) - self.__parameter.getWindow() + a if a != self.__parameter.getWindow( ) and currentSentence.safeIndex(c): lastWordIndex = self.__vocabulary.getPosition( currentSentence.getWord(c)) l1 = lastWordIndex outputUpdate.clear() if self.__parameter.isHierarchicalSoftMax(): for d in range(currentWord.getCodeLength()): l2 = currentWord.getPoint(d) f = self.__wordVectors.getRowVector(l1).dotProduct( self.__wordVectorUpdate.getRowVector(l2)) if f <= -NeuralNetwork.MAX_EXP or f >= NeuralNetwork.MAX_EXP: continue else: f = self.__expTable[int( (f + NeuralNetwork.MAX_EXP) * (NeuralNetwork.EXP_TABLE_SIZE // NeuralNetwork.MAX_EXP // 2))] g = (1 - currentWord.getCode(d) - f) * iteration.getAlpha() outputUpdate.addVector( self.__wordVectorUpdate.getRowVector( l2).product(g)) self.__wordVectorUpdate.addRowVector( l2, self.__wordVectors.getRowVector(l1).product(g)) else: for d in range( self.__parameter.getNegativeSamplingSize() + 1): if d == 0: target = wordIndex label = 1 else: target = self.__vocabulary.getTableValue( randrange( self.__vocabulary.getTableSize())) if target == 0: target = randrange( self.__vocabulary.size() - 1) + 1 if target == wordIndex: continue label = 0 l2 = target f = self.__wordVectors.getRowVector(l1).dotProduct( self.__wordVectorUpdate.getRowVector(l2)) g = self.__calculateG(f, iteration.getAlpha(), label) outputUpdate.addVector( self.__wordVectorUpdate.getRowVector( l2).product(g)) self.__wordVectorUpdate.addRowVector( l2, self.__wordVectors.getRowVector(l1).product(g)) self.__wordVectors.addRowVector(l1, outputUpdate) currentSentence = iteration.sentenceUpdate(currentSentence)
class VectorTest(unittest.TestCase): data1 = [2, 3, 4, 5, 6] def setUp(self): data2 = [8, 7, 6, 5, 4] self.smallVector1 = Vector(self.data1) self.smallVector2 = Vector(data2) largeData1 = [] for i in range(1, 1001): largeData1.append(i) self.largeVector1 = Vector(largeData1) largeData2 = [] for i in range(1, 1001): largeData2.append(1000 - i + 1) self.largeVector2 = Vector(largeData2) def test_Biased(self): biased = self.smallVector1.biased() self.assertEqual(1, biased.getValue(0)) self.assertEqual(self.smallVector1.size() + 1, biased.size()) def test_ElementAdd(self): self.smallVector1.add(7) self.assertEqual(7, self.smallVector1.getValue(5)) self.assertEqual(6, self.smallVector1.size()) self.smallVector1.remove(5) def test_Insert(self): self.smallVector1.insert(3, 6) self.assertEqual(6, self.smallVector1.getValue(3)) self.assertEqual(6, self.smallVector1.size()) self.smallVector1.remove(3) def test_Remove(self): self.smallVector1.remove(2) self.assertEqual(5, self.smallVector1.getValue(2)) self.assertEqual(4, self.smallVector1.size()) self.smallVector1.insert(2, 4) def test_SumOfElementsSmall(self): self.assertEqual(20, self.smallVector1.sumOfElements()) self.assertEqual(30, self.smallVector2.sumOfElements()) def test_SumOfElementsLarge(self): self.assertEqual(20, self.smallVector1.sumOfElements()) self.assertEqual(30, self.smallVector2.sumOfElements()) self.assertEqual(500500, self.largeVector1.sumOfElements()) self.assertEqual(500500, self.largeVector2.sumOfElements()) def test_MaxIndex(self): self.assertEqual(4, self.smallVector1.maxIndex()) self.assertEqual(0, self.smallVector2.maxIndex()) def test_Sigmoid(self): smallVector3 = Vector(self.data1) smallVector3.sigmoid() self.assertAlmostEqual(0.8807971, smallVector3.getValue(0), 6) self.assertAlmostEqual(0.9975274, smallVector3.getValue(4), 6) def test_SkipVectorSmall(self): smallVector3 = self.smallVector1.skipVector(2, 0) self.assertEqual(2, smallVector3.getValue(0)) self.assertEqual(6, smallVector3.getValue(2)) smallVector3 = self.smallVector1.skipVector(3, 1) self.assertEqual(3, smallVector3.getValue(0)) self.assertEqual(6, smallVector3.getValue(1)) def test_SkipVectorLarge(self): largeVector3 = self.largeVector1.skipVector(2, 0) self.assertEqual(250000, largeVector3.sumOfElements()) largeVector3 = self.largeVector1.skipVector(5, 3) self.assertEqual(100300, largeVector3.sumOfElements()) def test_VectorAddSmall(self): self.smallVector1.addVector(self.smallVector2) self.assertEqual(50, self.smallVector1.sumOfElements()) self.smallVector1.subtract(self.smallVector2) def test_VectorAddLarge(self): self.largeVector1.addVector(self.largeVector2) self.assertEqual(1001000, self.largeVector1.sumOfElements()) self.largeVector1.subtract(self.largeVector2) def test_SubtractSmall(self): self.smallVector1.subtract(self.smallVector2) self.assertEqual(-10, self.smallVector1.sumOfElements()) self.smallVector1.addVector(self.smallVector2) def test_SubtractLarge(self): self.largeVector1.subtract(self.largeVector2) self.assertEqual(0, self.largeVector1.sumOfElements()) self.largeVector1.addVector(self.largeVector2) def test_DifferenceSmall(self): smallVector3 = self.smallVector1.difference(self.smallVector2) self.assertEqual(-10, smallVector3.sumOfElements()) def test_DifferenceLarge(self): largeVector3 = self.largeVector1.difference(self.largeVector2) self.assertEqual(0, largeVector3.sumOfElements()) def test_DotProductWithVectorSmall(self): dotProduct = self.smallVector1.dotProduct(self.smallVector2) self.assertEqual(110, dotProduct) def test_DotProductWithVectorLarge(self): dotProduct = self.largeVector1.dotProduct(self.largeVector2) self.assertEqual(167167000, dotProduct) def test_DotProductWithItselfSmall(self): dotProduct = self.smallVector1.dotProductWithSelf() self.assertEqual(90, dotProduct) def test_DotProductWithItselfLarge(self): dotProduct = self.largeVector1.dotProductWithSelf() self.assertEqual(333833500, dotProduct) def test_ElementProductSmall(self): smallVector3 = self.smallVector1.elementProduct(self.smallVector2) self.assertEqual(110, smallVector3.sumOfElements()) def test_ElementProductLarge(self): largeVector3 = self.largeVector1.elementProduct(self.largeVector2) self.assertEqual(167167000, largeVector3.sumOfElements()) def test_Divide(self): self.smallVector1.divide(10.0) self.assertEqual(2, self.smallVector1.sumOfElements()) self.smallVector1.multiply(10.0) def test_Multiply(self): self.smallVector1.multiply(10.0) self.assertEqual(200, self.smallVector1.sumOfElements()) self.smallVector1.divide(10.0) def test_Product(self): smallVector3 = self.smallVector1.product(7.0) self.assertEqual(140, smallVector3.sumOfElements()) def test_L1NormalizeSmall(self): self.smallVector1.l1Normalize() self.assertEqual(1.0, self.smallVector1.sumOfElements()) self.smallVector1.multiply(20) def test_L1NormalizeLarge(self): self.largeVector1.l1Normalize() self.assertEqual(1.0, self.largeVector1.sumOfElements()) self.largeVector1.multiply(500500) def test_L2NormSmall(self): norm = self.smallVector1.l2Norm() self.assertEqual(norm, math.sqrt(90)) def test_L2NormLarge(self): norm = self.largeVector1.l2Norm() self.assertEqual(norm, math.sqrt(333833500)) def test_cosineSimilaritySmall(self): similarity = self.smallVector1.cosineSimilarity(self.smallVector2) self.assertAlmostEqual(0.8411910, similarity, 6) def test_cosineSimilarityLarge(self): similarity = self.largeVector1.cosineSimilarity(self.largeVector2) self.assertAlmostEqual(0.5007497, similarity, 6)