class NeuralNetwork:
    """
    Word2Vec trainer. Holds two weight matrices (the word vectors and the
    output-side update matrix), a vocabulary built from the corpus and a
    precomputed sigmoid lookup table, and trains with either the CBow or the
    SkipGram algorithm depending on the given parameters.
    """

    __wordVectors: Matrix
    __wordVectorUpdate: Matrix
    __vocabulary: Vocabulary
    __parameter: WordToVecParameter
    __corpus: Corpus
    __expTable: list

    # Number of buckets in the sigmoid lookup table.
    EXP_TABLE_SIZE = 1000
    # The table covers inputs in [-MAX_EXP, MAX_EXP).
    MAX_EXP = 6

    def __init__(self, corpus: Corpus, parameter: WordToVecParameter):
        """
        Constructor for the NeuralNetwork class. Gets corpus and network
        parameters as input and sets the corresponding parameters first.
        After that, initializes the network with random weights between -0.5
        and 0.5. Constructs vector update matrix and prepares the exp table.

        PARAMETERS
        ----------
        corpus : Corpus
            Corpus used to train word vectors using Word2Vec algorithm.
        parameter : WordToVecParameter
            Parameters of the Word2Vec algorithm.
        """
        self.__vocabulary = Vocabulary(corpus)
        self.__parameter = parameter
        self.__corpus = corpus
        self.__wordVectors = Matrix(self.__vocabulary.size(),
                                    self.__parameter.getLayerSize(),
                                    -0.5,
                                    0.5)
        self.__wordVectorUpdate = Matrix(self.__vocabulary.size(),
                                         self.__parameter.getLayerSize())
        self.__prepareExpTable()

    def __prepareExpTable(self):
        """
        Constructs the fast exponentiation table. Instead of taking exponent
        at each time, the algorithm will lookup the table. Entry i holds
        sigmoid(x) = e^x / (e^x + 1) for x = (2 * i / EXP_TABLE_SIZE - 1) *
        MAX_EXP. The table has EXP_TABLE_SIZE + 1 slots; the last one is left
        at 0.0.
        """
        self.__expTable = [0.0] * (NeuralNetwork.EXP_TABLE_SIZE + 1)
        for i in range(NeuralNetwork.EXP_TABLE_SIZE):
            self.__expTable[i] = math.exp(
                (i / (NeuralNetwork.EXP_TABLE_SIZE + 0.0) * 2 - 1) * NeuralNetwork.MAX_EXP)
            self.__expTable[i] = self.__expTable[i] / (self.__expTable[i] + 1)

    def __sigmoid(self, f: float) -> float:
        """
        Looks up the precomputed sigmoid value for f in the exp table.
        Callers are expected to pass -MAX_EXP <= f <= MAX_EXP.

        PARAMETERS
        ----------
        f : float
            Input of the sigmoid function.

        RETURNS
        -------
        float
            Table entry approximating sigmoid(f).
        """
        # Integer scale factor (EXP_TABLE_SIZE // MAX_EXP // 2 == 83 with the
        # class constants); kept as integer division so the bucket selection
        # is unchanged.
        scale = NeuralNetwork.EXP_TABLE_SIZE // NeuralNetwork.MAX_EXP // 2
        return self.__expTable[int((f + NeuralNetwork.MAX_EXP) * scale)]

    def train(self) -> VectorizedDictionary:
        """
        Main method for training the Word2Vec algorithm. Depending on the
        training parameter, CBow or SkipGram algorithm is applied.

        RETURNS
        -------
        VectorizedDictionary
            Dictionary of word vectors.
        """
        result = VectorizedDictionary()
        if self.__parameter.isCbow():
            self.__trainCbow()
        else:
            self.__trainSkipGram()
        # Pair every vocabulary word with its trained row of the word vector
        # matrix.
        for i in range(self.__vocabulary.size()):
            result.addWord(
                VectorizedWord(self.__vocabulary.getWord(i).getName(),
                               self.__wordVectors.getRowVector(i)))
        return result

    def __calculateG(self, f: float, alpha: float, label: float) -> float:
        """
        Calculates G value in the Word2Vec algorithm.

        PARAMETERS
        ----------
        f : float
            F value.
        alpha : float
            Learning rate alpha.
        label : float
            Label of the instance.

        RETURNS
        -------
        float
            Calculated G value.
        """
        # Outside the table range the sigmoid is treated as saturated:
        # 1 above MAX_EXP, 0 below -MAX_EXP.
        if f > NeuralNetwork.MAX_EXP:
            return (label - 1) * alpha
        if f < -NeuralNetwork.MAX_EXP:
            return label * alpha
        return (label - self.__sigmoid(f)) * alpha

    def __trainCbow(self):
        """
        Main method for training the CBow version of Word2Vec algorithm.
        """
        iteration = Iteration(self.__corpus, self.__parameter)
        currentSentence = self.__corpus.getSentence(iteration.getSentenceIndex())
        outputs = Vector()
        outputs.initAllSame(self.__parameter.getLayerSize(), 0.0)
        outputUpdate = Vector()
        outputUpdate.initAllSame(self.__parameter.getLayerSize(), 0)
        self.__corpus.shuffleSentences(1)
        while iteration.getIterationCount() < self.__parameter.getNumberOfIterations():
            iteration.alphaUpdate()
            wordIndex = self.__vocabulary.getPosition(
                currentSentence.getWord(iteration.getSentencePosition()))
            currentWord = self.__vocabulary.getWord(wordIndex)
            outputs.clear()
            outputUpdate.clear()
            # b randomly narrows the context window on both sides.
            b = randrange(self.__parameter.getWindow())
            cw = 0
            # Sum the vectors of the context words; a == window is the
            # current word itself and is skipped.
            for a in range(b, self.__parameter.getWindow() * 2 + 1 - b):
                c = iteration.getSentencePosition() - self.__parameter.getWindow() + a
                if a != self.__parameter.getWindow() and currentSentence.safeIndex(c):
                    lastWordIndex = self.__vocabulary.getPosition(currentSentence.getWord(c))
                    outputs.addVector(self.__wordVectors.getRowVector(lastWordIndex))
                    cw = cw + 1
            if cw > 0:
                # Average of the context vectors.
                outputs.divide(cw)
                if self.__parameter.isHierarchicalSoftMax():
                    for d in range(currentWord.getCodeLength()):
                        l2 = currentWord.getPoint(d)
                        f = outputs.dotProduct(self.__wordVectorUpdate.getRowVector(l2))
                        # No update when f falls outside the table range.
                        if f <= -NeuralNetwork.MAX_EXP or f >= NeuralNetwork.MAX_EXP:
                            continue
                        f = self.__sigmoid(f)
                        g = (1 - currentWord.getCode(d) - f) * iteration.getAlpha()
                        # Accumulate the update for the context words before
                        # the output row itself is modified.
                        outputUpdate.addVector(
                            self.__wordVectorUpdate.getRowVector(l2).product(g))
                        self.__wordVectorUpdate.addRowVector(l2, outputs.product(g))
                else:
                    for d in range(self.__parameter.getNegativeSamplingSize() + 1):
                        if d == 0:
                            # First sample is the current word (label 1).
                            target = wordIndex
                            label = 1
                        else:
                            # Remaining samples are drawn from the vocabulary
                            # table (label 0).
                            target = self.__vocabulary.getTableValue(
                                randrange(self.__vocabulary.getTableSize()))
                            # Index 0 is never used as a negative sample; it
                            # is re-drawn from 1 .. size - 1.
                            if target == 0:
                                target = randrange(self.__vocabulary.size() - 1) + 1
                            if target == wordIndex:
                                continue
                            label = 0
                        l2 = target
                        f = outputs.dotProduct(self.__wordVectorUpdate.getRowVector(l2))
                        g = self.__calculateG(f, iteration.getAlpha(), label)
                        outputUpdate.addVector(
                            self.__wordVectorUpdate.getRowVector(l2).product(g))
                        self.__wordVectorUpdate.addRowVector(l2, outputs.product(g))
                # Apply the accumulated update to every context word vector.
                for a in range(b, self.__parameter.getWindow() * 2 + 1 - b):
                    c = iteration.getSentencePosition() - self.__parameter.getWindow() + a
                    if a != self.__parameter.getWindow() and currentSentence.safeIndex(c):
                        lastWordIndex = self.__vocabulary.getPosition(
                            currentSentence.getWord(c))
                        self.__wordVectors.addRowVector(lastWordIndex, outputUpdate)
            currentSentence = iteration.sentenceUpdate(currentSentence)

    def __trainSkipGram(self):
        """
        Main method for training the SkipGram version of Word2Vec algorithm.
        """
        iteration = Iteration(self.__corpus, self.__parameter)
        currentSentence = self.__corpus.getSentence(iteration.getSentenceIndex())
        # NOTE(review): outputs is only cleared, never read, in this method.
        outputs = Vector()
        outputs.initAllSame(self.__parameter.getLayerSize(), 0.0)
        outputUpdate = Vector()
        outputUpdate.initAllSame(self.__parameter.getLayerSize(), 0)
        self.__corpus.shuffleSentences(1)
        while iteration.getIterationCount() < self.__parameter.getNumberOfIterations():
            iteration.alphaUpdate()
            wordIndex = self.__vocabulary.getPosition(
                currentSentence.getWord(iteration.getSentencePosition()))
            currentWord = self.__vocabulary.getWord(wordIndex)
            outputs.clear()
            outputUpdate.clear()
            # b randomly narrows the context window on both sides.
            b = randrange(self.__parameter.getWindow())
            for a in range(b, self.__parameter.getWindow() * 2 + 1 - b):
                c = iteration.getSentencePosition() - self.__parameter.getWindow() + a
                # a == window is the current word itself and is skipped.
                if a != self.__parameter.getWindow() and currentSentence.safeIndex(c):
                    lastWordIndex = self.__vocabulary.getPosition(currentSentence.getWord(c))
                    l1 = lastWordIndex
                    # Each context word gets its own accumulated update.
                    outputUpdate.clear()
                    if self.__parameter.isHierarchicalSoftMax():
                        for d in range(currentWord.getCodeLength()):
                            l2 = currentWord.getPoint(d)
                            f = self.__wordVectors.getRowVector(l1).dotProduct(
                                self.__wordVectorUpdate.getRowVector(l2))
                            # No update when f falls outside the table range.
                            if f <= -NeuralNetwork.MAX_EXP or f >= NeuralNetwork.MAX_EXP:
                                continue
                            f = self.__sigmoid(f)
                            g = (1 - currentWord.getCode(d) - f) * iteration.getAlpha()
                            outputUpdate.addVector(
                                self.__wordVectorUpdate.getRowVector(l2).product(g))
                            self.__wordVectorUpdate.addRowVector(
                                l2, self.__wordVectors.getRowVector(l1).product(g))
                    else:
                        for d in range(self.__parameter.getNegativeSamplingSize() + 1):
                            if d == 0:
                                # First sample is the current word (label 1).
                                target = wordIndex
                                label = 1
                            else:
                                # Remaining samples are drawn from the
                                # vocabulary table (label 0).
                                target = self.__vocabulary.getTableValue(
                                    randrange(self.__vocabulary.getTableSize()))
                                # Index 0 is never used as a negative sample;
                                # it is re-drawn from 1 .. size - 1.
                                if target == 0:
                                    target = randrange(self.__vocabulary.size() - 1) + 1
                                if target == wordIndex:
                                    continue
                                label = 0
                            l2 = target
                            f = self.__wordVectors.getRowVector(l1).dotProduct(
                                self.__wordVectorUpdate.getRowVector(l2))
                            g = self.__calculateG(f, iteration.getAlpha(), label)
                            outputUpdate.addVector(
                                self.__wordVectorUpdate.getRowVector(l2).product(g))
                            self.__wordVectorUpdate.addRowVector(
                                l2, self.__wordVectors.getRowVector(l1).product(g))
                    self.__wordVectors.addRowVector(l1, outputUpdate)
            currentSentence = iteration.sentenceUpdate(currentSentence)
class MatrixTest(unittest.TestCase):
    """
    Unit tests for the Matrix class, run against all-ones matrices of several
    sizes, a randomly filled matrix and a matrix built with Matrix(100).
    """

    def setUp(self):
        """
        Builds fresh fixtures before every test: all-ones matrices of size
        3x3, 100x100 and 1000x1000, matching all-ones vectors, a 100x100
        matrix built with Matrix(100, 100, 1, 10, 1) and Matrix(100).
        """
        self.small = Matrix(3, 3)
        for i in range(3):
            for j in range(3):
                self.small.setValue(i, j, 1.0)
        self.v = Vector(3, 1.0)
        self.large = Matrix(1000, 1000)
        for i in range(1000):
            for j in range(1000):
                self.large.setValue(i, j, 1.0)
        self.medium = Matrix(100, 100)
        for i in range(100):
            for j in range(100):
                self.medium.setValue(i, j, 1.0)
        self.V = Vector(1000, 1.0)
        self.vr = Vector(100, 1.0)
        # presumably (rows, cols, min, max, seed) - TODO confirm against Matrix
        self.random = Matrix(100, 100, 1, 10, 1)
        self.originalSum = self.random.sumOfElements()
        # presumably the 100x100 identity matrix; the tests below rely on its
        # element sum and trace being 100 and its determinant being 1
        self.identity = Matrix(100)

    def test_ColumnWiseNormalize(self):
        """After column-wise normalization the element sum equals the column count."""
        mClone = self.small.clone()
        mClone.columnWiseNormalize()
        self.assertEqual(3, mClone.sumOfElements())
        MClone = self.large.clone()
        MClone.columnWiseNormalize()
        self.assertAlmostEqual(1000, MClone.sumOfElements(), 3)
        self.identity.columnWiseNormalize()
        self.assertEqual(100, self.identity.sumOfElements())

    def test_MultiplyWithConstant(self):
        """Multiplying by a constant scales the element sum by that constant."""
        self.small.multiplyWithConstant(4)
        self.assertEqual(36, self.small.sumOfElements())
        self.small.divideByConstant(4)
        self.large.multiplyWithConstant(1.001)
        self.assertAlmostEqual(1001000, self.large.sumOfElements(), 3)
        self.large.divideByConstant(1.001)
        self.random.multiplyWithConstant(3.6)
        self.assertAlmostEqual(self.originalSum * 3.6, self.random.sumOfElements(), 4)
        self.random.divideByConstant(3.6)

    def test_DivideByConstant(self):
        """Dividing by a constant divides the element sum by that constant."""
        self.small.divideByConstant(4)
        self.assertEqual(2.25, self.small.sumOfElements())
        self.small.multiplyWithConstant(4)
        self.large.divideByConstant(10)
        self.assertAlmostEqual(100000, self.large.sumOfElements(), 3)
        self.large.multiplyWithConstant(10)
        self.random.divideByConstant(3.6)
        self.assertAlmostEqual(self.originalSum / 3.6, self.random.sumOfElements(), 4)
        self.random.multiplyWithConstant(3.6)

    def test_Add(self):
        """Adding the identity fixture raises the element sum by 100."""
        self.random.add(self.identity)
        self.assertAlmostEqual(self.originalSum + 100, self.random.sumOfElements(), 4)
        self.random.subtract(self.identity)

    def test_AddVector(self):
        """Adding an all-ones vector to one row raises the element sum by 1000."""
        self.large.addRowVector(4, self.V)
        # Exact comparison: assertEqual's third positional parameter is the
        # failure message, not a tolerance, so no delta is passed here.
        self.assertEqual(1001000, self.large.sumOfElements())
        self.V.multiply(-1.0)
        self.large.addRowVector(4, self.V)
        self.V.multiply(-1.0)

    def test_Subtract(self):
        """Subtracting the identity fixture lowers the element sum by 100."""
        self.random.subtract(self.identity)
        self.assertAlmostEqual(self.originalSum - 100, self.random.sumOfElements(), 4)
        self.random.add(self.identity)

    def test_MultiplyWithVectorFromLeft(self):
        """An all-ones vector times a matrix yields a vector summing to the matrix sum."""
        result = self.small.multiplyWithVectorFromLeft(self.v)
        self.assertEqual(9, result.sumOfElements())
        result = self.large.multiplyWithVectorFromLeft(self.V)
        self.assertEqual(1000000, result.sumOfElements())
        result = self.random.multiplyWithVectorFromLeft(self.vr)
        self.assertAlmostEqual(self.originalSum, result.sumOfElements(), 4)

    def test_MultiplyWithVectorFromRight(self):
        """A matrix times an all-ones vector yields a vector summing to the matrix sum."""
        result = self.small.multiplyWithVectorFromRight(self.v)
        self.assertEqual(9, result.sumOfElements())
        result = self.large.multiplyWithVectorFromRight(self.V)
        self.assertEqual(1000000, result.sumOfElements())
        result = self.random.multiplyWithVectorFromRight(self.vr)
        self.assertAlmostEqual(self.originalSum, result.sumOfElements(), 4)

    def test_ColumnSum(self):
        """Every column of the fixtures has the same sum, so any random column works."""
        self.assertEqual(3, self.small.columnSum(randrange(3)))
        self.assertEqual(1000, self.large.columnSum(randrange(1000)))
        self.assertEqual(1, self.identity.columnSum(randrange(100)))

    def test_SumOfRows(self):
        """The vector returned by sumOfRows sums to the matrix element sum."""
        self.assertEqual(9, self.small.sumOfRows().sumOfElements())
        self.assertEqual(1000000, self.large.sumOfRows().sumOfElements())
        self.assertEqual(100, self.identity.sumOfRows().sumOfElements())
        self.assertAlmostEqual(self.originalSum,
                               self.random.sumOfRows().sumOfElements(),
                               3)

    def test_RowSum(self):
        """Every row of the fixtures has the same sum, so any random row works."""
        self.assertEqual(3, self.small.rowSum(randrange(3)))
        self.assertEqual(1000, self.large.rowSum(randrange(1000)))
        self.assertEqual(1, self.identity.rowSum(randrange(100)))

    def test_Multiply(self):
        """Checks matrix products of all-ones matrices and products with the identity fixture."""
        result = self.small.multiply(self.small)
        self.assertEqual(27, result.sumOfElements())
        result = self.medium.multiply(self.medium)
        self.assertEqual(1000000.0, result.sumOfElements())
        result = self.random.multiply(self.identity)
        self.assertEqual(self.originalSum, result.sumOfElements())
        result = self.identity.multiply(self.random)
        self.assertEqual(self.originalSum, result.sumOfElements())

    def test_ElementProduct(self):
        """Element-wise product with the identity fixture keeps only the diagonal."""
        result = self.small.elementProduct(self.small)
        self.assertEqual(9, result.sumOfElements())
        result = self.large.elementProduct(self.large)
        self.assertEqual(1000000, result.sumOfElements())
        result = self.random.elementProduct(self.identity)
        self.assertEqual(result.trace(), result.sumOfElements())

    def test_SumOfElements(self):
        """Checks the element sum of every fixture."""
        self.assertEqual(9, self.small.sumOfElements())
        self.assertEqual(1000000, self.large.sumOfElements())
        self.assertEqual(100, self.identity.sumOfElements())
        self.assertEqual(self.originalSum, self.random.sumOfElements())

    def test_Trace(self):
        """Checks the trace of the all-ones and identity fixtures."""
        self.assertEqual(3, self.small.trace())
        self.assertEqual(1000, self.large.trace())
        self.assertEqual(100, self.identity.trace())

    def test_Transpose(self):
        """Transposing does not change the element sum."""
        self.assertEqual(9, self.small.transpose().sumOfElements())
        self.assertEqual(1000000, self.large.transpose().sumOfElements())
        self.assertEqual(100, self.identity.transpose().sumOfElements())
        self.assertAlmostEqual(self.originalSum,
                               self.random.transpose().sumOfElements(),
                               3)

    def test_IsSymmetric(self):
        """All-ones and identity fixtures are symmetric; the random fixture is not."""
        self.assertTrue(self.small.isSymmetric())
        self.assertTrue(self.large.isSymmetric())
        self.assertTrue(self.identity.isSymmetric())
        self.assertFalse(self.random.isSymmetric())

    def test_Determinant(self):
        """All-ones matrices have determinant 0; the identity fixture has determinant 1."""
        self.assertEqual(0, self.small.determinant())
        self.assertEqual(0, self.large.determinant())
        self.assertEqual(1, self.identity.determinant())

    def test_Inverse(self):
        """Inverting the identity keeps its sum; inverting twice restores the random fixture."""
        self.identity.inverse()
        self.assertEqual(100, self.identity.sumOfElements())
        self.random.inverse()
        self.random.inverse()
        self.assertAlmostEqual(self.originalSum, self.random.sumOfElements(), 5)

    def test_Characteristics(self):
        """Checks the number of entries returned by characteristics() for three fixtures."""
        vectors = self.small.characteristics()
        self.assertEqual(2, len(vectors))
        vectors = self.identity.characteristics()
        self.assertEqual(100, len(vectors))
        vectors = self.medium.characteristics()
        self.assertEqual(46, len(vectors))