def train(self, trainSet: InstanceList, parameters: Parameter): """ Training algorithm for the linear discriminant analysis classifier (Introduction to Machine Learning, Alpaydin, 2015). PARAMETERS ---------- trainSet : InstanceList Training data given to the algorithm. parameters : Parameter Parameter of the Lda algorithm. """ w0 = {} w = {} priorDistribution = trainSet.classDistribution() classLists = Partition(trainSet) covariance = Matrix(trainSet.get(0).continuousAttributeSize(), trainSet.get(0).continuousAttributeSize()) for i in range(classLists.size()): averageVector = Vector(classLists.get(i).continuousAverage()) classCovariance = classLists.get(i).covariance(averageVector) classCovariance.multiplyWithConstant(classLists.get(i).size() - 1) covariance.add(classCovariance) covariance.divideByConstant(trainSet.size() - classLists.size()) covariance.inverse() for i in range(classLists.size()): Ci = classLists.get(i).getClassLabel() averageVector = Vector(classLists.get(i).continuousAverage()) wi = covariance.multiplyWithVectorFromRight(averageVector) w[Ci] = wi w0i = -0.5 * wi.dotProduct(averageVector) + math.log(priorDistribution.getProbability(Ci)) w0[Ci] = w0i self.model = LdaModel(priorDistribution, w, w0)
def __linearRegressionOnCountsOfCounts(self, countsOfCounts: list) -> list: """ Given counts of counts, this function will calculate the estimated counts of counts c$^*$ with Good-Turing smoothing. First, the algorithm filters the non-zero counts from counts of counts array and constructs c and r arrays. Then it constructs Z_n array with Z_n = (2C_n / (r_{n+1} - r_{n-1})). The algorithm then uses simple linear regression on Z_n values to estimate w_1 and w_0, where log(N[i]) = w_1log(i) + w_0 PARAMETERS ---------- countsOfCounts : list Counts of counts. countsOfCounts[1] is the number of words occurred once in the corpus. countsOfCounts[i] is the number of words occurred i times in the corpus. RETURNS ------ list Estimated counts of counts array. N[1] is the estimated count for out of vocabulary words. """ N = [0.0] * len(countsOfCounts) r = [] c = [] for i in range(1, len(countsOfCounts)): if countsOfCounts[i] != 0: r.append(i) c.append(countsOfCounts[i]) A = Matrix(2, 2) y = Vector(2, 0) for i in range(len(r)): xt = math.log(r[i]) if i == 0: rt = math.log(c[i]) else: if i == len(r) - 1: rt = math.log((1.0 * c[i]) / (r[i] - r[i - 1])) else: rt = math.log((2.0 * c[i]) / (r[i + 1] - r[i - 1])) A.addValue(0, 0, 1.0) A.addValue(0, 1, xt) A.addValue(1, 0, xt) A.addValue(1, 1, xt * xt) y.addValue(0, rt) y.addValue(1, rt * xt) A.inverse() w = A.multiplyWithVectorFromRight(y) w0 = w.getValue(0) w1 = w.getValue(1) for i in range(1, len(countsOfCounts)): N[i] = math.exp(math.log(i) * w1 + w0) return N
class MatrixTest(unittest.TestCase): def setUp(self): self.small = Matrix(3, 3) for i in range(3): for j in range(3): self.small.setValue(i, j, 1.0) self.v = Vector(3, 1.0) self.large = Matrix(1000, 1000) for i in range(1000): for j in range(1000): self.large.setValue(i, j, 1.0) self.medium = Matrix(100, 100) for i in range(100): for j in range(100): self.medium.setValue(i, j, 1.0) self.V = Vector(1000, 1.0) self.vr = Vector(100, 1.0) self.random = Matrix(100, 100, 1, 10, 1) self.originalSum = self.random.sumOfElements() self.identity = Matrix(100) def test_ColumnWiseNormalize(self): mClone = self.small.clone() mClone.columnWiseNormalize() self.assertEqual(3, mClone.sumOfElements()) MClone = self.large.clone() MClone.columnWiseNormalize() self.assertAlmostEqual(1000, MClone.sumOfElements(), 3) self.identity.columnWiseNormalize() self.assertEqual(100, self.identity.sumOfElements()) def test_MultiplyWithConstant(self): self.small.multiplyWithConstant(4) self.assertEqual(36, self.small.sumOfElements()) self.small.divideByConstant(4) self.large.multiplyWithConstant(1.001) self.assertAlmostEqual(1001000, self.large.sumOfElements(), 3) self.large.divideByConstant(1.001) self.random.multiplyWithConstant(3.6) self.assertAlmostEqual(self.originalSum * 3.6, self.random.sumOfElements(), 4) self.random.divideByConstant(3.6) def test_DivideByConstant(self): self.small.divideByConstant(4) self.assertEqual(2.25, self.small.sumOfElements()) self.small.multiplyWithConstant(4) self.large.divideByConstant(10) self.assertAlmostEqual(100000, self.large.sumOfElements(), 3) self.large.multiplyWithConstant(10) self.random.divideByConstant(3.6) self.assertAlmostEqual(self.originalSum / 3.6, self.random.sumOfElements(), 4) self.random.multiplyWithConstant(3.6) def test_Add(self): self.random.add(self.identity) self.assertAlmostEqual(self.originalSum + 100, self.random.sumOfElements(), 4) self.random.subtract(self.identity) def test_AddVector(self): self.large.addRowVector(4, self.V) self.assertEqual(1001000, self.large.sumOfElements(), 0.0) self.V.multiply(-1.0) self.large.addRowVector(4, self.V) self.V.multiply(-1.0) def test_Subtract(self): self.random.subtract(self.identity) self.assertAlmostEqual(self.originalSum - 100, self.random.sumOfElements(), 4) self.random.add(self.identity) def test_MultiplyWithVectorFromLeft(self): result = self.small.multiplyWithVectorFromLeft(self.v) self.assertEqual(9, result.sumOfElements()) result = self.large.multiplyWithVectorFromLeft(self.V) self.assertEqual(1000000, result.sumOfElements()) result = self.random.multiplyWithVectorFromLeft(self.vr) self.assertAlmostEqual(self.originalSum, result.sumOfElements(), 4) def test_MultiplyWithVectorFromRight(self): result = self.small.multiplyWithVectorFromRight(self.v) self.assertEqual(9, result.sumOfElements()) result = self.large.multiplyWithVectorFromRight(self.V) self.assertEqual(1000000, result.sumOfElements()) result = self.random.multiplyWithVectorFromRight(self.vr) self.assertAlmostEqual(self.originalSum, result.sumOfElements(), 4) def test_ColumnSum(self): self.assertEqual(3, self.small.columnSum(randrange(3))) self.assertEqual(1000, self.large.columnSum(randrange(1000))) self.assertEqual(1, self.identity.columnSum(randrange(100))) def test_SumOfRows(self): self.assertEqual(9, self.small.sumOfRows().sumOfElements()) self.assertEqual(1000000, self.large.sumOfRows().sumOfElements()) self.assertEqual(100, self.identity.sumOfRows().sumOfElements()) self.assertAlmostEqual(self.originalSum, self.random.sumOfRows().sumOfElements(), 3) def test_RowSum(self): self.assertEqual(3, self.small.rowSum(randrange(3))) self.assertEqual(1000, self.large.rowSum(randrange(1000))) self.assertEqual(1, self.identity.rowSum(randrange(100))) def test_Multiply(self): result = self.small.multiply(self.small) self.assertEqual(27, result.sumOfElements()) result = self.medium.multiply(self.medium) self.assertEqual(1000000.0, result.sumOfElements()) result = self.random.multiply(self.identity) self.assertEqual(self.originalSum, result.sumOfElements()) result = self.identity.multiply(self.random) self.assertEqual(self.originalSum, result.sumOfElements()) def test_ElementProduct(self): result = self.small.elementProduct(self.small) self.assertEqual(9, result.sumOfElements()) result = self.large.elementProduct(self.large) self.assertEqual(1000000, result.sumOfElements()) result = self.random.elementProduct(self.identity) self.assertEqual(result.trace(), result.sumOfElements()) def test_SumOfElements(self): self.assertEqual(9, self.small.sumOfElements()) self.assertEqual(1000000, self.large.sumOfElements()) self.assertEqual(100, self.identity.sumOfElements()) self.assertEqual(self.originalSum, self.random.sumOfElements()) def test_Trace(self): self.assertEqual(3, self.small.trace()) self.assertEqual(1000, self.large.trace()) self.assertEqual(100, self.identity.trace()) def test_Transpose(self): self.assertEqual(9, self.small.transpose().sumOfElements()) self.assertEqual(1000000, self.large.transpose().sumOfElements()) self.assertEqual(100, self.identity.transpose().sumOfElements()) self.assertAlmostEqual(self.originalSum, self.random.transpose().sumOfElements(), 3) def test_IsSymmetric(self): self.assertTrue(self.small.isSymmetric()) self.assertTrue(self.large.isSymmetric()) self.assertTrue(self.identity.isSymmetric()) self.assertFalse(self.random.isSymmetric()) def test_Determinant(self): self.assertEqual(0, self.small.determinant()) self.assertEqual(0, self.large.determinant()) self.assertEqual(1, self.identity.determinant()) def test_Inverse(self): self.identity.inverse() self.assertEqual(100, self.identity.sumOfElements()) self.random.inverse() self.random.inverse() self.assertAlmostEqual(self.originalSum, self.random.sumOfElements(), 5) def test_Characteristics(self): vectors = self.small.characteristics() self.assertEqual(2, len(vectors)) vectors = self.identity.characteristics() self.assertEqual(100, len(vectors)) vectors = self.medium.characteristics() self.assertEqual(46, len(vectors))