def tes_multiply_by_B_left(self):
	"""Tests the functionality of multiplying a vector with B from the left.
	The tested routine is used in the CG method.

	NOTE(review): the name lacks the "test_" prefix, so unittest discovery
	skips it -- presumably disabled on purpose; confirm before renaming.
	"""

	num_documents_per_language_list = [[13], [4], [12, 22, 10], [132, 123, 123], [123, 120, 130]] # controls Z.shape[0] and the value of n
	voc_size_per_lang_list = [[6], [10], [23, 30, 4], [40, 70, 50], [123, 20, 120]] # controls Z.shape[1]
	num_concepts_list = np.ones_like(voc_size_per_lang_list) # dummy used to create the data object

	lambda_values = [1, 0.5, 0.33, 1.66, 2]

	seeds_z = [12345, 1111, 222222] # generates different Z's
	seeds_v = [123, 1111, 22222, 333333] # generates different v's

	for num_docs_per_lang, voc_size_per_lang, num_concepts in zip(
			num_documents_per_language_list, voc_size_per_lang_list, num_concepts_list):
		total_vocabulary = np.sum(voc_size_per_lang) # dimension of vector v

		for seed_z in seeds_z:
			data_obj = self.generate_data_obj(num_concepts, num_docs_per_lang, voc_size_per_lang, seed_z = seed_z)

			for lambda_ in lambda_values:
				operations_obj = Operations(data_obj, lambda_)

				for seed_v in seeds_v:
					v = self.generate_v(total_vocabulary, seed_v)

					# Dense reference product.
					dense_B = self.get_dense_B(operations_obj)
					dense_multiply = dense_B @ v

					# The custom routine expects a flat (1-D) vector.
					v = v.reshape(v.shape[0])
					custom_multiply = Operations.multiply_by_B_left(v, operations_obj)
					self.assertTrue(np.allclose(custom_multiply, dense_multiply.reshape(v.shape[0])))
	def tes_multiply_by_M(self):
		"""Tests the functionality of multiplying a vector by M from the left.
		The tested routine is used by the iterative method for finding the
		SVD/Eigendecomposition decomposition of M."""

		# Entry i of each list controls, respectively: Z.shape[0]/Y.shape[0]/n,
		# Z.shape[1], and Y.shape[1].
		docs_per_lang_options = [[13], [4], [12, 22, 10], [132, 123, 123], [123, 120, 130]]
		vocab_per_lang_options = [[6], [10], [23, 30, 4], [40, 70,50], [123, 20, 120]]
		concept_options = [3, 4, 15, 100, 130]

		lambda_values = [1, 0.5, 0.33, 1.66, 2]

		seeds_z = [12345, 1111, 222222] # generates different Z's
		seeds_y = [1, 123123123, 50000] # generates different Y's
		seeds_v = [123, 1111, 22222, 333333] # generates different v's

		cases = zip(docs_per_lang_options, vocab_per_lang_options, concept_options)
		for num_docs_per_lang, voc_size_per_lang, num_concepts in cases:
			for seed_z in seeds_z:
				for seed_y in seeds_y:
					data_obj = self.generate_data_obj(num_concepts, num_docs_per_lang, voc_size_per_lang, seed_y = seed_y, seed_z = seed_z)

					for lambda_ in lambda_values:
						operations_obj = Operations(data_obj, lambda_)

						for seed_v in seeds_v:
							# The custom routine expects a flat (1-D) vector.
							v = self.generate_v(num_concepts, seed_v).reshape(-1)

							expected = self.get_dense_M(operations_obj) @ v
							actual = Operations.multiply_by_M_left(v, operations_obj)

							self.assertTrue(np.allclose(actual, expected))
	def tes_multiply_by_Z_T(self):
		"""Tests the functionality of multiplying a vector with Z transpose from the left"""

		num_documents_per_language_list = [[13], [4], [12, 22, 10], [132, 123, 123], [123, 120, 130]] # controls Z.shape[0]
		voc_size_per_lang_list = [[6], [10], [23, 30, 4], [40, 70, 50], [123, 20, 120]] # controls Z.shape[1]
		num_concepts_list = np.ones_like(voc_size_per_lang_list) # dummy used to create the data object

		seeds_z = [12345, 1111, 222222] # generates different Z's
		seeds_v = [123, 1111, 22222, 333333] # generates different v's

		for num_docs_per_lang, voc_size_per_lang, num_concepts in zip(
				num_documents_per_language_list, voc_size_per_lang_list, num_concepts_list):
			total_num_of_docs = np.sum(num_docs_per_lang) # dimension of vector v

			for seed_z in seeds_z:
				data_obj = self.generate_data_obj(num_concepts, num_docs_per_lang, voc_size_per_lang, seed_z = seed_z)
				operations_obj = Operations(data_obj)
				Z_dense_T = data_obj.Z_T.todense()

				for seed_v in seeds_v:
					v = self.generate_v(total_num_of_docs, seed_v)
					dense_multiply = Z_dense_T @ v
					custom_multiply = operations_obj.multiply_by_Z_T_left(v)
					mkl_multiply = operations_obj.multiply_by_Z_T_viaMKL(v)
					# The MKL path and the custom path must agree with each
					# other and with the dense reference.
					self.assertTrue(np.allclose(mkl_multiply, custom_multiply))
					self.assertTrue(np.allclose(custom_multiply, dense_multiply))
	def tes_multiply_by_Y_T_left(self):
		"""Tests the functionality of multiplying a vector with Y transpose from the left"""

		docs_options = [[8], [8], [100], [123]] # controls Y.shape[0]
		concepts_options = [5, 8, 35, 62] # controls Y.shape[1]

		seeds_y = [12345, 1111, 222222] # generates different Y's
		seeds_v = [1233, 111111, 22, 22222] # generates different v's

		for num_docs_per_lang, num_concepts in zip(docs_options, concepts_options):
			# Vocabulary sizes do not matter for this test; reuse the document counts.
			voc_size_per_lang = num_docs_per_lang
			total_num_of_docs = np.sum(num_docs_per_lang)

			for seed_y in seeds_y:
				data_obj = self.generate_data_obj(num_concepts, num_docs_per_lang, voc_size_per_lang, seed_y = seed_y)
				operations_obj = Operations(data_obj)
				dense_Y_T = data_obj.Y.todense().T

				for seed_v in seeds_v:
					v = self.generate_v(total_num_of_docs, seed_v)
					expected = dense_Y_T @ v
					actual = operations_obj.multiply_by_Y_T_left(v)
					self.assertTrue(np.allclose(actual, expected))
	def test_multiply_by_inverse_cg(self):
		"""Tests the functionality of multiplying a vector by B inverse from the left.
		The product is generated using the CG method."""

		num_documents_per_language_list = [[13], [4], [12, 22, 10], [132, 123, 123], [123, 120, 130]] # controls Z.shape[0] and the value of n
		voc_size_per_lang_list = [[6], [10], [23, 30, 4], [40, 70, 50], [123, 20, 120]] # controls Z.shape[1]
		num_concepts_list = np.ones_like(voc_size_per_lang_list) # dummy used to create the data object

		lambda_values = [1, 0.5, 0.33, 1.66, 2]

		seeds_z = [12345, 1111, 222222] # generates different Z's
		seeds_v = [123, 1111, 22222, 333333] # generates different v's

		for num_docs_per_lang, voc_size_per_lang, num_concepts in zip(
				num_documents_per_language_list, voc_size_per_lang_list, num_concepts_list):
			total_vocabulary = np.sum(voc_size_per_lang) # dimension of vector v

			for seed_z in seeds_z:
				data_obj = self.generate_data_obj(num_concepts, num_docs_per_lang, voc_size_per_lang, seed_z = seed_z)

				for lambda_ in lambda_values:
					operations_obj = Operations(data_obj, lambda_)

					for seed_v in seeds_v:
						v = self.generate_v(total_vocabulary, seed_v)

						# Dense reference: solve B x = v directly instead of
						# forming the explicit inverse (more numerically stable).
						dense_B = self.get_dense_B(operations_obj)
						dense_multiply = np.linalg.solve(dense_B, v)

						# The routine returns a tuple; element 0 is the solution
						# vector, reshaped here to a column for comparison.
						return_obj = operations_obj.multiply_by_inverse_cg(v)
						custom_multiply = return_obj[0].reshape(total_vocabulary, 1)

						self.assertTrue(np.allclose(custom_multiply, dense_multiply))
	def tes_multiply_by_const_left(self):
		"""Tests the functionality of multiplying a vector with the constant factor from the left"""

		docs_options = [[8], [8], [100], [123]] # controls the value of n in the constant factor
		seeds_v = [123, 1111, 22222, 333333] # generates different v's
		seed_y = 123

		for num_docs_per_lang in docs_options:
			voc_size_per_lang = num_docs_per_lang
			num_concepts = num_docs_per_lang[0]

			data_obj = self.generate_data_obj(num_concepts, num_docs_per_lang, voc_size_per_lang, seed_y = seed_y)
			operations_obj = Operations(data_obj)

			# Dense reference for the constant factor: I - 1/n.
			n = data_obj.Z.shape[0]
			const_dense = np.eye(n) - 1/n

			for seed_v in seeds_v:
				v = self.generate_v(num_concepts, seed_v)
				expected = const_dense @ v
				python_result = operations_obj.multiply_by_const_left(v)
				cython_result = operations_obj.multiply_by_const_left_cython(v)
				self.assertTrue(np.allclose(python_result, expected))
				self.assertTrue(np.allclose(expected, cython_result.reshape(v.shape[0], 1)))
	def tes_decompose_M_eigsh(self):
		"""Tests the whole framework: the SVD/Eigendecomposition of M, the
		V / V^T multiplication routines built on it, the M2 multiplication
		routine, and the eigendecomposition of M2."""

		num_documents_per_language_list = [[13], [4], [12, 22, 10], [132, 123, 123], [123, 120, 130]] # controls Z.shape[0], Y.shape[0] and n
		voc_size_per_lang_list = [[6], [10], [23, 30, 4], [40, 70, 50], [123, 20, 120]] # controls Z.shape[1]
		num_concepts_list = [3, 4, 15, 100, 130] # controls Y.shape[1]

		first_k_list = [1, 2, 6, 50, 30] # number of leading components requested

		lambda_values = [1, 0.33, 1.66]

		seeds_z = [12345, 1111, 222222] # generates different Z's
		seeds_y = [1, 123123123, 50000] # generates different Y's
		seeds_v = [123, 1111, 22222, 333333] # generates different v's for the multiplication test

		cases = zip(num_documents_per_language_list, voc_size_per_lang_list,
				num_concepts_list, first_k_list)
		for i, (num_docs_per_lang, voc_size_per_lang, num_concepts, k) in enumerate(cases):
			print("Starting %d" % i)
			total_vocabulary = np.sum(voc_size_per_lang)

			def check_decomposition(dense_mat, decompose):
				"""Compare decompose(k) against a dense SVD reference and return
				the computed eigenvectors. The flips reverse the SVD's descending
				order to match the order eigsh emits (ascending eigenvalues);
				eigenvectors are compared up to sign."""
				_, s, vh = np.linalg.svd(dense_mat, full_matrices=False)
				e_vals = np.flip(s[:k], axis=0)
				e_vecs = np.flip(vh[:k].T, axis=1)
				vals, vecs = decompose(k)
				self.assertTrue(np.allclose(vals, e_vals))
				self.assertTrue(np.allclose(np.abs(e_vecs), np.abs(vecs)))
				return vecs

			for seed_z in seeds_z:
				for seed_y in seeds_y:
					data_obj = self.generate_data_obj(num_concepts, num_docs_per_lang, voc_size_per_lang, seed_y = seed_y, seed_z = seed_z)

					for lambda_ in lambda_values:
						operations_obj = Operations(data_obj, lambda_)

						## Test M eigendecomposition
						dense_M = self.get_dense_M(operations_obj)
						vecs = check_decomposition(dense_M, operations_obj.decompose_M_eigsh)

						## Test multiply_by_V
						for seed_v in seeds_v:
							v = self.generate_v(k, seed_v)
							dense_multiply = vecs @ v
							custom_multiply = operations_obj.multiply_by_V_left(v)
							self.assertTrue(np.allclose(custom_multiply, dense_multiply))

						## Test multiply_by_V_T
						for seed_v in seeds_v:
							v = self.generate_v(num_concepts, seed_v)
							dense_multiply = vecs.T @ v
							custom_multiply = operations_obj.multiply_by_V_T_left(v)
							self.assertTrue(np.allclose(custom_multiply, dense_multiply))

						## Test multiply_by_M2
						dense_M2 = self.get_dense_M2(operations_obj)
						for seed_v in seeds_v:
							v = self.generate_v(total_vocabulary, seed_v)
							dense_multiply = dense_M2 @ v
							custom_multiply = Operations.multiply_by_M2_left(v, operations_obj)
							self.assertTrue(np.allclose(custom_multiply, dense_multiply))

						## Test M2 eigendecomposition
						check_decomposition(dense_M2, operations_obj.decompose_M2_eigsh)
# --- Ejemplo n.º 8 (example separator carried over from the original source listing) ---
    def run_experiment(self):
        '''Completes an actual run of the full pipeline, with the parameters
        corresponding to the arguments passed in the constructor.

        Loads the training data, decomposes M and then M2 (retrying each
        decomposition once with the fallback flag on ArpackError), and pickles
        the results to self.results_dump_path after each decomposition so
        partial results survive a late failure.
        '''

        params = self.params
        print(params)

        cg_max_iter = 500
        eigs_max_iter = 250

        training_concepts_file_name = params['training_concepts_file_name']
        validation_set_file_name = params['validation_set_file_name']
        case_folding_flag = params['case_folding_flag']

        _lambda = params['lambda']

        # Tolerances are stored in params as negative powers of ten.
        cg_tol_1 = 10**(-1 * params['cg_tol_1'])
        eigs_tol_1 = 10**(-1 * params['eigs_tol_1'])

        # Same for now -- read but currently unused below; kept so a missing
        # key still fails fast here rather than later.
        cg_tol_2 = 10**(-1 * params['cg_tol_2'])
        eigs_tol_2 = 10**(-1 * params['eigs_tol_2'])

        dims = params['dimensions']
        vocabulary_size = params['vocabulary_size']

        data_obj = Data()
        data_obj.load_training(training_concepts_file_name,
                               validation_set_file_name, case_folding_flag,
                               vocabulary_size)

        operations_obj = Operations(data_obj, _lambda, cg_max_iter, cg_tol_1)

        def _decompose_with_retry(decompose):
            '''Run a decomposition; on ArpackError retry once with the
            fallback flag set. Returns (vals, vecs), or None if both
            attempts fail.'''
            try:
                return decompose(dims, eigs_max_iter, eigs_tol_1)
            except ArpackError as e:
                print("ERROR occured!")
                print(e)
                try:
                    return decompose(dims, eigs_max_iter, eigs_tol_1, True)
                except ArpackError:
                    print("FAIL! Can't complete the decomposition!")
                    return None

        start = default_timer()
        outcome = _decompose_with_retry(operations_obj.decompose_M_eigsh)
        if outcome is None:
            return
        vals, vecs = outcome
        time_elapsed = default_timer() - start
        print("Finished decomposition one: ", time_elapsed)

        training_outcome = {}
        training_outcome['e_vals'] = vals
        training_outcome['e_vecs'] = vecs

        results_obj = {}
        results_obj['training_outcome'] = training_outcome
        results_obj['parameters'] = params
        results_obj['data'] = data_obj.final_dataset_dump_name

        # First dump: preserves the M decomposition even if M2 fails below.
        with open(self.results_dump_path, 'wb') as f:
            pickle.dump(results_obj, f, protocol=4)

        start = default_timer()
        outcome = _decompose_with_retry(operations_obj.decompose_M2_eigsh)
        if outcome is None:
            return
        vals_m2, vecs_m2 = outcome
        print(vals_m2)  # Visual sanity check
        time_elapsed = default_timer() - start
        print("Finished decomposition two: ", time_elapsed)

        training_outcome['M2_e_vals'] = vals_m2
        training_outcome['M2_e_vecs'] = vecs_m2

        # Solver statistics collected by the Operations object during the run.
        training_outcome['num_iter'] = operations_obj.num_iter
        training_outcome['num_iter2'] = operations_obj.num_iter2
        training_outcome['time_consumed'] = operations_obj.time_consumed

        results_obj['training_outcome'] = training_outcome

        # Second (final) dump overwrites the first with the complete results.
        with open(self.results_dump_path, 'wb') as f:
            pickle.dump(results_obj, f, protocol=4)

        self.logger.revert_standard_output()
        self.logger.log_run()