def test_decompose_M_eigsh(self):
		"""Tests the whole framework and the resulting SVD/Eigendecomposition of M."""
		
		num_documents_per_language_list = [[13], [4], [12, 22, 10], [132, 123, 123], [123, 120, 130]] # controls Z.shape[0], Y.shape[0] and n
		voc_size_per_lang_list = [[6], [10], [23, 30, 4], [40, 70, 50], [123, 20, 120]] # controls Z.shape[1]
		num_concepts_list = [3, 4, 15, 100, 130] # controls Y.shape[1]

		first_k_list = [1, 2, 6, 50, 30]

		lambda_values = [1, 0.33, 1.66]

		seeds_z = [12345, 1111, 222222] # generates different Z's
		seeds_y = [1, 123123123, 50000] # generates different Y's
		seeds_v = [123, 1111, 22222, 333333] # generates different v's for the multiplication test

		for i in range(len(num_documents_per_language_list)):
			print("Starting %d" % i)
			num_docs_per_lang = num_documents_per_language_list[i]
			voc_size_per_lang = voc_size_per_lang_list[i]
			num_concepts = num_concepts_list[i] # dimensions of vector v
			total_vocabulary = np.sum(voc_size_per_lang)

			k = first_k_list[i]

			for seed_z in seeds_z:
				for seed_y in seeds_y:
					data_obj = self.generate_data_obj(num_concepts, num_docs_per_lang, voc_size_per_lang, seed_y=seed_y, seed_z=seed_z)

					for lambda_ in lambda_values:
						operations_obj = Operations(data_obj, lambda_)
						dense_M = self.get_dense_M(operations_obj)

						u, s, vh = np.linalg.svd(dense_M, full_matrices=False)

						e_vals = s[:k]
						e_vals = np.flip(e_vals, axis=0)
						e_vecs = np.flip(vh[:k].T, axis=1)
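						# SVD returns singular values in descending order, while the
						# eigsh-based decomposition returns the top k in ascending
						# order, so the dense reference is reversed to match.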

						vals, vecs = operations_obj.decompose_M_eigsh(k)

						self.assertTrue(np.allclose(vals, e_vals))
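						# Eigenvectors are determined only up to sign, so compare
						# absolute values.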
						self.assertTrue(np.allclose(np.abs(e_vecs), np.abs(vecs)))

						## Test multiply_by_V

						for seed_v in seeds_v:
							v = self.generate_v(k, seed_v)

							dense_multiply = vecs @ v

							custom_multiply = operations_obj.multiply_by_V_left(v)
							self.assertTrue(np.allclose(custom_multiply, dense_multiply))

						## Test multiply_by_V_T

						for seed_v in seeds_v:
							v = self.generate_v(num_concepts, seed_v)

							dense_multiply = vecs.T @ v

							custom_multiply = operations_obj.multiply_by_V_T_left(v)
							self.assertTrue(np.allclose(custom_multiply, dense_multiply))

						## Test multiply_by_M2

						dense_M2 = self.get_dense_M2(operations_obj)
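						# M2 is materialized densely only to serve as a reference for
						# the matrix-free multiplication below.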

						for seed_v in seeds_v:
							v = self.generate_v(total_vocabulary, seed_v)

							dense_multiply = dense_M2 @ v

							# Note: multiply_by_M2_left appears to be a static method
							# that takes the Operations instance explicitly.
							custom_multiply = Operations.multiply_by_M2_left(v, operations_obj)

							self.assertTrue(np.allclose(custom_multiply, dense_multiply))

						## Test M2 eigendecomposition

						u, s, vh = np.linalg.svd(dense_M2, full_matrices=False)

						e_vals = s[:k]
						e_vals = np.flip(e_vals, axis=0)
						e_vecs = np.flip(vh[:k].T, axis=1)

						vals, vecs = operations_obj.decompose_M2_eigsh(k)

						self.assertTrue(np.allclose(vals, e_vals))
						self.assertTrue(np.allclose(np.abs(e_vecs), np.abs(vecs)))
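
# Standalone sketch (assumes only NumPy and SciPy; not part of the original
# tests): it illustrates the two conventions the assertions above rely on.
# For a symmetric positive semi-definite matrix, np.linalg.svd returns
# singular values in descending order, scipy.sparse.linalg.eigsh returns the
# k largest eigenvalues in ascending order, and eigenvectors are determined
# only up to sign.
import numpy as np
from scipy.sparse.linalg import eigsh

rng = np.random.default_rng(0)
B = rng.standard_normal((8, 8))
S = B @ B.T  # symmetric PSD, so singular values equal eigenvalues
k = 3

u, s, vh = np.linalg.svd(S)  # s is in descending order
vals, vecs = eigsh(S, k=k)   # vals is in ascending order

assert np.allclose(vals, np.flip(s[:k]))  # same values, opposite order
assert np.allclose(np.abs(vecs), np.abs(np.flip(vh[:k].T, axis=1)))  # sign-invariant match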
# Example 2
    def run_experiment(self):
        '''Completes a full run of the pipeline, using the parameters passed to the constructor.'''

        params = self.params
        print(params)

        # Hard caps on the conjugate-gradient and ARPACK eigensolver iterations.
        cg_max_iter = 500
        eigs_max_iter = 250

        training_concepts_file_name = params['training_concepts_file_name']
        validation_set_file_name = params['validation_set_file_name']
        case_folding_flag = params['case_folding_flag']

        _lambda = params['lambda']

        # Tolerances are supplied as positive integer exponents n and
        # converted to 10**-n.
        cg_tol_1 = 10**(-params['cg_tol_1'])
        eigs_tol_1 = 10**(-params['eigs_tol_1'])

        # The second-stage tolerances are derived the same way (and are not
        # used further in this method yet).
        cg_tol_2 = 10**(-params['cg_tol_2'])
        eigs_tol_2 = 10**(-params['eigs_tol_2'])

        dims = params['dimensions']
        vocabulary_size = params['vocabulary_size']

        data_obj = Data()
        data_obj.load_training(training_concepts_file_name,
                               validation_set_file_name, case_folding_flag,
                               vocabulary_size)

        operations_obj = Operations(data_obj, _lambda, cg_max_iter, cg_tol_1)

        start = default_timer()

        try:
            vals, vecs = operations_obj.decompose_M_eigsh(
                dims, eigs_max_iter, eigs_tol_1)
        except ArpackError as e:
            # ARPACK failed to converge; retry once with the final flag set to
            # True (presumably a fallback mode of decompose_M_eigsh) before
            # giving up.
            try:
                print("ERROR occurred!")
                print(e)
                vals, vecs = operations_obj.decompose_M_eigsh(
                    dims, eigs_max_iter, eigs_tol_1, True)
            except ArpackError:
                print("FAIL! Can't complete the decomposition!")
                return

        end = default_timer()
        time_elapsed = end - start
        print("Finished decomposition one: ", time_elapsed)

        training_outcome = {}

        training_outcome['e_vals'] = vals
        training_outcome['e_vecs'] = vecs

        results_obj = {}
        results_obj['training_outcome'] = training_outcome
        results_obj['parameters'] = params
        results_obj['data'] = data_obj.final_dataset_dump_name

        # Dump intermediate results so the first decomposition is preserved
        # even if the second one fails.
        with open(self.results_dump_path, 'wb') as f:
            pickle.dump(results_obj, f, protocol=4)

        start = default_timer()

        try:
            vals_m2, vecs_m2 = operations_obj.decompose_M2_eigsh(
                dims, eigs_max_iter, eigs_tol_1)
            print(vals_m2)  # Visual sanity check
        except ArpackError as e:
            # Same retry-once fallback as for the first decomposition.
            try:
                print("ERROR occurred!")
                print(e)
                vals_m2, vecs_m2 = operations_obj.decompose_M2_eigsh(
                    dims, eigs_max_iter, eigs_tol_1, True)
            except ArpackError:
                print("FAIL! Can't complete the decomposition!")
                return

        end = default_timer()
        time_elapsed = end - start
        print("Finished decomposition two: ", time_elapsed)

        training_outcome['M2_e_vals'] = vals_m2
        training_outcome['M2_e_vecs'] = vecs_m2

        # Solver diagnostics recorded by Operations during the runs.
        # training_outcome['cg_residuals'] = operations_obj.cg_residuals
        training_outcome['num_iter'] = operations_obj.num_iter
        # training_outcome['cg_residuals2'] = operations_obj.cg_residuals2
        training_outcome['num_iter2'] = operations_obj.num_iter2
        training_outcome['time_consumed'] = operations_obj.time_consumed

        results_obj['training_outcome'] = training_outcome

        # Overwrite the earlier dump with the complete results.
        with open(self.results_dump_path, 'wb') as f:
            pickle.dump(results_obj, f, protocol=4)

        self.logger.revert_standard_output()
        self.logger.log_run()
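
# Hypothetical usage sketch (not part of the original code): reading back a
# results dump written by run_experiment. The file path is a placeholder.
import pickle

with open("results.pkl", 'rb') as f:  # placeholder path
    results_obj = pickle.load(f)

print(results_obj['parameters'])
outcome = results_obj['training_outcome']
print(outcome['e_vals'])      # eigenvalues of M
print(outcome['M2_e_vals'])   # eigenvalues of M2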