def test_default_training_set_based_feature_selection_for_raw_fingerprint_representations_of_training_and_test_set(self):
		##############################
		print 'Running unittests for this project: ', project_name
		print 'Running this unittest: ', self._testMethodName
		##################################
		
		from ml_input_utils import descriptorsFilesProcessor
		from ml_functions import filter_features_for_svmlight_format_files
		
		id2TrainClass = {'mA':1,'mB':1,'mC':0,'mD':1,'mE':0,'mG':0,'mF':0,'mH':0} #trying to make sure (in train_fp_file)  one feature (f1) is only found in class 1, not class 0, hence it should be selected, but that this feature (f1) is not found in the test set!
		id2TestClass = {'mX':1,'mY':1,'mZ':1}
		#Note to self: as ever, following file names need to be adjusted to make sure files in the directory of this test code Python file are parsed.
		train_fp_file = r'%s\contrived_fp_train_file.txt' % "\\".join(os.path.abspath(__file__).split('\\')[:-1])
		test_fp_file = r'%s\contrived_fp_test_file.txt' % "\\".join(os.path.abspath(__file__).split('\\')[:-1])
		all_feats_svmlight_train_file = r'%s\contrived_svmlight_train_file.txt' % "\\".join(os.path.abspath(__file__).split('\\')[:-1])
		all_feats_svmlight_test_file = r'%s\contrived_svmlight_test_file.txt' % "\\".join(os.path.abspath(__file__).split('\\')[:-1])
		
		
		print 'Preparing original files (pre-feature selection) in svmlight format.'
		
		
		our_descriptorsFilesProcessor = descriptorsFilesProcessor()
		
		record_of_all_feat2IndexFiles = [None]
		
		for TRAIN_OR_TEST_LABEL in ['Train','Test']:
			if 'Train' == TRAIN_OR_TEST_LABEL:
				id2class = id2TrainClass
				fp_file = train_fp_file
				all_feats_svmlight_file =  all_feats_svmlight_train_file
			else:
				assert 'Test' == TRAIN_OR_TEST_LABEL
				id2class = id2TestClass
				fp_file = test_fp_file
				all_feats_svmlight_file = all_feats_svmlight_test_file
			
			record_of_all_feat2IndexFiles = our_descriptorsFilesProcessor.write_svmlight_format_modellingFile_from_multiple_descriptors_files(list_of_descriptors_files=[fp_file],corresponding_list_of_whether_descriptors_file_is_actually_a_raw_fp_file=[True],corresponding_list_of_whether_descriptors_file_is_actually_a_jCompoundMapperStringFeatures_file=[False],descriptors_file_name=all_feats_svmlight_file,id2responseVariable=id2class,corresponding_list_of_unique_features_files=record_of_all_feat2IndexFiles)
		
		del our_descriptorsFilesProcessor
		
		print 'PREPARED original files (pre-feature selection) in svmlight format.'
		
		filter_features_for_svmlight_format_files(svmlight_format_train_file=all_feats_svmlight_train_file,svmlight_format_test_file=all_feats_svmlight_test_file,number_of_features_to_retain=2)
		
		filtered_feats_svmlight_train_file = r'%s\contrived_svmlight_train_file_fs_chi2_top_2.txt' % "\\".join(os.path.abspath(__file__).split('\\')[:-1])
		filtered_feats_svmlight_test_file = r'%s\contrived_svmlight_test_file_fs_chi2_top_2.txt' % "\\".join(os.path.abspath(__file__).split('\\')[:-1])
		training_set_feature_name_to_feature_index_file = r'%s\contrived_fp_train_file_fpFeat2InitialIndex.csv' % "\\".join(os.path.abspath(__file__).split('\\')[:-1])
		
		
		all_input_files_required_for_unittesting = [train_fp_file,test_fp_file]
		
		all_orig_output_files_to_be_compared_as_required_for_unittesting = []
		for new_file in [all_feats_svmlight_train_file,all_feats_svmlight_test_file,filtered_feats_svmlight_train_file,filtered_feats_svmlight_test_file,training_set_feature_name_to_feature_index_file]:
			file_ext = new_file.split('.')[-1]
			orig_file = re.sub('(\.%s$)' % file_ext,' - Copy.%s' % file_ext,new_file)
			all_orig_output_files_to_be_compared_as_required_for_unittesting.append(orig_file)
			self.compareOriginalAndNewFiles(orig_file,new_file)
		
		files_not_to_delete = all_input_files_required_for_unittesting+all_orig_output_files_to_be_compared_as_required_for_unittesting
		self.clean_up_if_all_checks_passed(specific_files_not_to_delete=files_not_to_delete)
Example #2
0
	def test_non_default_FS_which_adds_missing_validation_set_features(self):
		###############################
		#<N.B.: For first run, did not clean up output files (which were copied to give the file copies to compare with in later test runs) and turned off comparison to file copies. These output files were manually inspected w.r.t. input files to check consistency with expectations.>
		###############################
		##############################
		print 'Running unittests for this project: ', project_name
		print 'Running this unittest: ', self._testMethodName
		##################################
		
		all_input_files_required_for_unittesting = []
		all_orig_output_files_to_be_compared_as_required_for_unittesting = []
		
		from ml_functions import filter_features_for_svmlight_format_files,f_regression,chi2
		
		NUMBER_OF_KEPT_FEATURES = 2
		
		for NOT_DEFAULT in [True,False]:
			for REG_OR_CLASS in ['REG','CLASS']:
				if 'REG' == REG_OR_CLASS:
					FS_FUNCTION = f_regression
					fs_method_label = 'f_regression'
				else:
					assert 'CLASS' == REG_OR_CLASS
					FS_FUNCTION = chi2
					fs_method_label = 'chi2'
				
				all_feats_svmlight_train_file = r'%s\svmlight_%s_train_file.txt' % ("\\".join(os.path.abspath(__file__).split('\\')[:-1]),REG_OR_CLASS)
				all_feats_svmlight_test_file = r'%s\svmlight_%s_test_file.txt' % ("\\".join(os.path.abspath(__file__).split('\\')[:-1]),REG_OR_CLASS)
				
				
				filter_features_for_svmlight_format_files(svmlight_format_train_file=all_feats_svmlight_train_file,svmlight_format_test_file=all_feats_svmlight_test_file,univariate_scoring_function=FS_FUNCTION,number_of_features_to_retain=NUMBER_OF_KEPT_FEATURES,ensure_test_set_consistency=NOT_DEFAULT)
				
				
				#Names must be consistent with train/test file and options passed to filter_features_for_svmlight_format_files(...):
				
				if NOT_DEFAULT:
					DEFAULT_OR_NOT_LABEL = '_nonDefault_fs'
				else:
					DEFAULT_OR_NOT_LABEL = '_fs'
				
				filtered_feats_svmlight_train_file = r'%s\svmlight_%s_train_file%s_%s_top_%d.txt' % ("\\".join(os.path.abspath(__file__).split('\\')[:-1]),REG_OR_CLASS,DEFAULT_OR_NOT_LABEL,fs_method_label,NUMBER_OF_KEPT_FEATURES) 
				filtered_feats_svmlight_test_file = r'%s\svmlight_%s_test_file%s_%s_top_%d.txt' % ("\\".join(os.path.abspath(__file__).split('\\')[:-1]),REG_OR_CLASS,DEFAULT_OR_NOT_LABEL,fs_method_label,NUMBER_OF_KEPT_FEATURES)
				
				
				#02/05/2013: commented out below for first trial runs and then, when output looked as expected<TO DO:PARTIALLY DONE:TEST SETS LOOK AS EXPECTED WHEN LABELLED 'nonDefault' OR NOT>, copied output and re-ran test with the following uncommented:
				all_input_files_required_for_unittesting += [all_feats_svmlight_train_file,all_feats_svmlight_test_file]
				
				
				for new_file in [filtered_feats_svmlight_train_file,filtered_feats_svmlight_test_file]:
					file_ext = new_file.split('.')[-1]
					orig_file = re.sub('(\.%s$)' % file_ext,' - Copy.%s' % file_ext,new_file)
					all_orig_output_files_to_be_compared_as_required_for_unittesting.append(orig_file)
					self.compareOriginalAndNewFiles(orig_file,new_file)
				
		files_not_to_delete = all_input_files_required_for_unittesting+all_orig_output_files_to_be_compared_as_required_for_unittesting
		self.clean_up_if_all_checks_passed(specific_files_not_to_delete=files_not_to_delete)
	def test_14_filter_features_for_svmlight_format_files_RegExample(self):
		##############################
		print 'Running unittests for this project: ', project_name
		print 'Running this unittest: ', self._testMethodName
		##################################
		
		do_not_to_delete = glob.glob(r'%s\*' % current_dir)
		
		from ml_functions import filter_features_for_svmlight_format_files
		
		###########
		#c.f. generate_modelling_input.py:
		
		from ml_functions import f_regression as fsFunction
		
		subset2fsFile,indices_of_top_K_features = filter_features_for_svmlight_format_files(svmlight_format_train_file=r'%s\test14_svmlight_REG_train_file.txt' % current_dir,svmlight_format_test_file=r'%s\test14_svmlight_REG_test_file.txt' % current_dir,univariate_scoring_function=fsFunction,number_of_features_to_retain=1,ensure_test_set_consistency=True)
		
		del indices_of_top_K_features #not currently interested
		###########
		
		f_out = open(r'%s\test_14_output_names.txt' % current_dir,'wb')
		try:
			for subset in ['TRAIN','TEST']:
				f_out.write('%s=%s\r\n' % (subset,os.path.relpath(subset2fsFile[subset],start=current_dir)))
		finally:
			f_out.close()
			del f_out
		
		self.compareAllExpectedAndActualFiles(current_dir)
		
		self.clean_up_if_all_checks_passed(current_dir,specific_files_not_to_delete=do_not_to_delete)
Example #4
0
	def test_univariate_training_set_based_feature_selection_for_raw_fp_representations_of_training_and_test_set_for_a_REGRESSION_dataset(self):
		###############################
		#17/03/13:
		#<N.B.: Using exactly the same input files as per test_4.>
		#<N.B.: For first run, did not clean up output files (which were copied to give the file copies to compare with in later test runs) and turned off comparison to file copies.>
		###############################
		##############################
		print 'Running unittests for this project: ', project_name
		print 'Running this unittest: ', self._testMethodName
		##################################
		
		#Note to self: BELOW taken verbatim from ..\trial_runs\...\trial_run_fs_2.py
		#from descriptor_utils import descriptorsGenerator,descriptorsFilesProcessor #Note to self: replaced with the next line.
		from ml_input_utils import descriptorsFilesProcessor
		from ml_functions import filter_features_for_svmlight_format_files,f_regression
		
		id2TrainYValue = {'mA':1.8,'mB':1.8,'mC':0.1,'mD':1.8,'mE':0.1,'mG':1.8,'mF':0.1,'mH':1.8} #17/03/13: trying to make sure f2 has a perfect correlation with y-values in training set bu no such correlation [due to constant y-values] in the test set (see next line). <<DONE>:D.I.P.T.R>
		id2TestYValue = {'mX':1.0,'mY':1.0,'mZ':1.0}
		#Note to self: as ever, following file names need to be adjusted to make sure files in the directory of this test code Python file are parsed.
		train_fp_file = r'%s\contrived_fp_train_file.txt' % "\\".join(os.path.abspath(__file__).split('\\')[:-1])
		test_fp_file = r'%s\contrived_fp_test_file.txt' % "\\".join(os.path.abspath(__file__).split('\\')[:-1])
		all_feats_svmlight_train_file = r'%s\contrived_svmlight_train_file.txt' % "\\".join(os.path.abspath(__file__).split('\\')[:-1])
		all_feats_svmlight_test_file = r'%s\contrived_svmlight_test_file.txt' % "\\".join(os.path.abspath(__file__).split('\\')[:-1])
		
		
		print 'Preparing original files (pre-feature selection) in svmlight format.'
		
		
		our_descriptorsFilesProcessor = descriptorsFilesProcessor()
		
		record_of_all_feat2IndexFiles = [None]
		
		for TRAIN_OR_TEST_LABEL in ['Train','Test']:
			if 'Train' == TRAIN_OR_TEST_LABEL:
				id2class = id2TrainYValue
				fp_file = train_fp_file
				all_feats_svmlight_file =  all_feats_svmlight_train_file
			else:
				assert 'Test' == TRAIN_OR_TEST_LABEL
				id2class = id2TestYValue
				fp_file = test_fp_file
				all_feats_svmlight_file = all_feats_svmlight_test_file
			
			record_of_all_feat2IndexFiles = our_descriptorsFilesProcessor.write_svmlight_format_modellingFile_from_multiple_descriptors_files(list_of_descriptors_files=[fp_file],corresponding_list_of_whether_descriptors_file_is_actually_a_raw_fp_file=[True],corresponding_list_of_whether_descriptors_file_is_actually_a_jCompoundMapperStringFeatures_file=[False],descriptors_file_name=all_feats_svmlight_file,id2responseVariable=id2class,corresponding_list_of_unique_features_files=record_of_all_feat2IndexFiles)
		
		del our_descriptorsFilesProcessor
		
		print 'PREPARED original files (pre-feature selection) in svmlight format.'
		
		######
		#<10/10/12::16:45: N.B.: INSPECTION OF ABOVE OUTPUT =>  our_descriptorsFilesProcessor.write_svmlight_format_modellingFile_from_multiple_descriptors_files(...) WORKS!>
		######
		
		filter_features_for_svmlight_format_files(svmlight_format_train_file=all_feats_svmlight_train_file,svmlight_format_test_file=all_feats_svmlight_test_file,univariate_scoring_function=f_regression,number_of_features_to_retain=2)
		
		#Note to self: ABOVE taken verbatim from ..\trial_runs\...\trial_run_fs_2.py
		
		filtered_feats_svmlight_train_file = r'%s\contrived_svmlight_train_file_fs_f_regression_top_2.txt' % "\\".join(os.path.abspath(__file__).split('\\')[:-1]) 
		filtered_feats_svmlight_test_file = r'%s\contrived_svmlight_test_file_fs_f_regression_top_2.txt' % "\\".join(os.path.abspath(__file__).split('\\')[:-1]) 
		
		
		#23/03/13: commented out below for first trial runs and then, when output looked as expected<ok>, copied output and re-ran test with the following uncommented:
		all_input_files_required_for_unittesting = [train_fp_file,test_fp_file]
		
		all_orig_output_files_to_be_compared_as_required_for_unittesting = []
		for new_file in [all_feats_svmlight_train_file,all_feats_svmlight_test_file,filtered_feats_svmlight_train_file,filtered_feats_svmlight_test_file]:
			file_ext = new_file.split('.')[-1]
			orig_file = re.sub('(\.%s$)' % file_ext,' - Copy.%s' % file_ext,new_file)
			all_orig_output_files_to_be_compared_as_required_for_unittesting.append(orig_file)
			self.compareOriginalAndNewFiles(orig_file,new_file)
		
		files_not_to_delete = all_input_files_required_for_unittesting+all_orig_output_files_to_be_compared_as_required_for_unittesting
		self.clean_up_if_all_checks_passed(specific_files_not_to_delete=files_not_to_delete)