def test_default_training_set_based_feature_selection_for_raw_fingerprint_representations_of_training_and_test_set(self):
		##############################
		print 'Running unittests for this project: ', project_name
		print 'Running this unittest: ', self._testMethodName
		##################################
		
		from ml_input_utils import descriptorsFilesProcessor
		from ml_functions import filter_features_for_svmlight_format_files
		
		id2TrainClass = {'mA':1,'mB':1,'mC':0,'mD':1,'mE':0,'mG':0,'mF':0,'mH':0} #trying to make sure (in train_fp_file)  one feature (f1) is only found in class 1, not class 0, hence it should be selected, but that this feature (f1) is not found in the test set!
		id2TestClass = {'mX':1,'mY':1,'mZ':1}
		#Note to self: as ever, following file names need to be adjusted to make sure files in the directory of this test code Python file are parsed.
		train_fp_file = r'%s\contrived_fp_train_file.txt' % "\\".join(os.path.abspath(__file__).split('\\')[:-1])
		test_fp_file = r'%s\contrived_fp_test_file.txt' % "\\".join(os.path.abspath(__file__).split('\\')[:-1])
		all_feats_svmlight_train_file = r'%s\contrived_svmlight_train_file.txt' % "\\".join(os.path.abspath(__file__).split('\\')[:-1])
		all_feats_svmlight_test_file = r'%s\contrived_svmlight_test_file.txt' % "\\".join(os.path.abspath(__file__).split('\\')[:-1])
		
		
		print 'Preparing original files (pre-feature selection) in svmlight format.'
		
		
		our_descriptorsFilesProcessor = descriptorsFilesProcessor()
		
		record_of_all_feat2IndexFiles = [None]
		
		for TRAIN_OR_TEST_LABEL in ['Train','Test']:
			if 'Train' == TRAIN_OR_TEST_LABEL:
				id2class = id2TrainClass
				fp_file = train_fp_file
				all_feats_svmlight_file =  all_feats_svmlight_train_file
			else:
				assert 'Test' == TRAIN_OR_TEST_LABEL
				id2class = id2TestClass
				fp_file = test_fp_file
				all_feats_svmlight_file = all_feats_svmlight_test_file
			
			record_of_all_feat2IndexFiles = our_descriptorsFilesProcessor.write_svmlight_format_modellingFile_from_multiple_descriptors_files(list_of_descriptors_files=[fp_file],corresponding_list_of_whether_descriptors_file_is_actually_a_raw_fp_file=[True],corresponding_list_of_whether_descriptors_file_is_actually_a_jCompoundMapperStringFeatures_file=[False],descriptors_file_name=all_feats_svmlight_file,id2responseVariable=id2class,corresponding_list_of_unique_features_files=record_of_all_feat2IndexFiles)
		
		del our_descriptorsFilesProcessor
		
		print 'PREPARED original files (pre-feature selection) in svmlight format.'
		
		filter_features_for_svmlight_format_files(svmlight_format_train_file=all_feats_svmlight_train_file,svmlight_format_test_file=all_feats_svmlight_test_file,number_of_features_to_retain=2)
		
		filtered_feats_svmlight_train_file = r'%s\contrived_svmlight_train_file_fs_chi2_top_2.txt' % "\\".join(os.path.abspath(__file__).split('\\')[:-1])
		filtered_feats_svmlight_test_file = r'%s\contrived_svmlight_test_file_fs_chi2_top_2.txt' % "\\".join(os.path.abspath(__file__).split('\\')[:-1])
		training_set_feature_name_to_feature_index_file = r'%s\contrived_fp_train_file_fpFeat2InitialIndex.csv' % "\\".join(os.path.abspath(__file__).split('\\')[:-1])
		
		
		all_input_files_required_for_unittesting = [train_fp_file,test_fp_file]
		
		all_orig_output_files_to_be_compared_as_required_for_unittesting = []
		for new_file in [all_feats_svmlight_train_file,all_feats_svmlight_test_file,filtered_feats_svmlight_train_file,filtered_feats_svmlight_test_file,training_set_feature_name_to_feature_index_file]:
			file_ext = new_file.split('.')[-1]
			orig_file = re.sub('(\.%s$)' % file_ext,' - Copy.%s' % file_ext,new_file)
			all_orig_output_files_to_be_compared_as_required_for_unittesting.append(orig_file)
			self.compareOriginalAndNewFiles(orig_file,new_file)
		
		files_not_to_delete = all_input_files_required_for_unittesting+all_orig_output_files_to_be_compared_as_required_for_unittesting
		self.clean_up_if_all_checks_passed(specific_files_not_to_delete=files_not_to_delete)
	def test_15_convert_svmlight_to_csv(self):
		##############################
		print 'Running unittests for this project: ', project_name
		print 'Running this unittest: ', self._testMethodName
		##################################
		
		do_not_to_delete = glob.glob(r'%s\*' % current_dir)
		
		import ml_input_utils
		
		###########
		#c.f. generate_modelling_input.py:
		
		descriptorsFilesProcessorInstance = ml_input_utils.descriptorsFilesProcessor()
		
		for svmlight_file in [r'%s\test15_t14copied_svmlight_REG_train_file_nonDefault_fs_f_regression_top_1.txt' % current_dir,r'%s\test15_t14copied_svmlight_REG_test_file_nonDefault_fs_f_regression_top_1.txt' % current_dir]:
			
			csv_file = re.sub('(\.txt$)','.csv',svmlight_file)
			
			descriptorsFilesProcessorInstance.convert_svmlight_to_csv(svmlight_file,csv_file)
		###########
		
		self.compareAllExpectedAndActualFiles(current_dir)
		
		self.clean_up_if_all_checks_passed(current_dir,specific_files_not_to_delete=do_not_to_delete)
    def test_convert_svmlight_to_csv(self):
        ##############################
        print "Running unittests for this project: ", project_name
        print "Running this unittest: ", self._testMethodName
        ##################################

        # Note to self: BELOW copied verbatim from trial_runs\descriptor_utils\..\test_svmlight_2_csv.py
        # from descriptor_utils import descriptorsFilesProcessor #Note to self - replaced this with following line
        from ml_input_utils import descriptorsFilesProcessor

        svmlight_file = r"%s\contrived_svmlight_train_file_fs_chi2_top_2.txt" % "\\".join(
            os.path.abspath(__file__).split("\\")[:-1]
        )

        our_descriptorsFilesProcessor = descriptorsFilesProcessor()

        our_descriptorsFilesProcessor.convert_svmlight_to_csv(svmlight_file)

        del our_descriptorsFilesProcessor
        # Note to self: ABOVE copied verbatim from trial_runs\descriptor_utils\..\test_svmlight_2_csv.py
        del descriptorsFilesProcessor

        csv_file = r"%s\contrived_svmlight_train_file_fs_chi2_top_2.csv" % "\\".join(
            os.path.abspath(__file__).split("\\")[:-1]
        )

        all_input_files_required_for_unittesting = [svmlight_file]

        all_orig_output_files_to_be_compared_as_required_for_unittesting = []
        for new_file in [csv_file]:
            file_ext = new_file.split(".")[-1]
            orig_file = re.sub("(\.%s$)" % file_ext, " - Copy.%s" % file_ext, new_file)
            all_orig_output_files_to_be_compared_as_required_for_unittesting.append(orig_file)
            self.compareOriginalAndNewFiles(orig_file, new_file)

        files_not_to_delete = (
            all_input_files_required_for_unittesting + all_orig_output_files_to_be_compared_as_required_for_unittesting
        )
        self.clean_up_if_all_checks_passed(specific_files_not_to_delete=files_not_to_delete)
Exemple #4
0
    def test_convert_svmlight_to_csv(self):
        ##############################
        print 'Running unittests for this project: ', project_name
        print 'Running this unittest: ', self._testMethodName
        ##################################

        #Note to self: BELOW copied verbatim from trial_runs\descriptor_utils\..\test_svmlight_2_csv.py
        #from descriptor_utils import descriptorsFilesProcessor #Note to self - replaced this with following line
        from ml_input_utils import descriptorsFilesProcessor

        svmlight_file = r'%s\contrived_svmlight_train_file_fs_chi2_top_2.txt' % "\\".join(
            os.path.abspath(__file__).split('\\')[:-1])

        our_descriptorsFilesProcessor = descriptorsFilesProcessor()

        our_descriptorsFilesProcessor.convert_svmlight_to_csv(svmlight_file)

        del our_descriptorsFilesProcessor
        #Note to self: ABOVE copied verbatim from trial_runs\descriptor_utils\..\test_svmlight_2_csv.py
        del descriptorsFilesProcessor

        csv_file = r'%s\contrived_svmlight_train_file_fs_chi2_top_2.csv' % "\\".join(
            os.path.abspath(__file__).split('\\')[:-1])

        all_input_files_required_for_unittesting = [svmlight_file]

        all_orig_output_files_to_be_compared_as_required_for_unittesting = []
        for new_file in [csv_file]:
            file_ext = new_file.split('.')[-1]
            orig_file = re.sub('(\.%s$)' % file_ext, ' - Copy.%s' % file_ext,
                               new_file)
            all_orig_output_files_to_be_compared_as_required_for_unittesting.append(
                orig_file)
            self.compareOriginalAndNewFiles(orig_file, new_file)

        files_not_to_delete = all_input_files_required_for_unittesting + all_orig_output_files_to_be_compared_as_required_for_unittesting
        self.clean_up_if_all_checks_passed(
            specific_files_not_to_delete=files_not_to_delete)
Exemple #5
0
	def test_univariate_training_set_based_feature_selection_for_raw_fp_representations_of_training_and_test_set_for_a_REGRESSION_dataset(self):
		###############################
		#17/03/13:
		#<N.B.: Using exactly the same input files as per test_4.>
		#<N.B.: For first run, did not clean up output files (which were copied to give the file copies to compare with in later test runs) and turned off comparison to file copies.>
		###############################
		##############################
		print 'Running unittests for this project: ', project_name
		print 'Running this unittest: ', self._testMethodName
		##################################
		
		#Note to self: BELOW taken verbatim from ..\trial_runs\...\trial_run_fs_2.py
		#from descriptor_utils import descriptorsGenerator,descriptorsFilesProcessor #Note to self: replaced with the next line.
		from ml_input_utils import descriptorsFilesProcessor
		from ml_functions import filter_features_for_svmlight_format_files,f_regression
		
		id2TrainYValue = {'mA':1.8,'mB':1.8,'mC':0.1,'mD':1.8,'mE':0.1,'mG':1.8,'mF':0.1,'mH':1.8} #17/03/13: trying to make sure f2 has a perfect correlation with y-values in training set bu no such correlation [due to constant y-values] in the test set (see next line). <<DONE>:D.I.P.T.R>
		id2TestYValue = {'mX':1.0,'mY':1.0,'mZ':1.0}
		#Note to self: as ever, following file names need to be adjusted to make sure files in the directory of this test code Python file are parsed.
		train_fp_file = r'%s\contrived_fp_train_file.txt' % "\\".join(os.path.abspath(__file__).split('\\')[:-1])
		test_fp_file = r'%s\contrived_fp_test_file.txt' % "\\".join(os.path.abspath(__file__).split('\\')[:-1])
		all_feats_svmlight_train_file = r'%s\contrived_svmlight_train_file.txt' % "\\".join(os.path.abspath(__file__).split('\\')[:-1])
		all_feats_svmlight_test_file = r'%s\contrived_svmlight_test_file.txt' % "\\".join(os.path.abspath(__file__).split('\\')[:-1])
		
		
		print 'Preparing original files (pre-feature selection) in svmlight format.'
		
		
		our_descriptorsFilesProcessor = descriptorsFilesProcessor()
		
		record_of_all_feat2IndexFiles = [None]
		
		for TRAIN_OR_TEST_LABEL in ['Train','Test']:
			if 'Train' == TRAIN_OR_TEST_LABEL:
				id2class = id2TrainYValue
				fp_file = train_fp_file
				all_feats_svmlight_file =  all_feats_svmlight_train_file
			else:
				assert 'Test' == TRAIN_OR_TEST_LABEL
				id2class = id2TestYValue
				fp_file = test_fp_file
				all_feats_svmlight_file = all_feats_svmlight_test_file
			
			record_of_all_feat2IndexFiles = our_descriptorsFilesProcessor.write_svmlight_format_modellingFile_from_multiple_descriptors_files(list_of_descriptors_files=[fp_file],corresponding_list_of_whether_descriptors_file_is_actually_a_raw_fp_file=[True],corresponding_list_of_whether_descriptors_file_is_actually_a_jCompoundMapperStringFeatures_file=[False],descriptors_file_name=all_feats_svmlight_file,id2responseVariable=id2class,corresponding_list_of_unique_features_files=record_of_all_feat2IndexFiles)
		
		del our_descriptorsFilesProcessor
		
		print 'PREPARED original files (pre-feature selection) in svmlight format.'
		
		######
		#<10/10/12::16:45: N.B.: INSPECTION OF ABOVE OUTPUT =>  our_descriptorsFilesProcessor.write_svmlight_format_modellingFile_from_multiple_descriptors_files(...) WORKS!>
		######
		
		filter_features_for_svmlight_format_files(svmlight_format_train_file=all_feats_svmlight_train_file,svmlight_format_test_file=all_feats_svmlight_test_file,univariate_scoring_function=f_regression,number_of_features_to_retain=2)
		
		#Note to self: ABOVE taken verbatim from ..\trial_runs\...\trial_run_fs_2.py
		
		filtered_feats_svmlight_train_file = r'%s\contrived_svmlight_train_file_fs_f_regression_top_2.txt' % "\\".join(os.path.abspath(__file__).split('\\')[:-1]) 
		filtered_feats_svmlight_test_file = r'%s\contrived_svmlight_test_file_fs_f_regression_top_2.txt' % "\\".join(os.path.abspath(__file__).split('\\')[:-1]) 
		
		
		#23/03/13: commented out below for first trial runs and then, when output looked as expected<ok>, copied output and re-ran test with the following uncommented:
		all_input_files_required_for_unittesting = [train_fp_file,test_fp_file]
		
		all_orig_output_files_to_be_compared_as_required_for_unittesting = []
		for new_file in [all_feats_svmlight_train_file,all_feats_svmlight_test_file,filtered_feats_svmlight_train_file,filtered_feats_svmlight_test_file]:
			file_ext = new_file.split('.')[-1]
			orig_file = re.sub('(\.%s$)' % file_ext,' - Copy.%s' % file_ext,new_file)
			all_orig_output_files_to_be_compared_as_required_for_unittesting.append(orig_file)
			self.compareOriginalAndNewFiles(orig_file,new_file)
		
		files_not_to_delete = all_input_files_required_for_unittesting+all_orig_output_files_to_be_compared_as_required_for_unittesting
		self.clean_up_if_all_checks_passed(specific_files_not_to_delete=files_not_to_delete)