def __init__(self, balancer=None, normalizer_list=None,
             dimension_reduction_list=None, feature_selector_list=None,
             feature_selector_num_list=None, classifier_list=None,
             cross_validation=None, is_hyper_parameter=False, logger=None):
    """Store the pipeline components and create the empty result tables.

    Every list parameter defaults to ``None`` and is resolved to a fresh
    object on each call. Mutable objects used directly as default values
    are created once, when the function is defined, and would then be
    shared by every instance built with the defaults.

    Args:
        balancer: Stored unchanged as ``self.balance``.
        normalizer_list: Normalizers to try; ``None`` means
            ``[NormalizerNone]`` (the name itself, it is not called here).
        dimension_reduction_list: Dimension reducers to try; ``None``
            means a list holding one new ``DimensionReductionByPCC()``.
        feature_selector_list: Feature selectors to try; ``None`` means
            an empty list.
        feature_selector_num_list: Candidate feature counts; ``None``
            means an empty list.
        classifier_list: Classifiers to try; ``None`` means an empty list.
        cross_validation: Stored unchanged as ``self.cv``.
        is_hyper_parameter: Stored unchanged as ``self.is_hyper_parameter``.
        logger: Stored unchanged on the private ``__logger`` attribute.
    """
    self.balance = balancer
    self.normalizer_list = (
        [NormalizerNone] if normalizer_list is None else normalizer_list)
    self.dimension_reduction_list = (
        [DimensionReductionByPCC()]
        if dimension_reduction_list is None else dimension_reduction_list)
    self.feature_selector_list = (
        [] if feature_selector_list is None else feature_selector_list)
    self.feature_selector_num_list = (
        [] if feature_selector_num_list is None
        else feature_selector_num_list)
    self.classifier_list = [] if classifier_list is None else classifier_list
    self.cv = cross_validation
    self.is_hyper_parameter = is_hyper_parameter
    self.__logger = logger
    self.version = VERSION

    # One independent, empty table per data split, all with HEADER columns.
    self.total_metric = {
        split: pd.DataFrame(columns=HEADER)
        for split in (TRAIN, BALANCE_TRAIN, TEST, CV_TRAIN, CV_VAL)
    }

    # Called last: every list attribute is already in place at this point.
    self.GenerateAucDict()
def Run(self, train_data_container, test_data_container=None,
        store_folder='', is_hyper_parameter=False):
    """Run every configured pipeline combination as a generator.

    The combinations are the product of normalizers, dimension reducers,
    feature selectors, classifiers and feature numbers (nested in that
    order). For each one a progress tuple is yielded *first*; the pipeline
    for that combination is built and run only when the consumer asks for
    the next item.

    For each combination the AUC and AUC std of every split are written
    into ``self.__auc_matrix_dict`` / ``self.__auc_std_matrix_dict``. When
    ``store_folder`` is an existing directory, one row per combination is
    added to ``train_result.csv``, ``all_train_result.csv``,
    ``val_result.csv`` and (with a non-empty test set)
    ``test_result.csv``; each file is rewritten in full every time.

    Side effect: an empty normalizer or dimension-reduction list on
    ``self`` is replaced by a one-element default list.

    Args:
        train_data_container: Training data handed to each pipeline.
        test_data_container: Optional test data. ``None`` (the default)
            is replaced by a new empty ``DataContainer()`` per call.
        store_folder: Folder that receives the result files.
        is_hyper_parameter: Passed through to ``OnePipeline.Run``.

    Yields:
        tuple: ``(normalizer name, dimension reduction name, feature
        selector name, feature_num, classifier name, num, total_num)``
        where ``num`` is the 1-based running count.
    """
    if test_data_container is None:
        # Replaces a `DataContainer()` default argument, which would be
        # built once at definition time and shared by all calls.
        test_data_container = DataContainer()

    column_list = [
        'sample_number', 'positive_number', 'negative_number', 'auc',
        'auc 95% CIs', 'auc std', 'accuracy', 'Youden Index', 'sensitivity',
        'specificity', 'positive predictive value',
        'negative predictive value'
    ]
    train_df = pd.DataFrame(columns=column_list)
    val_df = pd.DataFrame(columns=column_list)
    test_df = pd.DataFrame(columns=column_list)
    all_train_df = pd.DataFrame(columns=column_list)

    if self.__normalizer_list == []:
        self.__normalizer_list = [NormalizerNone()]
    if self._dimension_reduction_list == []:
        self._dimension_reduction_list = [DimensionReductionByPCC()]

    self.GenerateMetircDict()
    self.SavePipelineInfo(store_folder)

    num = 0
    total_num = (len(self.__normalizer_list)
                 * len(self._dimension_reduction_list)
                 * len(self.__feature_selector_list)
                 * len(self.__classifier_list)
                 * len(self.__feature_selector_num_list))

    for normalizer_index, normalizer in enumerate(self.__normalizer_list):
        for dimension_reductor_index, dimension_reductor in enumerate(
                self._dimension_reduction_list):
            for feature_selector_index, feature_selector in enumerate(
                    self.__feature_selector_list):
                for classifier_index, classifier in enumerate(
                        self.__classifier_list):
                    for feature_num_index, feature_num in enumerate(
                            self.__feature_selector_num_list):
                        num += 1
                        yield (normalizer.GetName(),
                               dimension_reductor.GetName(),
                               feature_selector.GetName(), feature_num,
                               classifier.GetName(), num, total_num)

                        feature_selector.SetSelectedFeatureNumber(
                            feature_num)
                        one_pipeline = OnePipeline(
                            normalizer=normalizer,
                            dimension_reduction=dimension_reductor,
                            feature_selector=feature_selector,
                            classifier=classifier,
                            cross_validation=self.__cross_validation)
                        case_name = one_pipeline.GetStoreName()
                        case_store_folder = os.path.join(
                            store_folder, case_name)

                        (train_cv_metric, val_cv_metric, test_metric,
                         all_train_metric) = one_pipeline.Run(
                             train_data_container, test_data_container,
                             case_store_folder, is_hyper_parameter)

                        # Position of this combination in the 5-D matrices.
                        matrix_index = (normalizer_index,
                                        dimension_reductor_index,
                                        feature_selector_index,
                                        feature_num_index, classifier_index)

                        # (split prefix, metric dict, table, csv file name)
                        results = [
                            ('train', train_cv_metric, train_df,
                             'train_result.csv'),
                            ('all_train', all_train_metric, all_train_df,
                             'all_train_result.csv'),
                            ('val', val_cv_metric, val_df,
                             'val_result.csv'),
                        ]
                        # NOTE(review): the copy this was restored from had
                        # lost its indentation, so whether the test-set AUC
                        # was recorded only under a valid store_folder could
                        # not be determined. Here it is recorded whenever a
                        # test set exists, like the other splits; only the
                        # CSV export depends on store_folder. Please confirm.
                        if not test_data_container.IsEmpty():
                            results.append(('test', test_metric, test_df,
                                            'test_result.csv'))

                        for split, metric, _, _ in results:
                            self.__auc_matrix_dict[split][matrix_index] = \
                                metric[split + '_auc']
                            self.__auc_std_matrix_dict[split][
                                matrix_index] = metric[split + '_auc std']

                        if store_folder and os.path.isdir(store_folder):
                            for split, metric, frame, file_name in results:
                                frame.loc[case_name] = [
                                    metric[split + '_' + column]
                                    for column in column_list
                                ]
                                frame.to_csv(
                                    os.path.join(store_folder, file_name))

    # NOTE(review): placed after all loops (runs once, when the generator is
    # exhausted); the original nesting level was not recoverable -- confirm.
    self.SaveMetricDict(store_folder)
# NOTE(review): the next two statements are the tail of a method whose header
# lies outside this excerpt. They are kept unchanged; their indentation (loop
# body, then method-level return) is presumed -- confirm against the full file.
            input_data_container = output
        return output

    def SaveInfo(self, store_folder, all_features):
        """Forward ``SaveInfo`` to every selector in ``self.__selector_list``.

        Args:
            store_folder: Passed unchanged to each selector's ``SaveInfo``.
            all_features: Passed unchanged to each selector's ``SaveInfo``.
        """
        for fs in self.__selector_list:
            fs.SaveInfo(store_folder, all_features)

    def SaveDataContainer(self, data_container, store_folder, store_key):
        """Forward ``SaveDataContainer`` to every selector in ``self.__selector_list``.

        Every selector receives the same three arguments, in list order.

        Args:
            data_container: Passed unchanged to each selector.
            store_folder: Passed unchanged to each selector.
            store_key: Passed unchanged to each selector.
        """
        for fs in self.__selector_list:
            fs.SaveDataContainer(data_container, store_folder, store_key)

################################################################
if __name__ == '__main__':
    # Ad-hoc check, executed only when this module is run as a script.
    from FAE.DataContainer.DataContainer import DataContainer
    from FAE.FeatureAnalysis.Normalizer import NormalizerZeroCenter
    from FAE.FeatureAnalysis.DimensionReduction import DimensionReductionByPCC

    dc = DataContainer()
    pcc = DimensionReductionByPCC()
    fs = FeatureSelectByKruskalWallis(selected_feature_number=5)

    # The relative path uses Windows separators and depends on the current
    # working directory.
    dc.Load(r'..\..\Demo\train_numeric_feature.csv')

    # Normalize, reduce dimension, then print the array shape before and
    # after feature selection.
    dc = NormalizerZeroCenter.Run(dc)
    dc = pcc.Run(dc)
    print(dc.GetArray().shape)
    dc = fs.Run(dc)
    print(dc.GetArray().shape)
def Run(self, train_data_container, test_data_container=None,
        store_folder='', is_hyper_parameter=False):
    """Run every configured pipeline combination as a generator.

    Loops nest as normalizer -> dimension reducer -> feature selector ->
    feature number -> classifier. Each stage is computed at the outermost
    level that determines it: normalization once per normalizer, dimension
    reduction once per (normalizer, reducer), and feature selection once
    per (normalizer, reducer, selector, feature number).

    For each classifier a progress tuple is yielded *first*. The rest of
    the work for that combination happens when the consumer asks for the
    next item:

    * the case folder is created if missing, and the intermediate train
      (and, if present, test) data of every stage are saved into it;
    * ``self.__cross_validation.Run`` is executed and the pipeline
      description is written to ``pipeline_info.csv``;
    * AUC and AUC std of each split are written into
      ``self.__auc_matrix_dict`` / ``self.__auc_std_matrix_dict``;
    * when ``store_folder`` is an existing directory, one row is added to
      ``train_result.csv``, ``all_train_result.csv``, ``val_result.csv``
      and (with a non-empty test set) ``test_result.csv``; each file is
      rewritten in full every time.

    Side effect: an empty normalizer or dimension-reduction list on
    ``self`` is replaced by a one-element default list.

    Args:
        train_data_container: Training data fed to the first stage.
        test_data_container: Optional test data. ``None`` (the default)
            is replaced by a new empty ``DataContainer()`` per call.
        store_folder: Parent folder for case folders and result files.
        is_hyper_parameter: Passed through to the cross-validation run.

    Yields:
        tuple: ``(normalizer name, dimension reduction name, feature
        selector name, feature_num, classifier name, num, total_num)``
        where ``num`` is the 1-based running count.
    """
    if test_data_container is None:
        # Replaces a `DataContainer()` default argument, which would be
        # built once at definition time and shared by all calls.
        test_data_container = DataContainer()

    column_list = [
        'sample_number', 'positive_number', 'negative_number', 'auc',
        'auc 95% CIs', 'auc std', 'accuracy', 'Youden Index', 'sensitivity',
        'specificity', 'positive predictive value',
        'negative predictive value'
    ]
    train_df = pd.DataFrame(columns=column_list)
    val_df = pd.DataFrame(columns=column_list)
    test_df = pd.DataFrame(columns=column_list)
    all_train_df = pd.DataFrame(columns=column_list)

    if self.__normalizer_list == []:
        self.__normalizer_list = [NormalizerNone()]
    if self._dimension_reduction_list == []:
        self._dimension_reduction_list = [DimensionReductionByPCC()]

    self.GenerateMetircDict()
    self.SavePipelineInfo(store_folder)

    num = 0
    total_num = (len(self.__normalizer_list)
                 * len(self._dimension_reduction_list)
                 * len(self.__feature_selector_list)
                 * len(self.__classifier_list)
                 * len(self.__feature_selector_num_list))

    for normalizer_index, normalizer in enumerate(self.__normalizer_list):
        # Stage 1: normalization.
        normalized_train_data_container = normalizer.Run(
            train_data_container)
        if not test_data_container.IsEmpty():
            normalized_test_data_container = normalizer.Run(
                test_data_container, is_test=True)
        else:
            normalized_test_data_container = test_data_container

        for dimension_reductor_index, dimension_reductor in enumerate(
                self._dimension_reduction_list):
            # Stage 2: dimension reduction.
            # NOTE(review): GetName()/SaveInfo() are called on this object
            # further down regardless of this truthiness test.
            if dimension_reductor:
                dr_train_data_container = dimension_reductor.Run(
                    normalized_train_data_container)
                if not test_data_container.IsEmpty():
                    dr_test_data_container = dimension_reductor.Transform(
                        normalized_test_data_container)
                else:
                    dr_test_data_container = normalized_test_data_container
            else:
                dr_train_data_container = normalized_train_data_container
                dr_test_data_container = normalized_test_data_container

            for feature_selector_index, feature_selector in enumerate(
                    self.__feature_selector_list):
                for feature_num_index, feature_num in enumerate(
                        self.__feature_selector_num_list):
                    # Stage 3: feature selection.
                    # NOTE(review): the method call below precedes the
                    # truthiness test on the same object.
                    feature_selector.SetSelectedFeatureNumber(feature_num)
                    if feature_selector:
                        fs_train_data_container = feature_selector.Run(
                            dr_train_data_container)
                        if not test_data_container.IsEmpty():
                            # The test set keeps exactly the features that
                            # were selected on the training set.
                            selected_feature_name = \
                                fs_train_data_container.GetFeatureName()
                            fs = FeatureSelector()
                            fs_test_data_container = fs.SelectFeatureByName(
                                dr_test_data_container,
                                selected_feature_name)
                        else:
                            fs_test_data_container = dr_test_data_container
                    else:
                        fs_train_data_container = dr_train_data_container
                        fs_test_data_container = dr_test_data_container

                    for classifier_index, classifier in enumerate(
                            self.__classifier_list):
                        self.__cross_validation.SetClassifier(classifier)

                        num += 1
                        yield (normalizer.GetName(),
                               dimension_reductor.GetName(),
                               feature_selector.GetName(), feature_num,
                               classifier.GetName(), num, total_num)

                        case_name = self.GetStoreName(
                            normalizer.GetName(),
                            dimension_reductor.GetName(),
                            feature_selector.GetName(), str(feature_num),
                            classifier.GetName())
                        case_store_folder = os.path.join(
                            store_folder, case_name)
                        if not os.path.exists(case_store_folder):
                            os.mkdir(case_store_folder)

                        # Save the intermediate data of every stage.
                        normalizer.SaveInfo(
                            case_store_folder,
                            normalized_train_data_container.GetFeatureName())
                        normalizer.SaveNormalDataContainer(
                            normalized_train_data_container,
                            case_store_folder, is_test=False)
                        dimension_reductor.SaveInfo(case_store_folder)
                        dimension_reductor.SaveDataContainer(
                            dr_train_data_container, case_store_folder,
                            is_test=False)
                        feature_selector.SaveInfo(
                            case_store_folder,
                            dr_train_data_container.GetFeatureName())
                        feature_selector.SaveDataContainer(
                            fs_train_data_container, case_store_folder,
                            is_test=False)
                        if not test_data_container.IsEmpty():
                            normalizer.SaveNormalDataContainer(
                                normalized_test_data_container,
                                case_store_folder, is_test=True)
                            dimension_reductor.SaveDataContainer(
                                dr_test_data_container, case_store_folder,
                                is_test=True)
                            feature_selector.SaveDataContainer(
                                fs_test_data_container, case_store_folder,
                                is_test=True)

                        (train_cv_metric, val_cv_metric, test_metric,
                         all_train_metric) = self.__cross_validation.Run(
                             fs_train_data_container, fs_test_data_container,
                             case_store_folder, is_hyper_parameter)

                        self.SaveOnePipeline(
                            os.path.join(case_store_folder,
                                         'pipeline_info.csv'),
                            normalizer.GetName(),
                            dimension_reductor.GetName(),
                            feature_selector.GetName(), feature_num,
                            classifier.GetName(),
                            self.__cross_validation.GetName())

                        # Save Result
                        # Position of this combination in the 5-D matrices.
                        matrix_index = (normalizer_index,
                                        dimension_reductor_index,
                                        feature_selector_index,
                                        feature_num_index, classifier_index)

                        # (split prefix, metric dict, table, csv file name)
                        results = [
                            ('train', train_cv_metric, train_df,
                             'train_result.csv'),
                            ('all_train', all_train_metric, all_train_df,
                             'all_train_result.csv'),
                            ('val', val_cv_metric, val_df,
                             'val_result.csv'),
                        ]
                        # NOTE(review): the copy this was restored from had
                        # lost its indentation, so whether the test-set AUC
                        # was recorded only under a valid store_folder could
                        # not be determined. Here it is recorded whenever a
                        # test set exists, like the other splits; only the
                        # CSV export depends on store_folder. Please confirm.
                        if not test_data_container.IsEmpty():
                            results.append(('test', test_metric, test_df,
                                            'test_result.csv'))

                        for split, metric, _, _ in results:
                            self.__auc_matrix_dict[split][matrix_index] = \
                                metric[split + '_auc']
                            self.__auc_std_matrix_dict[split][
                                matrix_index] = metric[split + '_auc std']

                        if store_folder and os.path.isdir(store_folder):
                            for split, metric, frame, file_name in results:
                                frame.loc[case_name] = [
                                    metric[split + '_' + column]
                                    for column in column_list
                                ]
                                frame.to_csv(
                                    os.path.join(store_folder, file_name))

    # NOTE(review): placed after all loops (runs once, when the generator is
    # exhausted); the original nesting level was not recoverable -- confirm.
    self.SaveMetricDict(store_folder)