def RunByTestingReference(self, data_container, testing_ref_data_container, store_folder=''):
    """Split data_container into train/test splits using a reference container.

    Cases whose names appear in testing_ref_data_container form the testing
    split; every remaining case forms the training split.

    :param data_container: DataContainer holding all cases.
    :param testing_ref_data_container: DataContainer whose case names define the testing split.
    :param store_folder: optional folder; when given, both splits are saved as CSV.
    :return: (train_data_container, test_data_container). Two empty DataContainers
        are returned when the reference cases are not a subset of data_container.
    """
    training_index_list, testing_index_list = [], []
    all_name_list = data_container.GetCaseName()
    testing_name_list = testing_ref_data_container.GetCaseName()

    # Every reference case must exist in data_container, otherwise the split is undefined.
    testing_name_set = set(testing_name_list)
    if testing_name_set - set(all_name_list):
        print('The data container and the training data container are not consistent.')
        return DataContainer(), DataContainer()

    # enumerate + set membership instead of zip(range(len(...))) and O(n) list lookups.
    for index, name in enumerate(all_name_list):
        if name in testing_name_set:
            testing_index_list.append(index)
        else:
            training_index_list.append(index)

    train_data_container = self.__SetNewData(data_container, training_index_list)
    test_data_container = self.__SetNewData(data_container, testing_index_list)

    if store_folder:
        train_data_container.Save(os.path.join(store_folder, 'train_numeric_feature.csv'))
        test_data_container.Save(os.path.join(store_folder, 'test_numeric_feature.csv'))

    return train_data_container, test_data_container
def DrawFeatureRelationshipAccordingToCsvFile(file_path, selected_feature_name_list, label_name_list, store_path=''):
    '''
    Help draw the feature relationship among different features according to the path of the data container.

    :param file_path: the file path of the csv file
    :param selected_feature_name_list: the features that would be drawn (1 to 3 features)
    :param label_name_list: the name of the label. e.g. ['non-cancer', 'cancer']
    :param store_path: The store path, supporting jpg and eps format.
    :return:
    '''
    data_container = DataContainer()
    data_container.Load(file_path)
    data_container.UsualNormalize()
    data, label, feature_name, case_name = data_container.GetData()

    # Only 1-3 features can be visualized; abort instead of falling through
    # (the original printed the warning but drew anyway).
    if len(selected_feature_name_list) > 3 or len(selected_feature_name_list) < 1:
        print("Please check the length of the feature list. It can only show the relationship of the 1, 2, or 3 features")
        return

    try:
        index = [feature_name.index(t) for t in selected_feature_name_list]
    except ValueError:  # list.index raises ValueError for a missing feature name
        print('The selected feature is not in the data container.')
        return

    result_data = [data[:, sub_index] for sub_index in index]
    DrawValueRelationship(result_data, selected_feature_name_list, label, label_name_list, store_path)
def __init__(self, parts=30, repeat_times=100, test_ratio=0.3, random_seed=10):
    """Configure the feature-cluster based data splitter.

    :param parts: number of KMeans clusters to form over the features.
    :param repeat_times: how many random train/test splits are tried.
    :param test_ratio: fraction of cases assigned to the testing split.
    :param random_seed: seed used for the clustering step.
    """
    self.parts = parts
    self.repeat_times = repeat_times
    self.test_ratio = test_ratio
    self.random_seed = random_seed
    # Per-feature cluster labels; presumably filled by the clustering step -- confirm.
    self.feature_labels = []
    # Most recently processed container (empty until Run is called).
    self.current_dc = DataContainer()
def GenerateDescription():
    """Demo: build a report description from the example feature file and a stored pipeline."""
    train_dc = DataContainer()
    train_dc.Load(r'..\..\Example\numeric_feature.csv')

    pipeline = OnePipeline()
    pipeline.LoadPipeline(r'C:\MyCode\FAEGitHub\FAE\Example\report_temp\NormUnit_Cos_ANOVA_5_SVM\pipeline_info.csv')

    Description().Run(train_dc, pipeline,
                      r'..\..\Example\report_temp',
                      r'..\..\Example\report')
def OneHotOneColumn(self, data_container, feature_list):
    """Return a new DataContainer in which every feature in feature_list is one-hot encoded."""
    frame = data_container.GetFrame()
    known_features = data_container.GetFeatureName()
    # Fail fast on any requested feature that is not present in the container.
    for feature in feature_list:
        assert (feature in known_features)

    encoded = DataContainer()
    encoded.SetFrame(pd.get_dummies(frame, columns=feature_list))
    return encoded
def __SetNewData(self, data_container, case_index):
    """Build a DataContainer restricted to the rows listed in case_index."""
    array, label, feature_name, case_name = data_container.GetData()
    subset = DataContainer(array=array[case_index, :],
                           label=label[case_index],
                           case_name=[case_name[i] for i in case_index],
                           feature_name=feature_name)
    # Keep the internal frame consistent with the freshly-sliced arrays.
    subset.UpdateFrameByData()
    return subset
def _MergeClinical(self, dc, cli_df):
    """Left-join clinical features onto the container's frame by case index.

    Any 'label'/'Label' column in cli_df is excluded from the merge so the
    container's own label column stays authoritative.

    :param dc: DataContainer providing the radiomics frame.
    :param cli_df: pandas DataFrame of clinical features, indexed by case name.
    :return: a new DataContainer holding the merged frame.
    """
    # Drop the label column(s) from a *copy*; the original `del cli_df[...]`
    # mutated the caller's DataFrame in place.
    cli_df = cli_df.drop(columns=['label', 'Label'], errors='ignore')
    df = pd.merge(dc.GetFrame(), cli_df, how='left', left_index=True, right_index=True)
    merge_dc = DataContainer()
    merge_dc.SetFrame(df)
    merge_dc.UpdateFrameByData()
    return merge_dc
def Generate(self, data_container):
    """Yield (train, validation) DataContainer pairs, one per cross-validation split."""
    array, label = data_container.GetArray(), data_container.GetLabel()
    feature_name = data_container.GetFeatureName()
    case_name = data_container.GetCaseName()

    for train_index, val_index in self._cv.split(array, label):
        train_container = DataContainer(array=array[train_index, :],
                                        label=label[train_index],
                                        feature_name=feature_name,
                                        case_name=[case_name[i] for i in train_index])
        val_container = DataContainer(array=array[val_index, :],
                                      label=label[val_index],
                                      feature_name=feature_name,
                                      case_name=[case_name[i] for i in val_index])
        yield train_container, val_container
def Run(self, data_container, store_folder='', store_key=''):
    """Keep only non-object (numeric) columns of the container; optionally save the result."""
    numeric_frame = data_container.GetFrame().select_dtypes(include=None, exclude=['object'])
    result = DataContainer()
    result.SetFrame(numeric_frame)

    if store_folder and os.path.isdir(store_folder):
        result.Save(os.path.join(store_folder, 'numeric_feature.csv'))
        SaveSelectInfo(result.GetFeatureName(),
                       os.path.join(store_folder, 'feature_select_info.csv'),
                       is_merge=False)
    return result
def VisualizePartsVariance(self, dc: DataContainer, max_k=None, method='SSE', store_folder=None, is_show=True):
    # method must be one of SSE or SC. SSE denotes xxxx, SC denotes Silhouette Coefficient
    # NOTE(review): unfinished stub -- the scoring loop below computes nothing,
    # so the saved/shown figure is whatever is currently on the active
    # matplotlib figure. See the completed variant of this method elsewhere
    # in the project.
    # TODO: Normalize the train_data
    data = dc.GetArray().transpose()  # transposed so rows index features -- assumes GetArray is cases x features, confirm
    if max_k is None:
        max_k = min(data.shape[0], 50)  # cap the number of cluster counts to try
    assert (method in ['SSE', 'SC'])
    #TODO: plot
    score = []  # intended to collect one score per k; never filled yet
    for k in range(2, max_k):
        if method == 'SSE':
            pass
        elif method == 'SC':
            pass
    if store_folder and os.path.isdir(store_folder):
        plt.savefig(os.path.join(store_folder, 'ClusteringPlot.jpg'))
    if is_show:
        plt.show()
def VisualizePartsVariance(self, dc: DataContainer, max_k=None, method='SSE', store_folder=None, is_show=True):
    """Plot a clustering-quality curve (KMeans inertia or Silhouette Coefficient) for k = 2 .. max_k - 1."""
    # method must be one of SSE or SC. SSE denotes xxxx, SC denotes Silhouette Coefficient
    raw = dc.GetArray()  # get train data
    processed_data = self._DataPreProcess(dc)
    if max_k is None:
        max_k = min(raw.shape[0], 50)
    assert (method in ['SSE', 'SC'])

    candidate_ks = range(2, max_k)
    score = []
    for k in candidate_ks:
        print('make cluster k=', k)
        estimator = KMeans(n_clusters=k)
        estimator.fit(processed_data)
        if method == 'SSE':
            score.append(estimator.inertia_)
        else:  # method == 'SC', guaranteed by the assert above
            score.append(silhouette_score(processed_data, estimator.labels_, metric='euclidean'))

    plt.xlabel('k')
    plt.ylabel(method)
    plt.plot(candidate_ks, score, 'o-')
    if store_folder and os.path.isdir(store_folder):
        plt.savefig(os.path.join(store_folder, 'ClusteringParameterPlot.jpg'))
    if is_show:
        plt.show()
def Run(self, dc: DataContainer, output_folder: str, clinical_feature=None):
    """Cluster features, keep one representative per cluster, then search for the
    random train/test split whose features have the highest mean p-value (i.e.
    the most similar train/test distributions), and save that split.

    :param dc: DataContainer with all cases and features.
    :param output_folder: folder receiving train.csv / test.csv / split_info.csv.
    :param clinical_feature: optional clinical data -- a pandas DataFrame or a
        CSV path (first column used as the index); merged onto the selected features.
    """
    self.current_dc = dc
    # One representative feature name per cluster plus per-feature cluster labels.
    selected_feature_names, self.feature_labels = self._Cluster(dc)
    fs = FeatureSelector()
    selected_dc = fs.SelectFeatureByName(dc, selected_feature_names)
    if clinical_feature is not None:
        if isinstance(clinical_feature, str):
            # A string is treated as a CSV path; first column is the case index.
            clinical_feature = pd.read_csv(clinical_feature, index_col=0)
        assert (isinstance(clinical_feature, pd.DataFrame))
        merge_dc = self._MergeClinical(selected_dc, clinical_feature)
    else:
        merge_dc = selected_dc
    # Maps feature name -> 'Normal' / 'Non-normal' / 'Category'.
    feature_distribution_type = self._EstimateAllFeatureDistribution(merge_dc)  # a dict
    splitter = DataSeparate()
    output_train_dc, output_test_dc = DataContainer(), DataContainer()
    output_p_value = []
    mean_p_value = -1  # best mean p-value so far; any real mean beats -1
    for _ in range(self.repeat_times):
        train_dc, test_dc = splitter.RunByTestingPercentage(merge_dc, testing_data_percentage=self.test_ratio)
        feature_p_value = self._EstimateDcFeaturePvalue(train_dc, test_dc, feature_distribution_type)
        # Keep the split whose features differ least between train and test.
        if np.mean(list(feature_p_value.values())) > mean_p_value:
            mean_p_value = np.mean(list(feature_p_value.values()))
            output_train_dc, output_test_dc = train_dc, test_dc
            output_p_value = feature_p_value
    if output_folder is not None and os.path.isdir(output_folder):
        output_train_dc.Save(os.path.join(output_folder, 'train.csv'))
        output_test_dc.Save(os.path.join(output_folder, 'test.csv'))
        p_value_df = pd.DataFrame(output_p_value, index=['P Value'])
        distribute_df = pd.DataFrame(feature_distribution_type, index=['Distribution'])
        store_df = pd.concat((p_value_df, distribute_df), axis=0)
        store_df.to_csv(os.path.join(output_folder, 'split_info.csv'))
def Run(self, data_container, store_path=''):
    """Resample the container with self._model; resampled rows get synthetic case names."""
    data, label, feature_name, label_name = data_container.GetData()
    data_resampled, label_resampled = self._model.fit_sample(data, label)

    # Resampled rows lose their original identity, so name them Generate0, Generate1, ...
    case_names = ['Generate' + str(row) for row in range(data_resampled.shape[0])]
    resampled_container = DataContainer(data_resampled, label_resampled,
                                        data_container.GetFeatureName(), case_names)

    if store_path != '':
        if os.path.isdir(store_path):
            resampled_container.Save(os.path.join(store_path, '{}_features.csv'.format(self._name)))
        else:
            resampled_container.Save(store_path)
    return resampled_container
def Run(self, data_container, store_path=''):
    """Resample with self._model, mapping every resampled row back to an original case name."""
    data, label, feature_name, label_name = data_container.GetData()
    data_resampled, label_resampled = self._model.fit_sample(data, label)

    # Recover each resampled row's case name by matching its feature vector.
    case_names = [self.GetCaseNameFromAllCaseNames(data_container, data_resampled[row, :])
                  for row in range(data_resampled.shape[0])]
    resampled_container = DataContainer(data_resampled, label_resampled,
                                        data_container.GetFeatureName(), case_names)

    if store_path != '':
        if os.path.isdir(store_path):
            resampled_container.Save(os.path.join(store_path, '{}_features.csv'.format(self._name)))
        else:
            resampled_container.Save(store_path)
    return resampled_container
def Run(self, train_data_container, test_data_container=None, store_folder='', is_hyper_parameter=False):
    """Run the configured pipeline: normalize -> dimension reduction -> feature selection -> CV.

    :param train_data_container: DataContainer used to fit every step.
    :param test_data_container: optional DataContainer transformed with the fitted
        steps. Defaults to None -> a fresh empty container per call (the previous
        `=DataContainer()` default was a shared mutable default argument).
    :param store_folder: optional folder where each step and the pipeline info are saved.
    :param is_hyper_parameter: forwarded to the cross-validation step.
    :return: (train_cv_metric, val_cv_metric, test_metric, all_train_metric)
    """
    if test_data_container is None:
        test_data_container = DataContainer()

    # Work on copies so the caller's containers are never modified in place.
    raw_train_data_container = deepcopy(train_data_container)
    raw_test_data_container = deepcopy(test_data_container)

    if store_folder:
        if not os.path.exists(store_folder):
            os.mkdir(store_folder)

    if not (self.__cv and self.__classifier):
        print('Give CV method and classifier')

    if self.__normalizer:
        raw_train_data_container = self.__normalizer.Run(raw_train_data_container, store_folder)
        if not test_data_container.IsEmpty():
            raw_test_data_container = self.__normalizer.Run(raw_test_data_container, store_folder, is_test=True)

    if self.__dimension_reduction:
        raw_train_data_container = self.__dimension_reduction.Run(raw_train_data_container, store_folder)
        if not test_data_container.IsEmpty():
            raw_test_data_container = self.__dimension_reduction.Transform(raw_test_data_container)

    if self.__feature_selector:
        raw_train_data_container = self.__feature_selector.Run(raw_train_data_container, store_folder)
        if not test_data_container.IsEmpty():
            # The test set must be reduced to exactly the features chosen on the train set.
            selected_feature_name = raw_train_data_container.GetFeatureName()
            fs = FeatureSelector()
            raw_test_data_container = fs.SelectFeatureByName(raw_test_data_container, selected_feature_name)

    self.__cv.SetClassifier(self.__classifier)
    train_cv_metric, val_cv_metric, test_metric, all_train_metric = self.__cv.Run(
        raw_train_data_container, raw_test_data_container, store_folder, is_hyper_parameter)

    if store_folder:
        self.SavePipeline(len(raw_train_data_container.GetFeatureName()),
                          os.path.join(store_folder, 'pipeline_info.csv'))

    return train_cv_metric, val_cv_metric, test_metric, all_train_metric
def __init__(self, parent=None):
    """Set up the data-preparation widget: build the UI, create empty data holders, wire signals."""
    super(PrepareConnection, self).__init__(parent)
    self.setupUi(self)
    # The currently loaded feature table.
    self.data_container = DataContainer()
    # This module's file name -- presumably used for logging/identification; confirm.
    self._filename = os.path.split(__file__)[-1]
    self.buttonLoad.clicked.connect(self.LoadData)
    self.buttonRemoveAndExport.clicked.connect(self.RemoveInvalidValue)
    # Optional reference container that fixes which cases go to the testing split.
    self.__testing_ref_data_container = DataContainer()
    # Optional clinical reference table used when splitting.
    self.__clinical_ref = pd.DataFrame()
    # Toggle between random splitting and reference-based splitting.
    self.radioSplitRandom.clicked.connect(self.ChangeSeparateMethod)
    self.radioSplitRef.clicked.connect(self.ChangeSeparateMethod)
    self.checkUseClinicRef.clicked.connect(self.RandomSeparateButtonUpdates)
    self.loadTestingReference.clicked.connect(self.LoadTestingReferenceDataContainer)
    self.clearTestingReference.clicked.connect(self.ClearTestingReferenceDataContainer)
    self.loadClinicRef.clicked.connect(self.LoadClinicalRef)
    self.clearClinicRef.clicked.connect(self.ClearClinicalRef)
    self.buttonSave.clicked.connect(self.CheckAndSave)
def TestNewData(NewDataCsv, model_folder, result_save_path=''):
    '''
    Apply a trained model to a new radiomics feature matrix and report metrics.

    :param NewDataCsv: New radiomics feature matrix csv file path
    :param model_folder:The trained model path
    :param result_save_path: optional folder where predictions, labels and metrics are stored
    :return:classification result
    '''
    train_info = LoadTrainInfo(model_folder)

    # Load the new data and normalize it with the stored normalizer.
    new_data_container = DataContainer()
    new_data_container.Load(NewDataCsv)
    new_data_container = train_info['normalizer'].Transform(new_data_container)

    # Predict the probability of the positive class with the stored classifier.
    train_info['classifier'].SetDataContainer(new_data_container)
    model = train_info['classifier'].GetModel()
    predict = model.predict_proba(new_data_container.GetArray())[:, 1]

    label = new_data_container.GetLabel()
    case_name = new_data_container.GetCaseName()
    test_result_info = [['CaseName', 'Pred', 'Label']]
    for index in range(len(label)):
        test_result_info.append([case_name[index], predict[index], label[index]])

    metric = EstimateMetirc(predict, label)
    info = {}
    info.update(metric)

    cv = CrossValidation()
    print(metric)
    print('\t')

    if result_save_path:
        cv.SaveResult(info, result_save_path)
        np.save(os.path.join(result_save_path, 'test_predict.npy'), predict)
        np.save(os.path.join(result_save_path, 'test_label.npy'), label)
        with open(os.path.join(result_save_path, 'test_info.csv'), 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerows(test_result_info)

    return metric
feature_pvalue = self._EstimateDcFeaturePvalue( train_dc, test_dc, feature_distribution_type) if np.mean(list(feature_pvalue.values())) > mean_pvalue: mean_pvalue = np.mean(list(feature_pvalue.values())) output_train_dc, output_test_dc = train_dc, test_dc output_pvalue = feature_pvalue if output_folder is not None and os.path.isdir(output_folder): output_train_dc.Save(os.path.join(output_folder, 'train.csv')) output_test_dc.Save(os.path.join(output_folder, 'test.csv')) pvalue_df = pd.DataFrame(output_pvalue, index=['P Value']) distibute_df = pd.DataFrame(feature_distribution_type, index=['Distribution']) store_df = pd.concat((pvalue_df, distibute_df), axis=0) store_df.to_csv(os.path.join(output_folder, 'split_info.csv')) if __name__ == '__main__': clinics = pd.read_csv(r'..\..\Demo\simulated_clinics.csv', index_col=0) container = DataContainer() container.Load(r'..\..\Demo\simulated_feature.csv') separator = DataSeparate() train, test = separator.RunByTestingPercentage(container, 0.3, clinic_df=clinics) print(train.GetArray().shape, test.GetArray().shape) print(separator.clinic_split_result)
def RunWithoutCV(self, train_container, test_container=None, store_folder=''):
    """Run every normalizer/dimension-reduction/feature-selector/classifier combination
    without cross-validation, training on the balanced train set.

    Yields (self.total_num, num) after each model so callers can report progress.

    :param train_container: DataContainer used to fit every step.
    :param test_container: optional DataContainer evaluated with the fitted steps.
        Defaults to None -> a fresh empty container per call (the previous
        `=DataContainer()` default was a shared mutable default argument).
    :param store_folder: root folder that receives every intermediate result.
    """
    if test_container is None:
        test_container = DataContainer()
    self.SavePipelineInfo(store_folder)
    num = 0
    # TODO: the balancing step could also become a loop over multiple balancers later.
    balance_train_container = self.balance.Run(train_container, store_folder)
    for norm_index, normalizer in enumerate(self.normalizer_list):
        norm_store_folder = MakeFolder(store_folder, normalizer.GetName())
        # Fit on the balanced train set; only transform the raw train/test sets.
        norm_balance_train_container = normalizer.Run(balance_train_container, norm_store_folder,
                                                      store_key=BALANCE_TRAIN)
        norm_train_container = normalizer.Transform(train_container, norm_store_folder, store_key=TRAIN)
        norm_test_container = normalizer.Transform(test_container, norm_store_folder, store_key=TEST)
        for dr_index, dr in enumerate(self.dimension_reduction_list):
            dr_store_folder = MakeFolder(norm_store_folder, dr.GetName())
            if dr:
                dr_balance_train_container = dr.Run(norm_balance_train_container, dr_store_folder,
                                                    BALANCE_TRAIN)
                dr_train_container = dr.Transform(norm_train_container, dr_store_folder, TRAIN)
                if not test_container.IsEmpty():
                    dr_test_container = dr.Transform(norm_test_container, dr_store_folder, TEST)
                else:
                    dr_test_container = norm_test_container
            else:
                dr_balance_train_container = norm_balance_train_container
                dr_train_container = norm_train_container
                dr_test_container = norm_test_container
            for fs_index, fs in enumerate(self.feature_selector_list):
                for fn_index, fn in enumerate(self.feature_selector_num_list):
                    if fs:
                        fs_store_folder = MakeFolder(dr_store_folder, '{}_{}'.format(fs.GetName(), fn))
                        fs.SetSelectedFeatureNumber(fn)
                        fs_balance_train_container = fs.Run(dr_balance_train_container,
                                                            fs_store_folder, BALANCE_TRAIN)
                        fs_train_container = fs.Transform(dr_train_container, fs_store_folder, TRAIN)
                        fs_test_container = fs.Transform(dr_test_container, fs_store_folder, TEST)
                    else:
                        fs_store_folder = dr_store_folder
                        fs_balance_train_container = dr_balance_train_container
                        fs_train_container = dr_train_container
                        fs_test_container = dr_test_container
                    for cls_index, cls in enumerate(self.classifier_list):
                        cls_store_folder = MakeFolder(fs_store_folder, cls.GetName())
                        model_name = self.GetStoreName(normalizer.GetName(), dr.GetName(),
                                                       fs.GetName(), str(fn), cls.GetName())
                        matrics_index = (norm_index, dr_index, fs_index, fn_index, cls_index)
                        num += 1
                        yield self.total_num, num
                        # Fit on the balanced training data and persist the model.
                        cls.SetDataContainer(fs_balance_train_container)
                        cls.Fit()
                        cls.Save(cls_store_folder)
                        balance_train_pred = cls.Predict(fs_balance_train_container.GetArray())
                        balance_train_label = fs_balance_train_container.GetLabel()
                        self.SaveOneResult(balance_train_pred, balance_train_label, BALANCE_TRAIN,
                                           fs_balance_train_container.GetCaseName(), matrics_index,
                                           model_name, store_folder, cls_store_folder)
                        train_data = fs_train_container.GetArray()
                        train_label = fs_train_container.GetLabel()
                        train_pred = cls.Predict(train_data)
                        self.SaveOneResult(train_pred, train_label, TRAIN,
                                           fs_train_container.GetCaseName(), matrics_index,
                                           model_name, store_folder, cls_store_folder)
                        if not test_container.IsEmpty():
                            test_data = fs_test_container.GetArray()
                            test_label = fs_test_container.GetLabel()
                            test_pred = cls.Predict(test_data)
                            self.SaveOneResult(test_pred, test_label, TEST,
                                               fs_test_container.GetCaseName(), matrics_index,
                                               model_name, store_folder, cls_store_folder)
    # Dump the accumulated metrics for every split.
    self.total_metric[BALANCE_TRAIN].to_csv(
        os.path.join(store_folder, '{}_results.csv'.format(BALANCE_TRAIN)))
    self.total_metric[TRAIN].to_csv(
        os.path.join(store_folder, '{}_results.csv'.format(TRAIN)))
    if not test_container.IsEmpty():
        self.total_metric[TEST].to_csv(
            os.path.join(store_folder, '{}_results.csv'.format(TEST)))
os.path.join(cls_store_folder, 'metrics.csv')) self._MergeOneMetric(cv_val_metric, CV_VAL, model_name) self.total_metric[CV_TRAIN].to_csv( os.path.join(store_folder, '{}_results.csv'.format(CV_TRAIN))) self.total_metric[CV_VAL].to_csv( os.path.join(store_folder, '{}_results.csv'.format(CV_VAL))) if __name__ == '__main__': manager = PipelinesManager() index_dict = Index2Dict() train = DataContainer() test = DataContainer() train.Load(r'C:\Users\yangs\Desktop\train_numeric_feature.csv') test.Load(r'C:\Users\yangs\Desktop\test_numeric_feature.csv') faps = PipelinesManager( balancer=index_dict.GetInstantByIndex('UpSampling'), normalizer_list=[index_dict.GetInstantByIndex('Mean')], dimension_reduction_list=[index_dict.GetInstantByIndex('PCC')], feature_selector_list=[index_dict.GetInstantByIndex('ANOVA')], feature_selector_num_list=list(np.arange(1, 18)), classifier_list=[index_dict.GetInstantByIndex('SVM')], cross_validation=index_dict.GetInstantByIndex('5-Fold')) # for total, num in faps.RunWithoutCV(train, store_folder=r'..\..\Demo\db2-1'): # print(total, num)
self.__cv.SetClassifier(self.__classifier) train_cv_metric, val_cv_metric, test_metric, all_train_metric = self.__cv.Run( raw_train_data_container, raw_test_data_conainer, store_folder, is_hyper_parameter) if store_folder: self.SavePipeline(len(raw_train_data_container.GetFeatureName()), os.path.join(store_folder, 'pipeline_info.csv')) return train_cv_metric, val_cv_metric, test_metric, all_train_metric if __name__ == '__main__': index_dict = Index2Dict() train = DataContainer() test = DataContainer() train.Load(r'..\..\Demo\zero_center_normalized_training_feature.csv') test.Load(r'..\..\Demo\zero_center_normalized_testing_feature.csv') faps = FeatureAnalysisPipelines( balancer=index_dict.GetInstantByIndex('NoneBalance'), normalizer_list=[index_dict.GetInstantByIndex('None')], dimension_reduction_list=[index_dict.GetInstantByIndex('PCC')], feature_selector_list=[index_dict.GetInstantByIndex('RFE')], feature_selector_num_list=[15], classifier_list=[index_dict.GetInstantByIndex('LR')], cross_validation=index_dict.GetInstantByIndex('5-Folder')) for temp in faps.Run(train, test, store_folder=r'..\..\Demo\db2-2'): print(temp)
input_data_container = output return output def SaveInfo(self, store_folder, all_features): for fs in self.__selector_list: fs.SaveInfo(store_folder, all_features) def SaveDataContainer(self, data_container, store_folder, store_key): for fs in self.__selector_list: fs.SaveDataContainer(data_container, store_folder, store_key) ################################################################ if __name__ == '__main__': from BC.DataContainer.DataContainer import DataContainer from BC.FeatureAnalysis.Normalizer import NormalizerZeroCenter from BC.FeatureAnalysis.DimensionReduction import DimensionReductionByPCC dc = DataContainer() pcc = DimensionReductionByPCC() fs = FeatureSelectByKruskalWallis(selected_feature_number=5) dc.Load(r'..\..\Demo\train_numeric_feature.csv') dc = NormalizerZeroCenter.Run(dc) dc = pcc.Run(dc) print(dc.GetArray().shape) dc = fs.Run(dc) print(dc.GetArray().shape)
def __init__(self):
    """Initialize an empty model holder: no model yet, empty arrays, fresh container and logger."""
    # The underlying estimator; presumably set later by a Set/Load method -- confirm.
    self.__model = None
    self._x = np.array([])  # feature matrix
    self._y = np.array([])  # labels
    self._data_container = DataContainer()
    # Logger named after this module's file name.
    self.logger = eclog(os.path.split(__file__)[-1]).GetLogger()
def Run(self, train_data_container, test_data_container=None, store_folder='', is_hyper_parameter=False):
    """Run every normalizer/dimension-reduction/feature-selector/classifier combination
    with cross-validation, saving per-combination results and aggregate CSV tables.

    Yields (normalizer name, DR name, FS name, feature number, classifier name,
    num, total_num) after each combination so callers can report progress.

    :param train_data_container: DataContainer used to fit every step.
    :param test_data_container: optional DataContainer transformed with the fitted
        steps. Defaults to None -> a fresh empty container per call (the previous
        `=DataContainer()` default was a shared mutable default argument).
    :param store_folder: root folder that receives every result.
    :param is_hyper_parameter: forwarded to the cross-validation step.
    """
    if test_data_container is None:
        test_data_container = DataContainer()

    column_list = ['sample_number', 'positive_number', 'negative_number', 'auc',
                   'auc 95% CIs', 'auc std', 'accuracy', 'Youden Index', 'sensitivity',
                   'specificity', 'positive predictive value', 'negative predictive value']
    train_df = pd.DataFrame(columns=column_list)
    val_df = pd.DataFrame(columns=column_list)
    test_df = pd.DataFrame(columns=column_list)
    all_train_df = pd.DataFrame(columns=column_list)

    # Fall back to pass-through steps when no normalizer / DR was configured.
    if self.__normalizer_list == []:
        self.__normalizer_list = [NormalizerNone()]
    if self._dimension_reduction_list == []:
        self._dimension_reduction_list = [DimensionReductionByPCC()]

    self.GenerateMetircDict()
    self.SavePipelineInfo(store_folder)
    num = 0
    total_num = len(self.__normalizer_list) * \
                len(self._dimension_reduction_list) * \
                len(self.__feature_selector_list) * \
                len(self.__classifier_list) * \
                len(self.__feature_selector_num_list)

    for normalizer_index, normalizer in enumerate(self.__normalizer_list):
        normalized_train_data_container = normalizer.Run(train_data_container)
        if not test_data_container.IsEmpty():
            normalized_test_data_container = normalizer.Run(test_data_container, is_test=True)
        else:
            normalized_test_data_container = test_data_container
        for dimension_reductor_index, dimension_reducor in enumerate(self._dimension_reduction_list):
            if dimension_reducor:
                dr_train_data_container = dimension_reducor.Run(normalized_train_data_container)
                if not test_data_container.IsEmpty():
                    dr_test_data_container = dimension_reducor.Transform(normalized_test_data_container)
                else:
                    dr_test_data_container = normalized_test_data_container
            else:
                dr_train_data_container = normalized_train_data_container
                dr_test_data_container = normalized_test_data_container
            for feature_selector_index, feature_selector in enumerate(self.__feature_selector_list):
                for feature_num_index, feature_num in enumerate(self.__feature_selector_num_list):
                    feature_selector.SetSelectedFeatureNumber(feature_num)
                    if feature_selector:
                        fs_train_data_container = feature_selector.Run(dr_train_data_container)
                        if not test_data_container.IsEmpty():
                            # Reduce the test set to the features chosen on the train set.
                            selected_feature_name = fs_train_data_container.GetFeatureName()
                            fs = FeatureSelector()
                            fs_test_data_container = fs.SelectFeatureByName(
                                dr_test_data_container, selected_feature_name)
                        else:
                            fs_test_data_container = dr_test_data_container
                    else:
                        fs_train_data_container = dr_train_data_container
                        fs_test_data_container = dr_test_data_container
                    for classifier_index, classifier in enumerate(self.__classifier_list):
                        self.__cross_validation.SetClassifier(classifier)
                        num += 1
                        yield normalizer.GetName(), dimension_reducor.GetName(), feature_selector.GetName(), feature_num, \
                              classifier.GetName(), num, total_num

                        case_name = self.GetStoreName(normalizer.GetName(),
                                                      dimension_reducor.GetName(),
                                                      feature_selector.GetName(),
                                                      str(feature_num), classifier.GetName())
                        case_store_folder = os.path.join(store_folder, case_name)
                        if not os.path.exists(case_store_folder):
                            os.mkdir(case_store_folder)

                        # Save
                        normalizer.SaveInfo(case_store_folder,
                                            normalized_train_data_container.GetFeatureName())
                        normalizer.SaveNormalDataContainer(normalized_train_data_container,
                                                           case_store_folder, is_test=False)
                        dimension_reducor.SaveInfo(case_store_folder)
                        dimension_reducor.SaveDataContainer(dr_train_data_container,
                                                            case_store_folder, is_test=False)
                        feature_selector.SaveInfo(case_store_folder,
                                                  dr_train_data_container.GetFeatureName())
                        feature_selector.SaveDataContainer(fs_train_data_container,
                                                           case_store_folder, is_test=False)
                        if not test_data_container.IsEmpty():
                            normalizer.SaveNormalDataContainer(normalized_test_data_container,
                                                               case_store_folder, is_test=True)
                            dimension_reducor.SaveDataContainer(dr_test_data_container,
                                                                case_store_folder, is_test=True)
                            feature_selector.SaveDataContainer(fs_test_data_container,
                                                               case_store_folder, is_test=True)

                        train_cv_metric, val_cv_metric, test_metric, all_train_metric = self.__cross_validation.Run(
                            fs_train_data_container, fs_test_data_container, case_store_folder,
                            is_hyper_parameter, self.__balance)
                        self.SaveOnePipeline(os.path.join(case_store_folder, 'pipeline_info.csv'),
                                             normalizer.GetName(), dimension_reducor.GetName(),
                                             feature_selector.GetName(), feature_num,
                                             classifier.GetName(), self.__cross_validation.GetName())

                        # Save Result
                        self.__auc_matrix_dict['train'][
                            normalizer_index, dimension_reductor_index, feature_selector_index,
                            feature_num_index, classifier_index] = train_cv_metric['train_auc']
                        self.__auc_std_matrix_dict['train'][
                            normalizer_index, dimension_reductor_index, feature_selector_index,
                            feature_num_index, classifier_index] = train_cv_metric['train_auc std']
                        self.__auc_matrix_dict['all_train'][
                            normalizer_index, dimension_reductor_index, feature_selector_index,
                            feature_num_index, classifier_index] = all_train_metric['all_train_auc']
                        self.__auc_std_matrix_dict['all_train'][
                            normalizer_index, dimension_reductor_index, feature_selector_index,
                            feature_num_index, classifier_index] = all_train_metric['all_train_auc std']
                        self.__auc_matrix_dict['val'][
                            normalizer_index, dimension_reductor_index, feature_selector_index,
                            feature_num_index, classifier_index] = val_cv_metric['val_auc']
                        self.__auc_std_matrix_dict['val'][
                            normalizer_index, dimension_reductor_index, feature_selector_index,
                            feature_num_index, classifier_index] = val_cv_metric['val_auc std']

                        if store_folder and os.path.isdir(store_folder):
                            store_path = os.path.join(store_folder, 'train_result.csv')
                            save_info = [train_cv_metric['train_' + index] for index in column_list]
                            train_df.loc[case_name] = save_info
                            train_df.to_csv(store_path)

                            store_path = os.path.join(store_folder, 'all_train_result.csv')
                            save_info = [all_train_metric['all_train_' + index] for index in column_list]
                            all_train_df.loc[case_name] = save_info
                            all_train_df.to_csv(store_path)

                            store_path = os.path.join(store_folder, 'val_result.csv')
                            save_info = [val_cv_metric['val_' + index] for index in column_list]
                            val_df.loc[case_name] = save_info
                            val_df.to_csv(store_path)

                            if not test_data_container.IsEmpty():
                                self.__auc_matrix_dict['test'][
                                    normalizer_index, dimension_reductor_index, feature_selector_index,
                                    feature_num_index, classifier_index] = test_metric['test_auc']
                                self.__auc_std_matrix_dict['test'][
                                    normalizer_index, dimension_reductor_index, feature_selector_index,
                                    feature_num_index, classifier_index] = test_metric['test_auc std']
                                store_path = os.path.join(store_folder, 'test_result.csv')
                                save_info = [test_metric['test_' + index] for index in column_list]
                                test_df.loc[case_name] = save_info
                                test_df.to_csv(store_path)

    self.SaveMetricDict(store_folder)

    if store_folder:
        # Create a hidden marker file so the GUI can recognize a finished FAE result folder.
        hidden_file_path = os.path.join(store_folder, '.FAEresult4129074093819729087')
        with open(hidden_file_path, 'wb') as file:
            pass
        file_hidden = os.popen('attrib +h ' + hidden_file_path)
        file_hidden.close()
def _MergeClinical(self, dc, cli_df):
    # Merge DataContainer and a dataframe of clinical
    # NOTE(review): placeholder implementation -- both arguments are ignored and
    # an empty container is returned; a full implementation exists elsewhere in
    # the project. Confirm whether this stub is intentional.
    return DataContainer()
class DataSplitterByFeatureCluster(object):
    """Split a DataContainer into train/test sets guided by feature clustering.

    KMeans groups the min-max-normalized features into ``parts`` clusters and
    the feature closest to each cluster center is kept as that cluster's
    representative.  The reduced container (optionally merged with clinical
    features) is then split at random ``repeat_times`` times, and the split
    whose train/test feature distributions are most similar (largest mean
    p-value across features) is kept and saved.
    """

    def __init__(self,
                 parts=30,
                 repeat_times=100,
                 test_ratio=0.3,
                 random_seed=10):
        # parts: number of KMeans clusters, i.e. number of features retained.
        self.parts = parts
        # repeat_times: number of random train/test splits to evaluate.
        self.repeat_times = repeat_times
        # test_ratio: fraction of cases assigned to the testing set.
        self.test_ratio = test_ratio
        # random_seed: fixed seed handed to KMeans for reproducibility.
        self.random_seed = random_seed
        # Cluster label of each feature; filled by Run() via _Cluster().
        self.feature_labels = []
        # Last DataContainer passed to Run(); reused by VisualizeCluster().
        self.current_dc = DataContainer()

    #################################################
    def _DataPreProcess(self, dc):
        """Min-max normalize each feature column, then transpose so rows
        correspond to features (the samples that KMeans clusters)."""
        data = dc.GetArray()  # get train data
        # min_max, Process the features of each column
        min_max_scaler = preprocessing.MinMaxScaler()
        processed_data = min_max_scaler.fit_transform(data).T
        return processed_data

    def _Cluster(self, dc):
        # Cluster the features and, per cluster, select the feature closest
        # to the cluster center.  Returns (selected feature names, the
        # cluster label of every feature).
        processed_data = self._DataPreProcess(dc)
        feature_name_list = dc.GetFeatureName()
        k_means = KMeans(n_clusters=self.parts,
                         random_state=self.random_seed,
                         init='k-means++')
        k_means.fit(processed_data)  # training

        # Per-cluster bookkeeping: member count, member names, and each
        # member's squared distance to its cluster center.
        count_label = [0 for _ in range(self.parts)]
        count_feature = [[] for _ in range(self.parts)]
        count_distance = [[] for _ in range(self.parts)]
        feature_predict = k_means.labels_
        cluster_centers = k_means.cluster_centers_
        for j in range(len(feature_name_list)):
            count_label[feature_predict[j]] += 1
            count_feature[feature_predict[j]].append(feature_name_list[j])
            cluster_center = cluster_centers[feature_predict[j]]
            distance = np.square(processed_data[j] - cluster_center).sum()
            count_distance[feature_predict[j]].append(distance)
        print('The number of feature in each class \n', count_label)

        # Keep, for every cluster, the feature with the minimum distance.
        min_distance_feature = []
        for k in range(self.parts):
            k_feature = count_feature[k]
            k_distance = count_distance[k]
            idx = k_distance.index(min(k_distance))
            selected_feature = k_feature[idx]
            min_distance_feature.append(selected_feature)
            print('min distance feature in this class {} is {}'.format(
                k, selected_feature))
            print('its distance is', min(k_distance),
                  'while max distance is', max(k_distance))
        return min_distance_feature, feature_predict

    def _MergeClinical(self, dc, cli_df):
        # Merge DataContainer and a dataframe of clinical features.
        # Any 'label'/'Label' column in the clinical frame is dropped so the
        # container keeps a single label column after the merge.
        if 'label' in cli_df.columns.tolist():
            del cli_df['label']
        elif 'Label' in cli_df.columns.tolist():
            del cli_df['Label']
        # Left join on the case index: every case of dc is preserved.
        df = pd.merge(dc.GetFrame(), cli_df, how='left',
                      left_index=True, right_index=True)
        merge_dc = DataContainer()
        merge_dc.SetFrame(df)
        merge_dc.UpdateFrameByData()
        return merge_dc

    def _EstimateAllFeatureDistribution(self, dc):
        """Classify each feature's distribution as 'Category', 'Normal', or
        'Non-normal' (normaltest at the 0.05 significance level)."""
        feature_name_list = dc.GetFeatureName()
        distribution = dict()
        for i in range(len(feature_name_list)):
            feature = feature_name_list[i]
            feature_data = dc.GetFrame()[feature]
            _, normal_p = normaltest(feature_data, axis=0)
            if len(
                    set(feature_data)
            ) < 10:  # TODO: a better way to distinguish discrete numeric values
                distribution[feature] = 'Category'
            elif normal_p > 0.05:
                distribution[feature] = 'Normal'
            else:
                distribution[feature] = 'Non-normal'
        # return a dict {"AGE": 'Normal', 'Gender': 'Category', ... }
        return distribution

    def _EstimateDcFeaturePvalue(self, dc1, dc2, feature_type):
        """Per-feature p-value comparing dc1 vs dc2; the statistical test is
        chosen by GetPvalue according to the feature's distribution type."""
        array1, array2 = dc1.GetArray(), dc2.GetArray()
        p_values = {}
        for index, feature in enumerate(dc1.GetFeatureName()):
            p_values[feature] = GetPvalue(array1[:, index], array2[:, index],
                                          feature_type[feature])
        return p_values

    #################################################
    def VisualizePartsVariance(self,
                               dc: DataContainer,
                               max_k=None,
                               method='SSE',
                               store_folder=None,
                               is_show=True):
        # method must be one of SSE or SC.  SSE denotes the within-cluster
        # Sum of Squared Errors (elbow method), SC denotes the Silhouette
        # Coefficient.  Plots the score for k = 2 .. max_k-1 to help pick
        # the number of clusters ('parts').
        data = dc.GetArray()  # get train data
        processed_data = self._DataPreProcess(dc)
        if max_k is None:
            # Cap at 50 or the number of cases, whichever is smaller.
            max_k = min(data.shape[0], 50)
        assert (method in ['SSE', 'SC'])
        score = []
        for k in range(2, max_k):
            print('make cluster k=', k)
            estimator = KMeans(n_clusters=k)
            estimator.fit(processed_data)
            if method == 'SSE':
                score.append(estimator.inertia_)
            elif method == 'SC':
                score.append(
                    silhouette_score(processed_data,
                                     estimator.labels_,
                                     metric='euclidean'))
        X = range(2, max_k)
        plt.xlabel('k')
        plt.ylabel(method)
        plt.plot(X, score, 'o-')
        if store_folder and os.path.isdir(store_folder):
            plt.savefig(
                os.path.join(store_folder, 'ClusteringParameterPlot.jpg'))
        if is_show:
            plt.show()

    def VisualizeCluster(self,
                         dimension='2d',
                         select_feature=None,
                         store_folder=None,
                         is_show=True):
        """Scatter-plot the clustered features (colored by cluster label) in
        2 or 3 of the normalized case-dimensions.  Requires Run() to have
        populated self.feature_labels and self.current_dc first."""
        if len(self.feature_labels) != 0 and self.current_dc.GetFrame(
        ).size != 0:
            processed_data = self._DataPreProcess(self.current_dc)
            if select_feature is None:
                # Default: plot against the first three case-columns.
                select_feature = [0, 1, 2]
            assert dimension in ['2d', '3d']
            if dimension == '2d':
                plt.scatter(processed_data[:, select_feature[0]],
                            processed_data[:, select_feature[1]],
                            s=5,
                            c=self.feature_labels)
            elif dimension == '3d':
                ax = plt.figure().add_subplot(111, projection='3d')
                ax.scatter(processed_data[:, select_feature[0]],
                           processed_data[:, select_feature[1]],
                           processed_data[:, select_feature[2]],
                           s=10,
                           c=self.feature_labels,
                           marker='^')
                ax.set_title('Cluster Result 3D')
            if store_folder and os.path.isdir(store_folder):
                plt.savefig(
                    os.path.join(
                        store_folder,
                        'ClusteringProcessPlot{}.jpg'.format(dimension)))
            if is_show:
                plt.show()

    def Run(self, dc: DataContainer, output_folder: str,
            clinical_feature=None):
        """Cluster-select features, optionally merge clinical features, then
        keep the random train/test split with the largest mean p-value
        (i.e. the most distribution-balanced split) and save it.

        clinical_feature may be a CSV path (read with the case index in the
        first column) or an already-loaded DataFrame.
        """
        self.current_dc = dc
        selected_feature_names, self.feature_labels = self._Cluster(dc)
        fs = FeatureSelector()
        selected_dc = fs.SelectFeatureByName(dc, selected_feature_names)
        if clinical_feature is not None:
            if isinstance(clinical_feature, str):
                clinical_feature = pd.read_csv(clinical_feature, index_col=0)
            assert (isinstance(clinical_feature, pd.DataFrame))
            merge_dc = self._MergeClinical(selected_dc, clinical_feature)
        else:
            merge_dc = selected_dc

        feature_distribution_type = self._EstimateAllFeatureDistribution(
            merge_dc)  # a dict

        splitter = DataSeparate()
        output_train_dc, output_test_dc = DataContainer(), DataContainer()
        output_p_value = []
        mean_p_value = -1
        # Try repeat_times random splits and keep the one whose features are
        # most similarly distributed between train and test.
        for _ in range(self.repeat_times):
            train_dc, test_dc = splitter.RunByTestingPercentage(
                merge_dc, testing_data_percentage=self.test_ratio)
            feature_p_value = self._EstimateDcFeaturePvalue(
                train_dc, test_dc, feature_distribution_type)
            if np.mean(list(feature_p_value.values())) > mean_p_value:
                mean_p_value = np.mean(list(feature_p_value.values()))
                output_train_dc, output_test_dc = train_dc, test_dc
                output_p_value = feature_p_value

        if output_folder is not None and os.path.isdir(output_folder):
            output_train_dc.Save(os.path.join(output_folder, 'train.csv'))
            output_test_dc.Save(os.path.join(output_folder, 'test.csv'))
            p_value_df = pd.DataFrame(output_p_value, index=['P Value'])
            distribute_df = pd.DataFrame(feature_distribution_type,
                                         index=['Distribution'])
            store_df = pd.concat((p_value_df, distribute_df), axis=0)
            store_df.to_csv(os.path.join(output_folder, 'split_info.csv'))

        # NOTE(review): the block below repeats the save block above verbatim
        # (the same files are written twice) — this looks like an accidental
        # duplication; confirm and remove.
        if output_folder is not None and os.path.isdir(output_folder):
            output_train_dc.Save(os.path.join(output_folder, 'train.csv'))
            output_test_dc.Save(os.path.join(output_folder, 'test.csv'))
            p_value_df = pd.DataFrame(output_p_value, index=['P Value'])
            distribute_df = pd.DataFrame(feature_distribution_type,
                                         index=['Distribution'])
            store_df = pd.concat((p_value_df, distribute_df), axis=0)
            store_df.to_csv(os.path.join(output_folder, 'split_info.csv'))


if __name__ == '__main__':
    # clinics = pd.read_csv(r'..\..\Demo\simulated_clinics.csv', index_col=0)
    # container = DataContainer()
    # container.Load(r'..\..\Demo\simulated_feature.csv')
    #
    # separator = DataSeparate()
    # train, test = separator.RunByTestingPercentage(container, 0.3, clinic_df=clinics)
    #
    # print(train.GetArray().shape, test.GetArray().shape)
    # print(separator.clinic_split_result)

    # Demo: cluster-based feature selection and distribution-balanced split.
    cluster_split = DataSplitterByFeatureCluster()
    container = DataContainer()
    container.Load(r'.\all_feature.csv')
    output_path = r'.\output'
    clinical_path = r'.\clinical.csv'
    cluster_split.VisualizePartsVariance(container, store_folder=output_path)
    cluster_split.Run(container, output_path, clinical_feature=clinical_path)
    cluster_split.VisualizeCluster(dimension='2d', store_folder=output_path)
    cluster_split.VisualizeCluster(dimension='3d', store_folder=output_path)
class PrepareConnection(QWidget, Ui_Prepare):
    """Qt widget for loading a feature CSV, cleaning invalid values, and
    splitting it into training/testing sets (randomly, by a clinical
    reference, or by an explicit testing-reference container).

    Fixes vs. previous revision:
      * ``dtype=np.int`` -> ``dtype=int`` (the ``np.int`` alias was removed
        in NumPy 1.24 and raised AttributeError at runtime).
      * Dialog-title typo 'Open SCV file' -> 'Open CSV file'.
      * Garbled message 'does not mismatch' -> 'does not match'.
    """

    # Emitted on close so the parent window can react.
    close_signal = pyqtSignal(bool)

    def __init__(self, parent=None):
        super(PrepareConnection, self).__init__(parent)
        self.setupUi(self)
        self.data_container = DataContainer()
        self._filename = os.path.split(__file__)[-1]  # used as logger tag

        self.buttonLoad.clicked.connect(self.LoadData)
        self.buttonRemoveAndExport.clicked.connect(self.RemoveInvalidValue)

        # Optional references used by the split: a testing-set container
        # and/or a clinical-feature frame.
        self.__testing_ref_data_container = DataContainer()
        self.__clinical_ref = pd.DataFrame()

        self.radioSplitRandom.clicked.connect(self.ChangeSeparateMethod)
        self.radioSplitRef.clicked.connect(self.ChangeSeparateMethod)
        self.checkUseClinicRef.clicked.connect(
            self.RandomSeparateButtonUpdates)
        self.loadTestingReference.clicked.connect(
            self.LoadTestingReferenceDataContainer)
        self.clearTestingReference.clicked.connect(
            self.ClearTestingReferenceDataContainer)
        self.loadClinicRef.clicked.connect(self.LoadClinicalRef)
        self.clearClinicRef.clicked.connect(self.ClearClinicalRef)
        self.buttonSave.clicked.connect(self.CheckAndSave)

    def closeEvent(self, QCloseEvent):
        """Notify listeners, then accept the close event."""
        self.close_signal.emit(True)
        QCloseEvent.accept()

    def UpdateTable(self):
        """Refresh the feature table (capped at 100 columns, the last one
        shown as '...') and the summary text (case/feature counts and, for a
        binary label, positive/negative counts)."""
        self.tableFeature.setRowCount(self.data_container.GetFrame().shape[0])
        header_name = deepcopy(list(self.data_container.GetFrame().columns))
        min_col = min(len(header_name), 100)
        if min_col == 100:
            # Truncate wide frames; the 100th header becomes an ellipsis.
            header_name = header_name[:100]
            header_name[-1] = '...'
        self.tableFeature.setColumnCount(min_col)
        self.tableFeature.setHorizontalHeaderLabels(header_name)
        self.tableFeature.setVerticalHeaderLabels(
            list(map(str, self.data_container.GetFrame().index)))

        for row_index in range(self.data_container.GetFrame().shape[0]):
            for col_index in range(min_col):
                if col_index < 99:
                    self.tableFeature.setItem(
                        row_index, col_index,
                        QTableWidgetItem(
                            str(self.data_container.GetFrame().iloc[
                                row_index, col_index])))
                else:
                    self.tableFeature.setItem(row_index, col_index,
                                              QTableWidgetItem('...'))

        text = "The number of cases: {:d}\n".format(
            self.data_container.GetFrame().shape[0])

        # The label column ('label' or 'Label') is excluded from the
        # feature count when present.
        if 'label' in self.data_container.GetFrame().columns:
            label_name = 'label'
            text += "The number of features: {:d}\n".format(
                self.data_container.GetFrame().shape[1] - 1)
        elif 'Label' in self.data_container.GetFrame().columns:
            label_name = 'Label'
            text += "The number of features: {:d}\n".format(
                self.data_container.GetFrame().shape[1] - 1)
        else:
            label_name = ''
            text += "The number of features: {:d}\n".format(
                self.data_container.GetFrame().shape[1])

        if label_name:
            # np.int was removed in NumPy 1.24; the builtin int is the
            # documented replacement.
            labels = np.asarray(
                self.data_container.GetFrame()[label_name].values, dtype=int)
            if len(np.unique(labels)) == 2:
                # The larger label value is treated as the positive class.
                positive_number = len(np.where(labels == np.max(labels))[0])
                negative_number = len(labels) - positive_number
                assert (positive_number + negative_number == len(labels))
                text += "The number of positive samples: {:d}\n".format(
                    positive_number)
                text += "The number of negative samples: {:d}\n".format(
                    negative_number)

        self.textInformation.setText(text)

    def SetButtonsState(self, state):
        """Enable/disable every control that needs loaded data."""
        self.buttonRemoveAndExport.setEnabled(state)
        self.buttonSave.setEnabled(state)
        self.checkExport.setEnabled(state)
        self.radioRemoveNone.setEnabled(state)
        self.radioRemoveNonvalidCases.setEnabled(state)
        self.radioRemoveNonvalidFeatures.setEnabled(state)
        self.radioSplitRandom.setEnabled(state)
        self.radioSplitRef.setEnabled(state)

    def LoadData(self):
        """Pick a CSV via dialog and load it into self.data_container."""
        dlg = QFileDialog()
        file_name, _ = dlg.getOpenFileName(self,
                                           'Open CSV file',
                                           filter="csv files (*.csv)")
        if file_name:
            try:
                if self.data_container.Load(file_name, is_update=False):
                    self.UpdateTable()
                    self.SetButtonsState(True)
            except OSError as reason:
                eclog(self._filename).GetLogger().error(
                    'Load CSV Error: {}'.format(reason))
                QMessageBox.about(self, 'Load data Error', reason.__str__())
                print('Error!' + str(reason))
            except ValueError:
                eclog(self._filename).GetLogger().error(
                    'Open CSV Error: {}'.format(file_name))
                QMessageBox.information(self, 'Error',
                                        'The selected data file mismatch.')

    def LoadTestingReferenceDataContainer(self):
        """Load a CSV whose cases define the testing set of the split."""
        dlg = QFileDialog()
        file_name, _ = dlg.getOpenFileName(self,
                                           'Open CSV file',
                                           filter="csv files (*.csv)")
        if file_name:
            try:
                self.__testing_ref_data_container.Load(file_name)
                self.loadTestingReference.setEnabled(False)
                self.clearTestingReference.setEnabled(True)
                # The percentage spin box is irrelevant once an explicit
                # testing reference is loaded.
                self.spinBoxSeparate.setEnabled(False)
            except OSError as reason:
                eclog(self._filename).GetLogger().error(
                    'Load Testing Ref Error: {}'.format(reason))
                print('Error!' + str(reason))
            except ValueError:
                eclog(self._filename).GetLogger().error(
                    'Open CSV Error: {}'.format(file_name))
                QMessageBox.information(self, 'Error',
                                        'The selected data file mismatch.')

    def ClearTestingReferenceDataContainer(self):
        """Discard the testing-reference container and reset its buttons."""
        del self.__testing_ref_data_container
        self.__testing_ref_data_container = DataContainer()
        self.loadTestingReference.setEnabled(True)
        self.clearTestingReference.setEnabled(False)
        self.spinBoxSeparate.setEnabled(False)

    def LoadClinicalRef(self):
        """Load a clinical-feature CSV; its case index must match the
        currently loaded data exactly."""
        dlg = QFileDialog()
        file_name, _ = dlg.getOpenFileName(self,
                                           'Open CSV file',
                                           filter="csv files (*.csv)")
        if file_name:
            try:
                self.__clinical_ref = pd.read_csv(file_name, index_col=0)
                if list(self.__clinical_ref.index) != list(
                        self.data_container.GetFrame().index):
                    QMessageBox.information(
                        self, 'Error',
                        'The index of clinical features is not consistent to the data'
                    )
                    return None
                self.loadClinicRef.setEnabled(False)
                self.clearClinicRef.setEnabled(True)
            except OSError as reason:
                eclog(self._filename).GetLogger().error(
                    'Load Clinical Ref Error: {}'.format(reason))
                QMessageBox.information(self, 'Error',
                                        'Can not Open the Files')
            except ValueError:
                eclog(self._filename).GetLogger().error(
                    'OpenCSV Error: {}'.format(file_name))
                QMessageBox.information(self, 'Error',
                                        'The selected data file mismatch.')
        return None

    def ClearClinicalRef(self):
        """Discard the clinical reference frame and reset its buttons."""
        del self.__clinical_ref
        self.__clinical_ref = pd.DataFrame()
        self.loadClinicRef.setEnabled(True)
        self.clearClinicRef.setEnabled(False)

    def RemoveInvalidValue(self):
        """Remove invalid (NaN) entries by case or by feature, optionally
        exporting the cleaned table to a CSV chosen by the user."""
        if not self.data_container.IsEmpty():
            if self.checkExport.isChecked():
                dlg = QFileDialog()
                store_path, _ = dlg.getSaveFileName(
                    self,
                    'Save CSV feature files',
                    'features.csv',
                    filter="CSV files (*.csv)")
            else:
                store_path = ''

            if self.radioRemoveNonvalidCases.isChecked():
                self.data_container.RemoveInvalid(store_path=store_path,
                                                  remove_index=REMOVE_CASE)
            elif self.radioRemoveNonvalidFeatures.isChecked():
                self.data_container.RemoveInvalid(store_path=store_path,
                                                  remove_index=REMOVE_FEATURE)
            self.UpdateTable()

    def ChangeSeparateMethod(self):
        """Toggle controls between random split and reference-based split."""
        if self.radioSplitRandom.isChecked():
            self.spinBoxSeparate.setEnabled(True)
            self.checkUseClinicRef.setEnabled(True)
            self.loadTestingReference.setEnabled(False)
            self.clearTestingReference.setEnabled(False)
        elif self.radioSplitRef.isChecked():
            self.spinBoxSeparate.setEnabled(False)
            self.checkUseClinicRef.setEnabled(False)
            if self.__testing_ref_data_container.IsEmpty():
                self.loadTestingReference.setEnabled(True)
                self.clearTestingReference.setEnabled(False)
            else:
                self.loadTestingReference.setEnabled(False)
                self.clearTestingReference.setEnabled(True)
        self.RandomSeparateButtonUpdates()

    def RandomSeparateButtonUpdates(self):
        """Keep the clinical-reference buttons consistent with the
        'use clinical reference' checkbox and the loaded frame."""
        if self.checkUseClinicRef.isChecked():
            if self.__clinical_ref.size > 0:
                self.loadClinicRef.setEnabled(False)
                self.clearClinicRef.setEnabled(True)
            else:
                self.loadClinicRef.setEnabled(True)
                self.clearClinicRef.setEnabled(False)
        else:
            self.loadClinicRef.setEnabled(False)
            self.clearClinicRef.setEnabled(False)

    def CheckAndSave(self):
        """Validate the loaded data (non-empty, no NaNs, binary label),
        drop constant features, then either split-and-save into a chosen
        folder or save the whole table to a single CSV."""
        if self.data_container.IsEmpty():
            QMessageBox.warning(self, "Warning", "There is no data",
                                QMessageBox.Ok)
            return None

        if self.data_container.HasInvalidNumber():
            QMessageBox.warning(self, "Warning", "There are nan items",
                                QMessageBox.Ok)
            # Jump the table cursor to the first invalid cell so the user
            # can find and fix it.
            non_valid_number_index = self.data_container.FindInvalidNumberIndex(
            )
            old_edit_triggers = self.tableFeature.editTriggers()
            self.tableFeature.setEditTriggers(QAbstractItemView.CurrentChanged)
            self.tableFeature.setCurrentCell(non_valid_number_index[0],
                                             non_valid_number_index[1])
            self.tableFeature.setEditTriggers(old_edit_triggers)
            return None

        self.data_container.UpdateDataByFrame()

        if not self.data_container.IsBinaryLabel():
            QMessageBox.warning(self, "Warning", "There are not 2 Labels",
                                QMessageBox.Ok)
            return None

        # Constant-valued features carry no information; drop them.
        remove_features_with_same_value = RemoveSameFeatures()
        self.data_container = remove_features_with_same_value.Run(
            self.data_container)

        if self.radioSplitRandom.isChecked() or self.radioSplitRef.isChecked():
            folder_name = QFileDialog.getExistingDirectory(self, "Save data")
            if folder_name != '':
                data_separate = DataSeparate.DataSeparate()
                try:
                    if self.__testing_ref_data_container.IsEmpty():
                        # Random split, optionally balanced on the clinical
                        # reference.
                        testing_data_percentage = self.spinBoxSeparate.value()
                        if self.__clinical_ref.size == 0:
                            training_data_container, _ = \
                                data_separate.RunByTestingPercentage(self.data_container,
                                                                     testing_data_percentage,
                                                                     store_folder=folder_name)
                        else:
                            training_data_container, _ = \
                                data_separate.RunByTestingPercentage(self.data_container,
                                                                     testing_data_percentage,
                                                                     clinic_df=self.__clinical_ref,
                                                                     store_folder=folder_name)
                    else:
                        # Split driven by the explicit testing reference.
                        training_data_container, _ = \
                            data_separate.RunByTestingReference(self.data_container,
                                                                self.__testing_ref_data_container,
                                                                folder_name)

                    if training_data_container.IsEmpty():
                        QMessageBox.information(
                            self, 'Error',
                            'The testing data does not match, please check the testing data '
                            'really exists in current data')
                        return None

                    os.system("explorer.exe {:s}".format(
                        os.path.normpath(folder_name)))
                except Exception as e:
                    content = 'PrepareConnection, splitting failed: '
                    eclog(self._filename).GetLogger().error('Split Error: ' +
                                                            e.__str__())
                    QMessageBox.about(self, content, e.__str__())
        else:
            file_name, _ = QFileDialog.getSaveFileName(
                self, "Save data", filter="csv files (*.csv)")
            if file_name:
                self.data_container.Save(file_name)
    # NOTE(review): this appears to be a verbatim duplicate re-definition of
    # ClearTestingReferenceDataContainer defined earlier in the class.  In
    # Python the later definition silently wins; since both bodies are
    # identical behavior is unchanged, but one copy should be removed.
    def ClearTestingReferenceDataContainer(self):
        """Discard the testing-reference container and reset its buttons."""
        del self.__testing_ref_data_container
        self.__testing_ref_data_container = DataContainer()
        self.loadTestingReference.setEnabled(True)
        self.clearTestingReference.setEnabled(False)
        self.spinBoxSeparate.setEnabled(False)
vif_dict[exog] = vif # calculate tolerance tolerance = 1 - r_squared tolerance_dict[exog] = tolerance # return VIF DataFrame df_vif = pd.DataFrame({'VIF': vif_dict, 'Tolerance': tolerance_dict}) return df_vif if __name__ == '__main__': data_path = r'..\..\Demo\train_numeric_feature.csv' from BC.DataContainer.DataContainer import DataContainer from BC.FeatureAnalysis.Normalizer import NormalizerZeroCenter pca = DimensionReductionByPCA() dc = DataContainer() dc.Load(data_path) dc = NormalizerZeroCenter.Run(dc) # dc = pca.Run(dc) df = pd.DataFrame(dc.GetArray(), index=dc.GetCaseName(), columns=dc.GetFeatureName()) dr = DimensionReductionByVIF() new_df = dr.CalculateVIF(df) print(dc.GetArray().shape, new_df.shape)