def VisualizePartsVariance(self, dc: DataContainer, max_k=None, method='SSE', store_folder=None, is_show=True):
    """Skeleton of the cluster-number selection plot; scoring is not implemented yet.

    Args:
        dc: Data container; only ``dc.GetArray()`` is read here.
        max_k: Exclusive upper bound of the cluster numbers to scan (the scan
            starts at 2). Defaults to ``min(rows of the transposed array, 50)``.
        method: ``'SSE'`` or ``'SC'``. ``'SC'`` denotes the Silhouette
            Coefficient; the original comment leaves ``'SSE'`` undefined
            (presumably the sum of squared errors -- TODO confirm).
        store_folder: When this is an existing directory, the current figure
            is saved there as ``ClusteringPlot.jpg``; otherwise nothing is saved.
        is_show: When true, the current figure is shown with ``plt.show()``.

    Raises:
        AssertionError: If ``method`` is neither ``'SSE'`` nor ``'SC'``.
    """
    # Validate first and without `assert`, so the check survives `python -O`
    # and a bad argument fails before any data is touched. AssertionError is
    # kept so callers observe the same exception type as before.
    if method not in ('SSE', 'SC'):
        raise AssertionError("method must be 'SSE' or 'SC', got {!r}".format(method))

    # TODO: Normalize the train_data
    data = dc.GetArray().transpose()
    if max_k is None:
        max_k = min(data.shape[0], 50)

    # TODO: compute one score per k and plot it
    for k in range(2, max_k):
        if method == 'SSE':
            pass
        elif method == 'SC':
            pass

    if store_folder and os.path.isdir(store_folder):
        plt.savefig(os.path.join(store_folder, 'ClusteringPlot.jpg'))
    if is_show:
        plt.show()
def VisualizePartsVariance(self, dc: DataContainer, max_k=None, method='SSE', store_folder=None, is_show=True):
    """Plot a clustering score against the number of K-Means clusters.

    For every ``k`` in ``range(2, max_k)`` a ``KMeans`` estimator is fitted on
    the output of ``self._DataPreProcess(dc)`` and one score is recorded; the
    scores are then drawn against ``k`` on the current matplotlib figure.

    Args:
        dc: Data container passed to ``self._DataPreProcess``; ``dc.GetArray()``
            is only used to derive the default ``max_k``.
        max_k: Exclusive upper bound of the scanned cluster numbers. Defaults
            to ``min(dc.GetArray().shape[0], 50)``.
        method: ``'SSE'`` records ``estimator.inertia_``; ``'SC'`` records the
            Silhouette Coefficient computed with the euclidean metric.
        store_folder: When this is an existing directory, the figure is saved
            there as ``ClusteringParameterPlot.jpg``; otherwise nothing is saved.
        is_show: When true, the figure is shown with ``plt.show()``.

    Raises:
        AssertionError: If ``method`` is neither ``'SSE'`` nor ``'SC'``.
    """
    # Validate before any work and without `assert`: under `python -O` the old
    # check vanished, so a bad method left `score` empty and only failed later
    # inside plt.plot. AssertionError is kept so callers observe the same
    # exception type as before.
    if method not in ('SSE', 'SC'):
        raise AssertionError("method must be 'SSE' or 'SC', got {!r}".format(method))

    data = dc.GetArray()
    # get train data
    processed_data = self._DataPreProcess(dc)
    if max_k is None:
        max_k = min(data.shape[0], 50)

    cluster_numbers = range(2, max_k)
    score = []
    for k in cluster_numbers:
        print('make cluster k=', k)
        estimator = KMeans(n_clusters=k)
        estimator.fit(processed_data)
        if method == 'SSE':
            score.append(estimator.inertia_)
        else:
            # Only 'SC' can reach this branch because method was validated above.
            score.append(silhouette_score(processed_data, estimator.labels_, metric='euclidean'))

    plt.xlabel('k')
    plt.ylabel(method)
    plt.plot(cluster_numbers, score, 'o-')

    if store_folder and os.path.isdir(store_folder):
        plt.savefig(os.path.join(store_folder, 'ClusteringParameterPlot.jpg'))
    if is_show:
        plt.show()
def TestNewData(NewDataCsv, model_folder, result_save_path=''):
    '''
    Apply a stored model to a new feature matrix and report the resulting metric.

    :param NewDataCsv: New radiomics feature matrix csv file path
    :param model_folder: The trained model path, handed to ``LoadTrainInfo``
    :param result_save_path: Folder receiving the result files; nothing is
        written when it is empty
    :return: The metric returned by ``EstimateMetirc`` for the predictions and labels
    '''
    train_info = LoadTrainInfo(model_folder)

    # Normalization: load the new cases, then apply the normalizer stored with the model.
    # NOTE(review): selecting train_info['selected_features'] was commented out
    # in the original, so no feature selection happens here.
    container = DataContainer()
    container.Load(NewDataCsv)
    container = train_info['normalizer'].Transform(container)

    # Model: give the stored classifier the new data and take the second
    # column of predict_proba as the prediction.
    classifier = train_info['classifier']
    classifier.SetDataContainer(container)
    model = classifier.GetModel()
    predict = model.predict_proba(container.GetArray())[:, 1]
    label = container.GetLabel()
    case_name = container.GetCaseName()

    # One csv row per case, below a header row. Rows are built by index so the
    # lookups match the original element access exactly.
    test_result_info = [['CaseName', 'Pred', 'Label']]
    test_result_info.extend(
        [case_name[index], predict[index], label[index]]
        for index in range(len(label))
    )

    metric = EstimateMetirc(predict, label)
    info = dict(metric)
    cv = CrossValidation()
    print(metric)
    print('\t')

    if not result_save_path:
        return metric

    cv.SaveResult(info, result_save_path)
    np.save(os.path.join(result_save_path, 'test_predict.npy'), predict)
    np.save(os.path.join(result_save_path, 'test_label.npy'), label)
    csv_path = os.path.join(result_save_path, 'test_info.csv')
    with open(csv_path, 'w', newline='') as csvfile:
        csv.writer(csvfile).writerows(test_result_info)

    return metric
# NOTE(review): the next two statements are the tail of a definition whose
# header lies outside this view; their original nesting cannot be recovered here.
input_data_container = output
return output


def SaveInfo(self, store_folder, all_features):
    """Call ``SaveInfo(store_folder, all_features)`` on every selector in ``self.__selector_list``, in order."""
    # NOTE(review): `__selector_list` is name-mangled when this method sits
    # inside its class -- confirm the enclosing class before moving it.
    for fs in self.__selector_list:
        fs.SaveInfo(store_folder, all_features)


def SaveDataContainer(self, data_container, store_folder, store_key):
    """Call ``SaveDataContainer(data_container, store_folder, store_key)`` on every selector in ``self.__selector_list``, in order."""
    for fs in self.__selector_list:
        fs.SaveDataContainer(data_container, store_folder, store_key)


################################################################
if __name__ == '__main__':
    # Demo: load the training features, run NormalizerZeroCenter and
    # DimensionReductionByPCC, then FeatureSelectByKruskalWallis with
    # selected_feature_number=5, printing the array shape before and after
    # the selection step.
    from BC.DataContainer.DataContainer import DataContainer
    from BC.FeatureAnalysis.Normalizer import NormalizerZeroCenter
    from BC.FeatureAnalysis.DimensionReduction import DimensionReductionByPCC

    dc = DataContainer()
    pcc = DimensionReductionByPCC()
    fs = FeatureSelectByKruskalWallis(selected_feature_number=5)
    # The relative path uses Windows separators.
    dc.Load(r'..\..\Demo\train_numeric_feature.csv')
    dc = NormalizerZeroCenter.Run(dc)
    dc = pcc.Run(dc)
    print(dc.GetArray().shape)
    dc = fs.Run(dc)
    print(dc.GetArray().shape)
vif_dict[exog] = vif # calculate tolerance tolerance = 1 - r_squared tolerance_dict[exog] = tolerance # return VIF DataFrame df_vif = pd.DataFrame({'VIF': vif_dict, 'Tolerance': tolerance_dict}) return df_vif if __name__ == '__main__': data_path = r'..\..\Demo\train_numeric_feature.csv' from BC.DataContainer.DataContainer import DataContainer from BC.FeatureAnalysis.Normalizer import NormalizerZeroCenter pca = DimensionReductionByPCA() dc = DataContainer() dc.Load(data_path) dc = NormalizerZeroCenter.Run(dc) # dc = pca.Run(dc) df = pd.DataFrame(dc.GetArray(), index=dc.GetCaseName(), columns=dc.GetFeatureName()) dr = DimensionReductionByVIF() new_df = dr.CalculateVIF(df) print(dc.GetArray().shape, new_df.shape)
return "To Remove the unbalance of the training data set, we applied an Tomek link after the " \ "Synthetic Minority Oversampling TEchnique (SMOTE) to make positive/negative samples balance. " def Run(self, data_container, store_path=''): data, label, feature_name, label_name = data_container.GetData() data_resampled, label_resampled = self._model.fit_sample(data, label) new_case_name = [ 'Generate' + str(index) for index in range(data_resampled.shape[0]) ] new_data_container = DataContainer(data_resampled, label_resampled, data_container.GetFeatureName(), new_case_name) if store_path != '': if os.path.isdir(store_path): new_data_container.Save( os.path.join(store_path, '{}_features.csv'.format(self._name))) else: new_data_container.Save(store_path) return new_data_container if __name__ == '__main__': dc = DataContainer() dc.Load(r'..\..\Example\numeric_feature.csv') print(dc.GetArray().shape, np.sum(dc.GetLabel())) b = SmoteTomekSampling() new = b.Run(dc) print(new.GetArray().shape, np.sum(new.GetLabel()))