def Run(self, data_container, store_path=''):
    """Resample the container's data with ``self._model`` and wrap the result.

    The data and label arrays returned by ``data_container.GetData()`` are
    passed to ``self._model.fit_sample``. Every resampled row gets a
    synthetic case name of the form ``Generate<row index>``.

    :param data_container: source container; must provide ``GetData()`` and
        ``GetFeatureName()``.
    :param store_path: ``''`` to skip saving. If it is an existing directory
        the result is saved there as ``<self._name>_features.csv``; any other
        value is used directly as the target file path.
    :return: a new ``DataContainer`` holding the resampled data.
    """
    features, targets, _, _ = data_container.GetData()
    resampled_features, resampled_targets = self._model.fit_sample(
        features, targets)

    generated_names = []
    for position in range(resampled_features.shape[0]):
        generated_names.append('Generate' + str(position))

    resampled_container = DataContainer(resampled_features,
                                        resampled_targets,
                                        data_container.GetFeatureName(),
                                        generated_names)

    # Nothing to persist: hand the in-memory result straight back.
    if store_path == '':
        return resampled_container

    target_path = store_path
    if os.path.isdir(store_path):
        target_path = os.path.join(store_path,
                                   '{}_features.csv'.format(self._name))
    resampled_container.Save(target_path)
    return resampled_container
def Run(self, data_container, store_folder='', store_key=''):
    """Build a new container holding only the non-``object`` dtype columns.

    :param data_container: source container; must provide ``GetFrame()``.
    :param store_folder: when non-empty and an existing directory, the
        filtered data is saved there as ``numeric_feature.csv`` and the
        remaining feature names are written to ``feature_select_info.csv``.
    :param store_key: accepted but not used by this implementation.
    :return: a new ``DataContainer`` wrapping the filtered frame.
    """
    source_frame = data_container.GetFrame()
    numeric_frame = source_frame.select_dtypes(include=None,
                                               exclude=['object'])

    numeric_container = DataContainer()
    numeric_container.SetFrame(numeric_frame)

    wants_store = bool(store_folder) and os.path.isdir(store_folder)
    if not wants_store:
        return numeric_container

    numeric_container.Save(os.path.join(store_folder, 'numeric_feature.csv'))
    SaveSelectInfo(numeric_container.GetFeatureName(),
                   os.path.join(store_folder, 'feature_select_info.csv'),
                   is_merge=False)
    return numeric_container
def Run(self, data_container, store_path=''):
    """Resample the container's data with ``self._model`` and wrap the result.

    The data and label arrays returned by ``data_container.GetData()`` are
    passed to ``self._model.fit_sample``. The case name of every resampled
    row is obtained from ``self.GetCaseNameFromAllCaseNames``, which is given
    the source container and that row.

    :param data_container: source container; must provide ``GetData()`` and
        ``GetFeatureName()``.
    :param store_path: ``''`` to skip saving. If it is an existing directory
        the result is saved there as ``<self._name>_features.csv``; any other
        value is used directly as the target file path.
    :return: a new ``DataContainer`` holding the resampled data.
    """
    features, targets, _, _ = data_container.GetData()
    resampled_features, resampled_targets = self._model.fit_sample(
        features, targets)

    row_count = resampled_features.shape[0]
    case_names = [
        self.GetCaseNameFromAllCaseNames(data_container,
                                         resampled_features[row, :])
        for row in range(row_count)
    ]

    resampled_container = DataContainer(resampled_features,
                                        resampled_targets,
                                        data_container.GetFeatureName(),
                                        case_names)

    # Nothing to persist: hand the in-memory result straight back.
    if store_path == '':
        return resampled_container

    target_path = store_path
    if os.path.isdir(store_path):
        target_path = os.path.join(store_path,
                                   '{}_features.csv'.format(self._name))
    resampled_container.Save(target_path)
    return resampled_container
def Run(self, dc: DataContainer, output_folder: str, clinical_feature=None):
    """Split ``dc`` repeatedly and keep the split with the highest mean p-value.

    Steps, in order:

    1. ``self._Cluster(dc)`` yields the feature names to keep (and sets
       ``self.feature_labels``); ``dc`` is reduced to those features.
    2. If ``clinical_feature`` is given, it is merged in through
       ``self._MergeClinical``.
    3. ``self.repeat_times`` train/test splits are drawn with
       ``self.test_ratio``; for each, per-feature p-values are estimated and
       the split whose mean p-value is the largest so far is retained.
    4. When ``output_folder`` is an existing directory, the retained split is
       written to ``train.csv`` / ``test.csv`` and the p-values together
       with the feature distribution types to ``split_info.csv``.

    :param dc: container to split; also stored on ``self.current_dc``.
    :param output_folder: target directory, or ``None`` to skip writing.
    :param clinical_feature: ``None``, a ``pandas.DataFrame``, or a path to
        a CSV file that is read with its first column as the index.
    :return: ``None``; results are only written to ``output_folder``.
    """
    self.current_dc = dc
    kept_feature_names, self.feature_labels = self._Cluster(dc)

    selector = FeatureSelector()
    reduced_dc = selector.SelectFeatureByName(dc, kept_feature_names)

    if clinical_feature is None:
        working_dc = reduced_dc
    else:
        if isinstance(clinical_feature, str):
            clinical_feature = pd.read_csv(clinical_feature, index_col=0)
        assert isinstance(clinical_feature, pd.DataFrame)
        working_dc = self._MergeClinical(reduced_dc, clinical_feature)

    # Dict describing the distribution type of each feature.
    distribution_by_feature = self._EstimateAllFeatureDistribution(working_dc)

    separator = DataSeparate()
    best_train_dc = DataContainer()
    best_test_dc = DataContainer()
    best_p_values = []
    best_mean = -1

    for _ in range(self.repeat_times):
        candidate_train, candidate_test = separator.RunByTestingPercentage(
            working_dc, testing_data_percentage=self.test_ratio)
        candidate_p_values = self._EstimateDcFeaturePvalue(
            candidate_train, candidate_test, distribution_by_feature)

        candidate_mean = np.mean(list(candidate_p_values.values()))
        if not candidate_mean > best_mean:
            continue

        best_mean = candidate_mean
        best_train_dc = candidate_train
        best_test_dc = candidate_test
        best_p_values = candidate_p_values

    if output_folder is None or not os.path.isdir(output_folder):
        return

    best_train_dc.Save(os.path.join(output_folder, 'train.csv'))
    best_test_dc.Save(os.path.join(output_folder, 'test.csv'))

    p_value_row = pd.DataFrame(best_p_values, index=['P Value'])
    distribution_row = pd.DataFrame(distribution_by_feature,
                                    index=['Distribution'])
    split_info = pd.concat((p_value_row, distribution_row), axis=0)
    split_info.to_csv(os.path.join(output_folder, 'split_info.csv'))
class PrepareConnection(QWidget, Ui_Prepare):
    """Widget for loading a feature CSV, cleaning it and saving it.

    The loaded data lives in ``self.data_container``. On save the data is
    either written to a single CSV file or, when one of the split radio
    buttons is checked, separated into training/testing parts by
    ``DataSeparate`` (by percentage, optionally with a clinical reference
    frame, or by a testing reference container).
    """

    # Emitted with ``True`` when the widget is being closed.
    close_signal = pyqtSignal(bool)

    def __init__(self, parent=None):
        """Build the UI and wire every button to its handler."""
        super(PrepareConnection, self).__init__(parent)
        self.setupUi(self)
        self.data_container = DataContainer()
        # Bare file name of this module, used as the logger name.
        self._filename = os.path.split(__file__)[-1]

        self.buttonLoad.clicked.connect(self.LoadData)
        self.buttonRemoveAndExport.clicked.connect(self.RemoveInvalidValue)

        # Optional references used when splitting the data on save.
        self.__testing_ref_data_container = DataContainer()
        self.__clinical_ref = pd.DataFrame()

        self.radioSplitRandom.clicked.connect(self.ChangeSeparateMethod)
        self.radioSplitRef.clicked.connect(self.ChangeSeparateMethod)
        self.checkUseClinicRef.clicked.connect(
            self.RandomSeparateButtonUpdates)
        self.loadTestingReference.clicked.connect(
            self.LoadTestingReferenceDataContainer)
        self.clearTestingReference.clicked.connect(
            self.ClearTestingReferenceDataContainer)
        self.loadClinicRef.clicked.connect(self.LoadClinicalRef)
        self.clearClinicRef.clicked.connect(self.ClearClinicalRef)

        self.buttonSave.clicked.connect(self.CheckAndSave)

    def closeEvent(self, QCloseEvent):
        """Emit ``close_signal`` and accept the close event."""
        self.close_signal.emit(True)
        QCloseEvent.accept()

    def UpdateTable(self):
        """Refresh the feature table and the summary text from the frame.

        At most 100 columns are displayed; when the frame has 100 or more
        columns, the 100th displayed column is replaced by a ``'...'``
        placeholder (header and cells).
        """
        frame = self.data_container.GetFrame()
        row_count = frame.shape[0]

        self.tableFeature.setRowCount(row_count)
        header_name = deepcopy(list(frame.columns))

        # Builtin min keeps this a plain int for the Qt calls below.
        min_col = min(len(header_name), 100)
        if min_col == 100:
            header_name = header_name[:100]
            header_name[-1] = '...'

        self.tableFeature.setColumnCount(min_col)
        self.tableFeature.setHorizontalHeaderLabels(header_name)
        self.tableFeature.setVerticalHeaderLabels(list(map(str, frame.index)))

        for row_index in range(row_count):
            for col_index in range(min_col):
                if col_index < 99:
                    cell_text = str(frame.iloc[row_index, col_index])
                else:
                    cell_text = '...'
                self.tableFeature.setItem(row_index, col_index,
                                          QTableWidgetItem(cell_text))

        text = "The number of cases: {:d}\n".format(row_count)

        # The label column may be spelled 'label' or 'Label'; when present it
        # is not counted as a feature.
        if 'label' in frame.columns:
            label_name = 'label'
            text += "The number of features: {:d}\n".format(frame.shape[1] - 1)
        elif 'Label' in frame.columns:
            label_name = 'Label'
            text += "The number of features: {:d}\n".format(frame.shape[1] - 1)
        else:
            label_name = ''
            text += "The number of features: {:d}\n".format(frame.shape[1])

        if label_name:
            # ``np.int`` was removed in NumPy 1.24; it was only an alias of
            # the builtin ``int``, which is used here instead.
            labels = np.asarray(frame[label_name].values, dtype=int)
            if len(np.unique(labels)) == 2:
                # The larger of the two label values counts as positive.
                positive_number = len(np.where(labels == np.max(labels))[0])
                negative_number = len(labels) - positive_number
                text += "The number of positive samples: {:d}\n".format(
                    positive_number)
                text += "The number of negative samples: {:d}\n".format(
                    negative_number)
        self.textInformation.setText(text)

    def SetButtonsState(self, state):
        """Enable or disable the controls that need loaded data."""
        self.buttonRemoveAndExport.setEnabled(state)
        self.buttonSave.setEnabled(state)
        self.checkExport.setEnabled(state)
        self.radioRemoveNone.setEnabled(state)
        self.radioRemoveNonvalidCases.setEnabled(state)
        self.radioRemoveNonvalidFeatures.setEnabled(state)
        self.radioSplitRandom.setEnabled(state)
        self.radioSplitRef.setEnabled(state)

    def LoadData(self):
        """Ask for a CSV file and load it into ``self.data_container``."""
        dlg = QFileDialog()
        file_name, _ = dlg.getOpenFileName(self, 'Open SCV file',
                                           filter="csv files (*.csv)")
        if file_name:
            try:
                if self.data_container.Load(file_name, is_update=False):
                    self.UpdateTable()
                    self.SetButtonsState(True)
            except OSError as reason:
                eclog(self._filename).GetLogger().error(
                    'Load CSV Error: {}'.format(reason))
                QMessageBox.about(self, 'Load data Error', reason.__str__())
                print('Error!' + str(reason))
            except ValueError:
                eclog(self._filename).GetLogger().error(
                    'Open CSV Error: {}'.format(file_name))
                QMessageBox.information(self, 'Error',
                                        'The selected data file mismatch.')

    def LoadTestingReferenceDataContainer(self):
        """Ask for a CSV file and load it as the testing reference."""
        dlg = QFileDialog()
        file_name, _ = dlg.getOpenFileName(self, 'Open SCV file',
                                           filter="csv files (*.csv)")
        if file_name:
            try:
                self.__testing_ref_data_container.Load(file_name)
                self.loadTestingReference.setEnabled(False)
                self.clearTestingReference.setEnabled(True)
                self.spinBoxSeparate.setEnabled(False)
            except OSError as reason:
                eclog(self._filename).GetLogger().error(
                    'Load Testing Ref Error: {}'.format(reason))
                print('Error!' + str(reason))
            except ValueError:
                eclog(self._filename).GetLogger().error(
                    'Open CSV Error: {}'.format(file_name))
                QMessageBox.information(self, 'Error',
                                        'The selected data file mismatch.')

    def ClearTestingReferenceDataContainer(self):
        """Drop the testing reference and re-enable its load button."""
        del self.__testing_ref_data_container
        self.__testing_ref_data_container = DataContainer()
        self.loadTestingReference.setEnabled(True)
        self.clearTestingReference.setEnabled(False)
        self.spinBoxSeparate.setEnabled(False)

    def LoadClinicalRef(self):
        """Ask for a CSV file and load it as the clinical reference frame.

        The file is accepted only when its index equals the index of the
        loaded data, in the same order.
        """
        dlg = QFileDialog()
        file_name, _ = dlg.getOpenFileName(self, 'Open SCV file',
                                           filter="csv files (*.csv)")
        if file_name:
            try:
                # Validate in a local first so that a rejected file never
                # replaces the stored reference used later by CheckAndSave.
                clinical_ref = pd.read_csv(file_name, index_col=0)
                if list(clinical_ref.index) != list(
                        self.data_container.GetFrame().index):
                    QMessageBox.information(
                        self, 'Error',
                        'The index of clinical features is not consistent to the data'
                    )
                    return None
                self.__clinical_ref = clinical_ref
                self.loadClinicRef.setEnabled(False)
                self.clearClinicRef.setEnabled(True)
            except OSError as reason:
                eclog(self._filename).GetLogger().error(
                    'Load Clinical Ref Error: {}'.format(reason))
                QMessageBox.information(self, 'Error',
                                        'Can not Open the Files')
            except ValueError:
                eclog(self._filename).GetLogger().error(
                    'OpenCSV Error: {}'.format(file_name))
                QMessageBox.information(self, 'Error',
                                        'The selected data file mismatch.')
        return None

    def ClearClinicalRef(self):
        """Drop the clinical reference and re-enable its load button."""
        del self.__clinical_ref
        self.__clinical_ref = pd.DataFrame()
        self.loadClinicRef.setEnabled(True)
        self.clearClinicRef.setEnabled(False)

    def RemoveInvalidValue(self):
        """Remove invalid cases or features, optionally exporting to a CSV.

        Does nothing when no data is loaded. ``store_path`` is the file
        chosen in the save dialog when the export box is checked, and ``''``
        otherwise.
        """
        if not self.data_container.IsEmpty():
            if self.checkExport.isChecked():
                dlg = QFileDialog()
                store_path, _ = dlg.getSaveFileName(
                    self, 'Save CSV feature files', 'features.csv',
                    filter="CSV files (*.csv)")
            else:
                store_path = ''

            if self.radioRemoveNonvalidCases.isChecked():
                self.data_container.RemoveInvalid(store_path=store_path,
                                                  remove_index=REMOVE_CASE)
            elif self.radioRemoveNonvalidFeatures.isChecked():
                self.data_container.RemoveInvalid(store_path=store_path,
                                                  remove_index=REMOVE_FEATURE)
            self.UpdateTable()

    def ChangeSeparateMethod(self):
        """Sync the split-related controls with the selected split mode."""
        if self.radioSplitRandom.isChecked():
            self.spinBoxSeparate.setEnabled(True)
            self.checkUseClinicRef.setEnabled(True)
            self.loadTestingReference.setEnabled(False)
            self.clearTestingReference.setEnabled(False)
        elif self.radioSplitRef.isChecked():
            self.spinBoxSeparate.setEnabled(False)
            self.checkUseClinicRef.setEnabled(False)
            # Only one of load/clear is usable, depending on whether a
            # testing reference is currently held.
            has_reference = not self.__testing_ref_data_container.IsEmpty()
            self.loadTestingReference.setEnabled(not has_reference)
            self.clearTestingReference.setEnabled(has_reference)
        self.RandomSeparateButtonUpdates()

    def RandomSeparateButtonUpdates(self):
        """Sync the clinical-reference buttons with the checkbox state."""
        if self.checkUseClinicRef.isChecked():
            has_reference = self.__clinical_ref.size > 0
            self.loadClinicRef.setEnabled(not has_reference)
            self.clearClinicRef.setEnabled(has_reference)
        else:
            self.loadClinicRef.setEnabled(False)
            self.clearClinicRef.setEnabled(False)

    def CheckAndSave(self):
        """Validate the loaded data, then save it whole or split.

        Aborts with a warning when there is no data, when invalid numbers
        are present (the first offending cell is selected in the table), or
        when the label is not binary. Features are passed through
        ``RemoveSameFeatures`` before saving.
        """
        if self.data_container.IsEmpty():
            QMessageBox.warning(self, "Warning", "There is no data",
                                QMessageBox.Ok)
            return None

        if self.data_container.HasInvalidNumber():
            QMessageBox.warning(self, "Warning", "There are nan items",
                                QMessageBox.Ok)
            non_valid_number_index = \
                self.data_container.FindInvalidNumberIndex()
            # Temporarily switch edit triggers while moving the current cell
            # to the invalid entry, then restore the previous setting.
            old_edit_triggers = self.tableFeature.editTriggers()
            self.tableFeature.setEditTriggers(QAbstractItemView.CurrentChanged)
            self.tableFeature.setCurrentCell(non_valid_number_index[0],
                                             non_valid_number_index[1])
            self.tableFeature.setEditTriggers(old_edit_triggers)
            return None

        self.data_container.UpdateDataByFrame()

        if not self.data_container.IsBinaryLabel():
            QMessageBox.warning(self, "Warning", "There are not 2 Labels",
                                QMessageBox.Ok)
            return None

        remove_features_with_same_value = RemoveSameFeatures()
        self.data_container = remove_features_with_same_value.Run(
            self.data_container)

        if self.radioSplitRandom.isChecked() or self.radioSplitRef.isChecked():
            folder_name = QFileDialog.getExistingDirectory(self, "Save data")
            if folder_name != '':
                data_separate = DataSeparate.DataSeparate()
                try:
                    if self.__testing_ref_data_container.IsEmpty():
                        testing_data_percentage = self.spinBoxSeparate.value()
                        if self.__clinical_ref.size == 0:
                            training_data_container, _, = \
                                data_separate.RunByTestingPercentage(
                                    self.data_container,
                                    testing_data_percentage,
                                    store_folder=folder_name)
                        else:
                            training_data_container, _, = \
                                data_separate.RunByTestingPercentage(
                                    self.data_container,
                                    testing_data_percentage,
                                    clinic_df=self.__clinical_ref,
                                    store_folder=folder_name)
                    else:
                        training_data_container, _, = \
                            data_separate.RunByTestingReference(
                                self.data_container,
                                self.__testing_ref_data_container,
                                folder_name)
                        if training_data_container.IsEmpty():
                            QMessageBox.information(
                                self, 'Error',
                                'The testing data does not mismatch, please check the testing data '
                                'really exists in current data')
                            return None
                    # Show the output folder in the file explorer.
                    os.system("explorer.exe {:s}".format(
                        os.path.normpath(folder_name)))
                except Exception as e:
                    # UI boundary: log and report instead of crashing.
                    content = 'PrepareConnection, splitting failed: '
                    eclog(self._filename).GetLogger().error('Split Error: ' +
                                                            e.__str__())
                    QMessageBox.about(self, content, e.__str__())
        else:
            file_name, _ = QFileDialog.getSaveFileName(
                self, "Save data", filter="csv files (*.csv)")
            if file_name:
                self.data_container.Save(file_name)