Beispiel #1
0
    def Run(self, data_container, store_path=''):
        data, label, feature_name, label_name = data_container.GetData()
        data_resampled, label_resampled = self._model.fit_sample(data, label)

        new_case_name = [
            'Generate' + str(index) for index in range(data_resampled.shape[0])
        ]
        new_data_container = DataContainer(data_resampled, label_resampled,
                                           data_container.GetFeatureName(),
                                           new_case_name)
        if store_path != '':
            if os.path.isdir(store_path):
                new_data_container.Save(
                    os.path.join(store_path,
                                 '{}_features.csv'.format(self._name)))
            else:
                new_data_container.Save(store_path)
        return new_data_container
Beispiel #2
0
    def Run(self, data_container, store_folder='', store_key=''):
        temp_frame = data_container.GetFrame().select_dtypes(include=None, exclude=['object'])
        new_data_container = DataContainer()
        new_data_container.SetFrame(temp_frame)
        if store_folder and os.path.isdir(store_folder):
            feature_store_path = os.path.join(store_folder, 'numeric_feature.csv')
            featureinfo_store_path = os.path.join(store_folder, 'feature_select_info.csv')

            new_data_container.Save(feature_store_path)
            SaveSelectInfo(new_data_container.GetFeatureName(), featureinfo_store_path, is_merge=False)

        return new_data_container
Beispiel #3
0
    def Run(self, data_container, store_path=''):
        data, label, feature_name, label_name = data_container.GetData()
        data_resampled, label_resampled = self._model.fit_sample(data, label)

        new_case_name = []
        for index in range(data_resampled.shape[0]):
            new_case_name.append(
                self.GetCaseNameFromAllCaseNames(data_container,
                                                 data_resampled[index, :]))

        new_data_container = DataContainer(data_resampled, label_resampled,
                                           data_container.GetFeatureName(),
                                           new_case_name)
        if store_path != '':
            if os.path.isdir(store_path):
                new_data_container.Save(
                    os.path.join(store_path,
                                 '{}_features.csv'.format(self._name)))
            else:
                new_data_container.Save(store_path)
        return new_data_container
Beispiel #4
0
    def Run(self,
            dc: DataContainer,
            output_folder: str,
            clinical_feature=None):
        self.current_dc = dc
        selected_feature_names, self.feature_labels = self._Cluster(dc)

        fs = FeatureSelector()
        selected_dc = fs.SelectFeatureByName(dc, selected_feature_names)

        if clinical_feature is not None:
            if isinstance(clinical_feature, str):
                clinical_feature = pd.read_csv(clinical_feature, index_col=0)
            assert (isinstance(clinical_feature, pd.DataFrame))

            merge_dc = self._MergeClinical(selected_dc, clinical_feature)
        else:
            merge_dc = selected_dc

        feature_distribution_type = self._EstimateAllFeatureDistribution(
            merge_dc)  # a dict

        splitter = DataSeparate()

        output_train_dc, output_test_dc = DataContainer(), DataContainer()
        output_p_value = []
        mean_p_value = -1

        for _ in range(self.repeat_times):
            train_dc, test_dc = splitter.RunByTestingPercentage(
                merge_dc, testing_data_percentage=self.test_ratio)
            feature_p_value = self._EstimateDcFeaturePvalue(
                train_dc, test_dc, feature_distribution_type)
            if np.mean(list(feature_p_value.values())) > mean_p_value:
                mean_p_value = np.mean(list(feature_p_value.values()))
                output_train_dc, output_test_dc = train_dc, test_dc
                output_p_value = feature_p_value

        if output_folder is not None and os.path.isdir(output_folder):
            output_train_dc.Save(os.path.join(output_folder, 'train.csv'))
            output_test_dc.Save(os.path.join(output_folder, 'test.csv'))

            p_value_df = pd.DataFrame(output_p_value, index=['P Value'])
            distribute_df = pd.DataFrame(feature_distribution_type,
                                         index=['Distribution'])
            store_df = pd.concat((p_value_df, distribute_df), axis=0)
            store_df.to_csv(os.path.join(output_folder, 'split_info.csv'))
Beispiel #5
0
class PrepareConnection(QWidget, Ui_Prepare):
    close_signal = pyqtSignal(bool)

    def __init__(self, parent=None):
        super(PrepareConnection, self).__init__(parent)
        self.setupUi(self)
        self.data_container = DataContainer()
        self._filename = os.path.split(__file__)[-1]

        self.buttonLoad.clicked.connect(self.LoadData)
        self.buttonRemoveAndExport.clicked.connect(self.RemoveInvalidValue)

        self.__testing_ref_data_container = DataContainer()
        self.__clinical_ref = pd.DataFrame()

        self.radioSplitRandom.clicked.connect(self.ChangeSeparateMethod)
        self.radioSplitRef.clicked.connect(self.ChangeSeparateMethod)
        self.checkUseClinicRef.clicked.connect(
            self.RandomSeparateButtonUpdates)
        self.loadTestingReference.clicked.connect(
            self.LoadTestingReferenceDataContainer)
        self.clearTestingReference.clicked.connect(
            self.ClearTestingReferenceDataContainer)
        self.loadClinicRef.clicked.connect(self.LoadClinicalRef)
        self.clearClinicRef.clicked.connect(self.ClearClinicalRef)

        self.buttonSave.clicked.connect(self.CheckAndSave)

    def closeEvent(self, QCloseEvent):
        self.close_signal.emit(True)
        QCloseEvent.accept()

    def UpdateTable(self):
        self.tableFeature.setRowCount(self.data_container.GetFrame().shape[0])
        header_name = deepcopy(list(self.data_container.GetFrame().columns))

        min_col = np.min([len(header_name), 100])
        if min_col == 100:
            header_name = header_name[:100]
            header_name[-1] = '...'

        self.tableFeature.setColumnCount(min_col)
        self.tableFeature.setHorizontalHeaderLabels(header_name)
        self.tableFeature.setVerticalHeaderLabels(
            list(map(str,
                     self.data_container.GetFrame().index)))

        for row_index in range(self.data_container.GetFrame().shape[0]):
            for col_index in range(min_col):
                if col_index < 99:
                    self.tableFeature.setItem(
                        row_index, col_index,
                        QTableWidgetItem(
                            str(self.data_container.GetFrame().iloc[
                                row_index, col_index])))
                else:
                    self.tableFeature.setItem(row_index, col_index,
                                              QTableWidgetItem('...'))

        text = "The number of cases: {:d}\n".format(
            self.data_container.GetFrame().shape[0])
        # To process Label temporally
        if 'label' in self.data_container.GetFrame().columns:
            label_name = 'label'
            text += "The number of features: {:d}\n".format(
                self.data_container.GetFrame().shape[1] - 1)
        elif 'Label' in self.data_container.GetFrame().columns:
            label_name = 'Label'
            text += "The number of features: {:d}\n".format(
                self.data_container.GetFrame().shape[1] - 1)
        else:
            label_name = ''
            text += "The number of features: {:d}\n".format(
                self.data_container.GetFrame().shape[1])
        if label_name:
            labels = np.asarray(
                self.data_container.GetFrame()[label_name].values,
                dtype=np.int)
            if len(np.unique(labels)) == 2:
                positive_number = len(np.where(labels == np.max(labels))[0])
                negative_number = len(labels) - positive_number
                assert (positive_number + negative_number == len(labels))
                text += "The number of positive samples: {:d}\n".format(
                    positive_number)
                text += "The number of negative samples: {:d}\n".format(
                    negative_number)
        self.textInformation.setText(text)

    def SetButtonsState(self, state):
        self.buttonRemoveAndExport.setEnabled(state)
        self.buttonSave.setEnabled(state)
        self.checkExport.setEnabled(state)
        self.radioRemoveNone.setEnabled(state)
        self.radioRemoveNonvalidCases.setEnabled(state)
        self.radioRemoveNonvalidFeatures.setEnabled(state)
        self.radioSplitRandom.setEnabled(state)
        self.radioSplitRef.setEnabled(state)

    def LoadData(self):
        dlg = QFileDialog()
        file_name, _ = dlg.getOpenFileName(self,
                                           'Open SCV file',
                                           filter="csv files (*.csv)")
        if file_name:
            try:
                if self.data_container.Load(file_name, is_update=False):
                    self.UpdateTable()
                    self.SetButtonsState(True)

            except OSError as reason:
                eclog(self._filename).GetLogger().error(
                    'Load CSV Error: {}'.format(reason))
                QMessageBox.about(self, 'Load data Error', reason.__str__())
                print('Error!' + str(reason))
            except ValueError:
                eclog(self._filename).GetLogger().error(
                    'Open CSV Error: {}'.format(file_name))
                QMessageBox.information(self, 'Error',
                                        'The selected data file mismatch.')

    def LoadTestingReferenceDataContainer(self):
        dlg = QFileDialog()
        file_name, _ = dlg.getOpenFileName(self,
                                           'Open SCV file',
                                           filter="csv files (*.csv)")
        if file_name:
            try:
                self.__testing_ref_data_container.Load(file_name)
                self.loadTestingReference.setEnabled(False)
                self.clearTestingReference.setEnabled(True)
                self.spinBoxSeparate.setEnabled(False)
            except OSError as reason:
                eclog(self._filename).GetLogger().error(
                    'Load Testing Ref Error: {}'.format(reason))
                print('Error!' + str(reason))
            except ValueError:
                eclog(self._filename).GetLogger().error(
                    'Open CSV Error: {}'.format(file_name))
                QMessageBox.information(self, 'Error',
                                        'The selected data file mismatch.')

    def ClearTestingReferenceDataContainer(self):
        del self.__testing_ref_data_container
        self.__testing_ref_data_container = DataContainer()
        self.loadTestingReference.setEnabled(True)
        self.clearTestingReference.setEnabled(False)
        self.spinBoxSeparate.setEnabled(False)

    def LoadClinicalRef(self):
        dlg = QFileDialog()
        file_name, _ = dlg.getOpenFileName(self,
                                           'Open SCV file',
                                           filter="csv files (*.csv)")
        if file_name:
            try:
                self.__clinical_ref = pd.read_csv(file_name, index_col=0)
                if list(self.__clinical_ref.index) != list(
                        self.data_container.GetFrame().index):
                    QMessageBox.information(
                        self, 'Error',
                        'The index of clinical features is not consistent to the data'
                    )
                    return None
                self.loadClinicRef.setEnabled(False)
                self.clearClinicRef.setEnabled(True)
            except OSError as reason:
                eclog(self._filename).GetLogger().error(
                    'Load Clinical Ref Error: {}'.format(reason))
                QMessageBox.information(self, 'Error',
                                        'Can not Open the Files')
            except ValueError:
                eclog(self._filename).GetLogger().error(
                    'OpenCSV Error: {}'.format(file_name))
                QMessageBox.information(self, 'Error',
                                        'The selected data file mismatch.')
            return None

    def ClearClinicalRef(self):
        del self.__clinical_ref
        self.__clinical_ref = pd.DataFrame()
        self.loadClinicRef.setEnabled(True)
        self.clearClinicRef.setEnabled(False)

    def RemoveInvalidValue(self):
        if not self.data_container.IsEmpty():
            if self.checkExport.isChecked():
                dlg = QFileDialog()
                store_path, _ = dlg.getSaveFileName(self,
                                                    'Save CSV feature files',
                                                    'features.csv',
                                                    filter="CSV files (*.csv)")

                # folder_name = QFileDialog.getExistingDirectory(self, "Save Invalid data")
                # store_path = os.path.join(folder_name, 'invalid_feature.csv')
            else:
                store_path = ''

            if self.radioRemoveNonvalidCases.isChecked():
                self.data_container.RemoveInvalid(store_path=store_path,
                                                  remove_index=REMOVE_CASE)
            elif self.radioRemoveNonvalidFeatures.isChecked():
                self.data_container.RemoveInvalid(store_path=store_path,
                                                  remove_index=REMOVE_FEATURE)
            self.UpdateTable()

    def ChangeSeparateMethod(self):
        if self.radioSplitRandom.isChecked():
            self.spinBoxSeparate.setEnabled(True)
            self.checkUseClinicRef.setEnabled(True)
            self.loadTestingReference.setEnabled(False)
            self.clearTestingReference.setEnabled(False)
        elif self.radioSplitRef.isChecked():
            self.spinBoxSeparate.setEnabled(False)
            self.checkUseClinicRef.setEnabled(False)
            if self.__testing_ref_data_container.IsEmpty():
                self.loadTestingReference.setEnabled(True)
                self.clearTestingReference.setEnabled(False)
            else:
                self.loadTestingReference.setEnabled(False)
                self.clearTestingReference.setEnabled(True)
        self.RandomSeparateButtonUpdates()

    def RandomSeparateButtonUpdates(self):
        if self.checkUseClinicRef.isChecked():
            if self.__clinical_ref.size > 0:
                self.loadClinicRef.setEnabled(False)
                self.clearClinicRef.setEnabled(True)
            else:
                self.loadClinicRef.setEnabled(True)
                self.clearClinicRef.setEnabled(False)
        else:
            self.loadClinicRef.setEnabled(False)
            self.clearClinicRef.setEnabled(False)

    def CheckAndSave(self):
        if self.data_container.IsEmpty():
            QMessageBox.warning(self, "Warning", "There is no data",
                                QMessageBox.Ok)
            return None

        if self.data_container.HasInvalidNumber():
            QMessageBox.warning(self, "Warning", "There are nan items",
                                QMessageBox.Ok)
            non_valid_number_index = self.data_container.FindInvalidNumberIndex(
            )
            old_edit_triggers = self.tableFeature.editTriggers()
            self.tableFeature.setEditTriggers(QAbstractItemView.CurrentChanged)
            self.tableFeature.setCurrentCell(non_valid_number_index[0],
                                             non_valid_number_index[1])
            self.tableFeature.setEditTriggers(old_edit_triggers)
            return None

        self.data_container.UpdateDataByFrame()

        if not self.data_container.IsBinaryLabel():
            QMessageBox.warning(self, "Warning", "There are not 2 Labels",
                                QMessageBox.Ok)
            return None

        remove_features_with_same_value = RemoveSameFeatures()
        self.data_container = remove_features_with_same_value.Run(
            self.data_container)

        if self.radioSplitRandom.isChecked() or self.radioSplitRef.isChecked():
            folder_name = QFileDialog.getExistingDirectory(self, "Save data")
            if folder_name != '':
                data_separate = DataSeparate.DataSeparate()
                try:
                    if self.__testing_ref_data_container.IsEmpty():
                        testing_data_percentage = self.spinBoxSeparate.value()
                        if self.__clinical_ref.size == 0:
                            training_data_container, _, = \
                                data_separate.RunByTestingPercentage(self.data_container,
                                                                     testing_data_percentage,
                                                                     store_folder=folder_name)
                        else:
                            training_data_container, _, = \
                                data_separate.RunByTestingPercentage(self.data_container,
                                                                     testing_data_percentage,
                                                                     clinic_df=self.__clinical_ref,
                                                                     store_folder=folder_name)
                    else:
                        training_data_container, _, = \
                            data_separate.RunByTestingReference(self.data_container,
                                                                self.__testing_ref_data_container,
                                                                folder_name)
                        if training_data_container.IsEmpty():
                            QMessageBox.information(
                                self, 'Error',
                                'The testing data does not mismatch, please check the testing data '
                                'really exists in current data')
                            return None
                    os.system("explorer.exe {:s}".format(
                        os.path.normpath(folder_name)))
                except Exception as e:
                    content = 'PrepareConnection, splitting failed: '
                    eclog(self._filename).GetLogger().error('Split Error:  ' +
                                                            e.__str__())
                    QMessageBox.about(self, content, e.__str__())

        else:
            file_name, _ = QFileDialog.getSaveFileName(
                self, "Save data", filter="csv files (*.csv)")
            if file_name:
                self.data_container.Save(file_name)