Example #1
0
class PrepareConnection(QWidget, Ui_Prepare):
    """Qt widget for loading, cleaning, splitting and saving a feature table.

    The controls themselves are created by ``setupUi``; this class connects
    their signals to the handlers below and keeps three pieces of state:

    * ``data_container``: the feature table currently shown and edited,
    * a testing-reference ``DataContainer`` used to split by reference,
    * a clinical-reference ``pandas.DataFrame`` forwarded to the random split.
    """

    # Emitted with True from closeEvent when the widget is closed.
    close_signal = pyqtSignal(bool)

    # UpdateTable renders at most this many columns; when the frame is wider,
    # the last rendered column is a '...' placeholder.
    _MAX_SHOWN_COLUMNS = 100

    def __init__(self, parent=None):
        """Build the UI, create empty state and connect all signals."""
        super(PrepareConnection, self).__init__(parent)
        self.setupUi(self)
        self.data_container = DataContainer()
        # File name (without directory) handed to eclog for every log entry.
        self._filename = os.path.split(__file__)[-1]

        self.buttonLoad.clicked.connect(self.LoadData)
        self.buttonRemoveAndExport.clicked.connect(self.RemoveInvalidValue)

        self.__testing_ref_data_container = DataContainer()
        self.__clinical_ref = pd.DataFrame()

        self.radioSplitRandom.clicked.connect(self.ChangeSeparateMethod)
        self.radioSplitRef.clicked.connect(self.ChangeSeparateMethod)
        self.checkUseClinicRef.clicked.connect(
            self.RandomSeparateButtonUpdates)
        self.loadTestingReference.clicked.connect(
            self.LoadTestingReferenceDataContainer)
        self.clearTestingReference.clicked.connect(
            self.ClearTestingReferenceDataContainer)
        self.loadClinicRef.clicked.connect(self.LoadClinicalRef)
        self.clearClinicRef.clicked.connect(self.ClearClinicalRef)

        self.buttonSave.clicked.connect(self.CheckAndSave)

    def closeEvent(self, QCloseEvent):
        """Emit ``close_signal(True)`` and accept the close event."""
        self.close_signal.emit(True)
        QCloseEvent.accept()

    def _SelectCsvFile(self):
        """Show an open-file dialog filtered to CSV and return the chosen path.

        The returned value is falsy when no file was chosen.
        """
        file_name, _ = QFileDialog.getOpenFileName(self,
                                                   'Open CSV file',
                                                   filter="csv files (*.csv)")
        return file_name

    def UpdateTable(self):
        """Refresh the table widget and the summary text from data_container.

        At most ``_MAX_SHOWN_COLUMNS`` columns are rendered. Only when the
        frame has more columns than that is the last rendered column replaced
        by a '...' placeholder; a frame with exactly that many columns is
        shown in full.
        """
        frame = self.data_container.GetFrame()
        row_count, total_columns = frame.shape
        header_name = list(frame.columns)

        is_truncated = total_columns > self._MAX_SHOWN_COLUMNS
        if is_truncated:
            shown_columns = self._MAX_SHOWN_COLUMNS
            # The last shown column is reserved for the placeholder.
            value_columns = shown_columns - 1
            header_name = header_name[:value_columns] + ['...']
        else:
            shown_columns = value_columns = total_columns

        self.tableFeature.setRowCount(row_count)
        self.tableFeature.setColumnCount(shown_columns)
        self.tableFeature.setHorizontalHeaderLabels(header_name)
        self.tableFeature.setVerticalHeaderLabels(list(map(str, frame.index)))

        for row_index in range(row_count):
            for col_index in range(value_columns):
                self.tableFeature.setItem(
                    row_index, col_index,
                    QTableWidgetItem(str(frame.iloc[row_index, col_index])))
            if is_truncated:
                self.tableFeature.setItem(row_index, value_columns,
                                          QTableWidgetItem('...'))

        text = "The number of cases: {:d}\n".format(row_count)
        # The label column, when present, is not counted as a feature.
        label_name = next(
            (name for name in ('label', 'Label') if name in frame.columns),
            '')
        feature_count = total_columns - 1 if label_name else total_columns
        text += "The number of features: {:d}\n".format(feature_count)

        if label_name:
            # The builtin int is used because the np.int alias no longer
            # exists in current NumPy releases.
            labels = np.asarray(frame[label_name].values, dtype=int)
            if len(np.unique(labels)) == 2:
                # The larger of the two label values counts as positive.
                positive_number = int(
                    np.count_nonzero(labels == np.max(labels)))
                negative_number = len(labels) - positive_number
                text += "The number of positive samples: {:d}\n".format(
                    positive_number)
                text += "The number of negative samples: {:d}\n".format(
                    negative_number)
        self.textInformation.setText(text)

    def SetButtonsState(self, state):
        """Enable or disable every control that needs loaded data."""
        for widget in (self.buttonRemoveAndExport,
                       self.buttonSave,
                       self.checkExport,
                       self.radioRemoveNone,
                       self.radioRemoveNonvalidCases,
                       self.radioRemoveNonvalidFeatures,
                       self.radioSplitRandom,
                       self.radioSplitRef):
            widget.setEnabled(state)

    def LoadData(self):
        """Load a CSV chosen by the user into data_container and show it."""
        file_name = self._SelectCsvFile()
        if not file_name:
            return None

        try:
            if self.data_container.Load(file_name, is_update=False):
                self.UpdateTable()
                self.SetButtonsState(True)
        except OSError as reason:
            eclog(self._filename).GetLogger().error(
                'Load CSV Error: {}'.format(reason))
            QMessageBox.about(self, 'Load data Error', str(reason))
            print('Error!' + str(reason))
        except ValueError:
            eclog(self._filename).GetLogger().error(
                'Open CSV Error: {}'.format(file_name))
            QMessageBox.information(self, 'Error',
                                    'The selected data file mismatch.')
        return None

    def LoadTestingReferenceDataContainer(self):
        """Load the testing-reference CSV and switch the related buttons."""
        file_name = self._SelectCsvFile()
        if not file_name:
            return None

        try:
            self.__testing_ref_data_container.Load(file_name)
            self.loadTestingReference.setEnabled(False)
            self.clearTestingReference.setEnabled(True)
            self.spinBoxSeparate.setEnabled(False)
        except OSError as reason:
            eclog(self._filename).GetLogger().error(
                'Load Testing Ref Error: {}'.format(reason))
            print('Error!' + str(reason))
        except ValueError:
            eclog(self._filename).GetLogger().error(
                'Open CSV Error: {}'.format(file_name))
            QMessageBox.information(self, 'Error',
                                    'The selected data file mismatch.')
        return None

    def ClearTestingReferenceDataContainer(self):
        """Drop the testing reference and allow loading a new one."""
        self.__testing_ref_data_container = DataContainer()
        self.loadTestingReference.setEnabled(True)
        self.clearTestingReference.setEnabled(False)
        self.spinBoxSeparate.setEnabled(False)

    def LoadClinicalRef(self):
        """Load a clinical-reference CSV whose index matches the loaded data.

        The file is read into a local frame first and stored only after its
        index has been checked against the index of the data frame, so a
        rejected file can never be used by a later split.
        """
        file_name = self._SelectCsvFile()
        if not file_name:
            return None

        try:
            clinical_ref = pd.read_csv(file_name, index_col=0)
        except OSError as reason:
            eclog(self._filename).GetLogger().error(
                'Load Clinical Ref Error: {}'.format(reason))
            QMessageBox.information(self, 'Error', 'Can not Open the Files')
            return None
        except ValueError:
            eclog(self._filename).GetLogger().error(
                'OpenCSV Error: {}'.format(file_name))
            QMessageBox.information(self, 'Error',
                                    'The selected data file mismatch.')
            return None

        if list(clinical_ref.index) != list(
                self.data_container.GetFrame().index):
            QMessageBox.information(
                self, 'Error',
                'The index of clinical features is not consistent to the data'
            )
            return None

        self.__clinical_ref = clinical_ref
        self.loadClinicRef.setEnabled(False)
        self.clearClinicRef.setEnabled(True)
        return None

    def ClearClinicalRef(self):
        """Drop the clinical reference and allow loading a new one."""
        self.__clinical_ref = pd.DataFrame()
        self.loadClinicRef.setEnabled(True)
        self.clearClinicRef.setEnabled(False)

    def RemoveInvalidValue(self):
        """Remove invalid cases or features according to the radio buttons.

        When the export check box is ticked the user is asked for a CSV path
        that is forwarded to ``RemoveInvalid`` as ``store_path``; otherwise an
        empty path is passed. Nothing happens when no data is loaded.
        """
        if self.data_container.IsEmpty():
            return None

        if self.checkExport.isChecked():
            store_path, _ = QFileDialog.getSaveFileName(
                self,
                'Save CSV feature files',
                'features.csv',
                filter="CSV files (*.csv)")
        else:
            store_path = ''

        if self.radioRemoveNonvalidCases.isChecked():
            self.data_container.RemoveInvalid(store_path=store_path,
                                              remove_index=REMOVE_CASE)
        elif self.radioRemoveNonvalidFeatures.isChecked():
            self.data_container.RemoveInvalid(store_path=store_path,
                                              remove_index=REMOVE_FEATURE)
        self.UpdateTable()
        return None

    def ChangeSeparateMethod(self):
        """Enable the controls that belong to the selected split method."""
        if self.radioSplitRandom.isChecked():
            self.spinBoxSeparate.setEnabled(True)
            self.checkUseClinicRef.setEnabled(True)
            self.loadTestingReference.setEnabled(False)
            self.clearTestingReference.setEnabled(False)
        elif self.radioSplitRef.isChecked():
            self.spinBoxSeparate.setEnabled(False)
            self.checkUseClinicRef.setEnabled(False)
            # Exactly one of load/clear is available, depending on whether a
            # testing reference is currently held.
            has_reference = not self.__testing_ref_data_container.IsEmpty()
            self.loadTestingReference.setEnabled(not has_reference)
            self.clearTestingReference.setEnabled(has_reference)
        self.RandomSeparateButtonUpdates()

    def RandomSeparateButtonUpdates(self):
        """Enable load/clear of the clinical reference as appropriate.

        Both buttons are disabled unless the "use clinical reference" box is
        checked; when it is, "clear" is offered if a reference is held and
        "load" otherwise.
        """
        use_clinic_ref = self.checkUseClinicRef.isChecked()
        has_clinic_ref = self.__clinical_ref.size > 0
        self.loadClinicRef.setEnabled(use_clinic_ref and not has_clinic_ref)
        self.clearClinicRef.setEnabled(use_clinic_ref and has_clinic_ref)

    def _OpenFolder(self, folder_name):
        """Open *folder_name* in Windows Explorer on a best-effort basis.

        The path is passed as a separate argument rather than through a shell
        command string, so folders containing spaces or shell metacharacters
        are handled. Failure to start Explorer is logged and otherwise
        ignored.
        """
        import subprocess

        try:
            # No return-code check: only launching the program matters here.
            subprocess.run(['explorer.exe', os.path.normpath(folder_name)])
        except OSError as reason:
            eclog(self._filename).GetLogger().error(
                'Open folder Error: {}'.format(reason))

    def _SplitAndStore(self, folder_name):
        """Split data_container into training/testing sets in *folder_name*.

        Without a testing reference the split is done by the percentage in
        the spin box (passing the clinical reference when one is loaded);
        with a testing reference the split follows that reference. Any
        exception is logged and shown to the user.
        """
        data_separate = DataSeparate.DataSeparate()
        try:
            if self.__testing_ref_data_container.IsEmpty():
                split_options = {'store_folder': folder_name}
                if self.__clinical_ref.size != 0:
                    split_options['clinic_df'] = self.__clinical_ref
                training_data_container, _ = \
                    data_separate.RunByTestingPercentage(
                        self.data_container,
                        self.spinBoxSeparate.value(),
                        **split_options)
            else:
                training_data_container, _ = \
                    data_separate.RunByTestingReference(
                        self.data_container,
                        self.__testing_ref_data_container,
                        folder_name)
                if training_data_container.IsEmpty():
                    QMessageBox.information(
                        self, 'Error',
                        'The testing data does not match, please check the testing data '
                        'really exists in current data')
                    return None
            self._OpenFolder(folder_name)
        except Exception as e:
            content = 'PrepareConnection, splitting failed: '
            eclog(self._filename).GetLogger().error('Split Error:  ' +
                                                    str(e))
            QMessageBox.about(self, content, str(e))
        return None

    def CheckAndSave(self):
        """Validate the loaded data, then split it or save it as one CSV.

        The method stops with a warning when there is no data, when invalid
        numbers are present (the first offending cell is selected in the
        table) or when the label is not binary. Otherwise features with
        identical values are removed via ``RemoveSameFeatures`` and the data
        is split into a chosen folder (when a split method is selected) or
        saved to a chosen CSV file.
        """
        if self.data_container.IsEmpty():
            QMessageBox.warning(self, "Warning", "There is no data",
                                QMessageBox.Ok)
            return None

        if self.data_container.HasInvalidNumber():
            QMessageBox.warning(self, "Warning", "There are nan items",
                                QMessageBox.Ok)
            non_valid_number_index = \
                self.data_container.FindInvalidNumberIndex()
            # Temporarily switch edit triggers while moving the current cell,
            # then restore the previous setting.
            old_edit_triggers = self.tableFeature.editTriggers()
            self.tableFeature.setEditTriggers(QAbstractItemView.CurrentChanged)
            self.tableFeature.setCurrentCell(non_valid_number_index[0],
                                             non_valid_number_index[1])
            self.tableFeature.setEditTriggers(old_edit_triggers)
            return None

        self.data_container.UpdateDataByFrame()

        if not self.data_container.IsBinaryLabel():
            QMessageBox.warning(self, "Warning", "There are not 2 Labels",
                                QMessageBox.Ok)
            return None

        remove_features_with_same_value = RemoveSameFeatures()
        self.data_container = remove_features_with_same_value.Run(
            self.data_container)

        if self.radioSplitRandom.isChecked() or self.radioSplitRef.isChecked():
            folder_name = QFileDialog.getExistingDirectory(self, "Save data")
            if folder_name != '':
                self._SplitAndStore(folder_name)
        else:
            file_name, _ = QFileDialog.getSaveFileName(
                self, "Save data", filter="csv files (*.csv)")
            if file_name:
                self.data_container.Save(file_name)
        return None
Example #2
0
class DataSplitterByFeatureCluster(object):
    """Pick a train/test split whose feature distributions are most alike.

    Features are clustered with KMeans and one representative feature is kept
    per cluster. The reduced data (optionally merged with clinical features)
    is then split at random ``repeat_times`` times, and the split with the
    highest mean per-feature p-value between train and test is kept.
    """

    def __init__(self,
                 parts=30,
                 repeat_times=100,
                 test_ratio=0.3,
                 random_seed=10):
        """Store the configuration.

        Args:
            parts: number of KMeans clusters, i.e. representative features.
            repeat_times: number of random splits evaluated by ``Run``.
            test_ratio: forwarded to the splitter as
                ``testing_data_percentage``.
            random_seed: ``random_state`` of the KMeans used in ``_Cluster``.
        """
        self.parts = parts
        self.repeat_times = repeat_times
        self.test_ratio = test_ratio
        self.random_seed = random_seed

        # Cluster label of every feature and the container they belong to;
        # both are filled by Run and read by VisualizeCluster.
        self.feature_labels = []
        self.current_dc = DataContainer()

    #################################################
    def _DataPreProcess(self, dc):
        """Min-max scale ``dc.GetArray()`` column-wise and transpose it.

        After the transpose each row of the result corresponds to one column
        of the original array (one feature, given that the array is indexed
        as ``[:, feature_index]`` in ``_EstimateDcFeaturePvalue``).
        """
        data = dc.GetArray()
        min_max_scaler = preprocessing.MinMaxScaler()
        processed_data = min_max_scaler.fit_transform(data).T
        return processed_data

    def _Cluster(self, dc):
        """Cluster the features and pick the one nearest to each centre.

        Returns:
            A tuple ``(selected_feature_names, labels)``: the feature closest
            (squared Euclidean distance) to each cluster centre, and the
            KMeans label of every feature.
        """
        processed_data = self._DataPreProcess(dc)
        feature_name_list = dc.GetFeatureName()
        k_means = KMeans(n_clusters=self.parts,
                         random_state=self.random_seed,
                         init='k-means++')
        k_means.fit(processed_data)

        feature_predict = k_means.labels_
        cluster_centers = k_means.cluster_centers_

        # Per-cluster bookkeeping: member count, member names and each
        # member's squared distance to its centre.
        count_label = [0] * self.parts
        count_feature = [[] for _ in range(self.parts)]
        count_distance = [[] for _ in range(self.parts)]

        for j, feature_name in enumerate(feature_name_list):
            cluster_index = feature_predict[j]
            count_label[cluster_index] += 1
            count_feature[cluster_index].append(feature_name)

            distance = np.square(processed_data[j] -
                                 cluster_centers[cluster_index]).sum()
            count_distance[cluster_index].append(distance)

        print('The number of feature in each class \n', count_label)
        min_distance_feature = []
        for k in range(self.parts):
            k_distance = count_distance[k]
            nearest_distance = min(k_distance)
            selected_feature = count_feature[k][k_distance.index(
                nearest_distance)]
            min_distance_feature.append(selected_feature)
            print('min distance feature in this class {} is {}'.format(
                k, selected_feature))
            print('its distance is', nearest_distance,
                  'while max distance is', max(k_distance))
        return min_distance_feature, feature_predict

    def _MergeClinical(self, dc, cli_df):
        """Left-join clinical features onto the frame of *dc* by index.

        A 'label' (or, failing that, 'Label') column in *cli_df* is left out
        of the merge. The caller's DataFrame is not modified.
        """
        for label_name in ('label', 'Label'):
            if label_name in cli_df.columns:
                # drop() returns a new frame, so cli_df itself stays intact.
                cli_df = cli_df.drop(columns=label_name)
                break

        df = pd.merge(dc.GetFrame(),
                      cli_df,
                      how='left',
                      left_index=True,
                      right_index=True)
        merge_dc = DataContainer()
        merge_dc.SetFrame(df)
        merge_dc.UpdateFrameByData()
        return merge_dc

    def _EstimateAllFeatureDistribution(self, dc):
        """Classify every feature as 'Category', 'Normal' or 'Non-normal'.

        Returns:
            A dict such as ``{'AGE': 'Normal', 'Gender': 'Category', ...}``.
        """
        frame = dc.GetFrame()
        distribution = dict()
        for feature in dc.GetFeatureName():
            feature_data = frame[feature]
            # TODO: a better way to distinguish discrete numeric values
            if len(set(feature_data)) < 10:
                # The normality test is skipped here: its result would be
                # ignored, and it may warn or fail on very small samples.
                distribution[feature] = 'Category'
                continue

            _, normal_p = normaltest(feature_data, axis=0)
            if normal_p > 0.05:
                distribution[feature] = 'Normal'
            else:
                distribution[feature] = 'Non-normal'
        return distribution

    def _EstimateDcFeaturePvalue(self, dc1, dc2, feature_type):
        """Return ``{feature: GetPvalue(...)}`` comparing *dc1* with *dc2*.

        Column ``index`` of both arrays is taken as the values of the
        ``index``-th name in ``dc1.GetFeatureName()``.
        """
        array1, array2 = dc1.GetArray(), dc2.GetArray()
        p_values = {}
        for index, feature in enumerate(dc1.GetFeatureName()):
            p_values[feature] = GetPvalue(array1[:, index], array2[:, index],
                                          feature_type[feature])

        return p_values

    #################################################
    def VisualizePartsVariance(self,
                               dc: DataContainer,
                               max_k=None,
                               method='SSE',
                               store_folder=None,
                               is_show=True):
        """Plot a clustering score for k = 2 .. max_k - 1.

        Args:
            dc: container whose features are clustered.
            max_k: exclusive upper bound for k. Defaults to the smaller of 50
                and the number of rows that are clustered.
            method: 'SSE' plots KMeans ``inertia_``; 'SC' plots the
                silhouette score with the Euclidean metric.
            store_folder: existing folder in which
                'ClusteringParameterPlot.jpg' is saved, if given.
            is_show: whether to call ``plt.show()``.
        """
        assert (method in ['SSE', 'SC'])

        processed_data = self._DataPreProcess(dc)

        if max_k is None:
            # KMeans is fitted on processed_data, so its row count (not the
            # row count of the untransposed array) limits the cluster number.
            max_k = min(processed_data.shape[0], 50)

        score = []
        for k in range(2, max_k):
            print('make cluster k=', k)
            estimator = KMeans(n_clusters=k)
            estimator.fit(processed_data)
            if method == 'SSE':
                score.append(estimator.inertia_)
            elif method == 'SC':
                score.append(
                    silhouette_score(processed_data,
                                     estimator.labels_,
                                     metric='euclidean'))
        X = range(2, max_k)
        plt.xlabel('k')
        plt.ylabel(method)
        plt.plot(X, score, 'o-')

        if store_folder and os.path.isdir(store_folder):
            plt.savefig(
                os.path.join(store_folder, 'ClusteringParameterPlot.jpg'))

        if is_show:
            plt.show()

    def VisualizeCluster(self,
                         dimension='2d',
                         select_feature=None,
                         store_folder=None,
                         is_show=True):
        """Scatter-plot the clustered rows coloured by their cluster label.

        Does nothing unless ``Run`` has stored labels and a non-empty
        container.

        Args:
            dimension: '2d' or '3d'.
            select_feature: column indices of the pre-processed matrix used
                as plot axes (default ``[0, 1, 2]``).
                NOTE(review): that matrix is transposed, so these indices
                presumably select cases rather than features - confirm.
            store_folder: existing folder in which
                'ClusteringProcessPlot<dimension>.jpg' is saved, if given.
            is_show: whether to call ``plt.show()``.
        """
        if len(self.feature_labels) == 0 or \
                self.current_dc.GetFrame().size == 0:
            return

        processed_data = self._DataPreProcess(self.current_dc)

        if select_feature is None:
            select_feature = [0, 1, 2]

        assert dimension in ['2d', '3d']
        if dimension == '2d':
            plt.scatter(processed_data[:, select_feature[0]],
                        processed_data[:, select_feature[1]],
                        s=5,
                        c=self.feature_labels)
        elif dimension == '3d':
            ax = plt.figure().add_subplot(111, projection='3d')
            ax.scatter(processed_data[:, select_feature[0]],
                       processed_data[:, select_feature[1]],
                       processed_data[:, select_feature[2]],
                       s=10,
                       c=self.feature_labels,
                       marker='^')
            ax.set_title('Cluster Result 3D')

        if store_folder and os.path.isdir(store_folder):
            plt.savefig(
                os.path.join(
                    store_folder,
                    'ClusteringProcessPlot{}.jpg'.format(dimension)))
        if is_show:
            plt.show()

    def Run(self,
            dc: DataContainer,
            output_folder: str,
            clinical_feature=None):
        """Search for the best random split and store it.

        Args:
            dc: data to split.
            output_folder: existing folder that receives 'train.csv',
                'test.csv' and 'split_info.csv'. Nothing is written when it
                is None or not a directory.
            clinical_feature: optional DataFrame, or path of a CSV read with
                ``index_col=0``, that is merged onto the selected features.
        """
        self.current_dc = dc
        selected_feature_names, self.feature_labels = self._Cluster(dc)

        fs = FeatureSelector()
        selected_dc = fs.SelectFeatureByName(dc, selected_feature_names)

        if clinical_feature is not None:
            if isinstance(clinical_feature, str):
                clinical_feature = pd.read_csv(clinical_feature, index_col=0)
            assert (isinstance(clinical_feature, pd.DataFrame))

            merge_dc = self._MergeClinical(selected_dc, clinical_feature)
        else:
            merge_dc = selected_dc

        feature_distribution_type = self._EstimateAllFeatureDistribution(
            merge_dc)  # a dict

        splitter = DataSeparate()

        output_train_dc, output_test_dc = DataContainer(), DataContainer()
        output_p_value = []
        # Start below any possible mean so the first valid split is accepted.
        mean_p_value = -1

        for _ in range(self.repeat_times):
            train_dc, test_dc = splitter.RunByTestingPercentage(
                merge_dc, testing_data_percentage=self.test_ratio)
            feature_p_value = self._EstimateDcFeaturePvalue(
                train_dc, test_dc, feature_distribution_type)
            current_mean = np.mean(list(feature_p_value.values()))
            if current_mean > mean_p_value:
                mean_p_value = current_mean
                output_train_dc, output_test_dc = train_dc, test_dc
                output_p_value = feature_p_value

        if output_folder is not None and os.path.isdir(output_folder):
            output_train_dc.Save(os.path.join(output_folder, 'train.csv'))
            output_test_dc.Save(os.path.join(output_folder, 'test.csv'))

            p_value_df = pd.DataFrame(output_p_value, index=['P Value'])
            distribute_df = pd.DataFrame(feature_distribution_type,
                                         index=['Distribution'])
            store_df = pd.concat((p_value_df, distribute_df), axis=0)
            store_df.to_csv(os.path.join(output_folder, 'split_info.csv'))
Example #3
0
'''

import pandas as pd
from copy import deepcopy
from BC.DataContainer.DataContainer import DataContainer


class FeatureEncodingOneHot:
    """One-hot encode chosen feature columns of a DataContainer."""

    def __init__(self):
        """Create the encoder; it keeps no state."""

    def OneHotOneColumn(self, data_container, feature_list):
        """Return a new DataContainer with *feature_list* one-hot encoded.

        Every name in *feature_list* must be one of
        ``data_container.GetFeatureName()``; this is checked with an
        assertion. The encoding is done by ``pandas.get_dummies`` on the
        container's frame, and the result is placed in a fresh DataContainer
        through ``SetFrame``.
        """
        source_frame = data_container.GetFrame()
        known_features = data_container.GetFeatureName()
        assert all(name in known_features for name in feature_list)

        encoded_frame = pd.get_dummies(source_frame, columns=feature_list)
        encoded_container = DataContainer()
        encoded_container.SetFrame(encoded_frame)
        return encoded_container


if __name__ == '__main__':
    # Ad-hoc demo: load a CSV from a fixed local path, one-hot encode the
    # listed columns with pandas and write the result next to the input.
    container = DataContainer()
    container.Load(r'c:\Users\yangs\Desktop\test.csv')

    columns_to_encode = ['bGs', 'PIRADS', 't2score', 'DWIscore', 'MR_stage']
    encoded_frame = pd.get_dummies(container.GetFrame(),
                                   columns=columns_to_encode)
    encoded_frame.to_csv(r'c:\Users\yangs\Desktop\test_onehot.csv')