Example #1
    def RunByTestingReference(self,
                              data_container,
                              testing_ref_data_container,
                              store_folder=''):
        training_index_list, testing_index_list = [], []

        # TODO: assert that data_container includes every case listed in testing_ref_data_container.
        all_name_list = data_container.GetCaseName()
        testing_name_list = testing_ref_data_container.GetCaseName()
        for testing_name in testing_name_list:
            if testing_name not in all_name_list:
                print('The data container and the testing reference data '
                      'container are not consistent.')
                return DataContainer(), DataContainer()

        for index, name in enumerate(all_name_list):
            if name in testing_name_list:
                testing_index_list.append(index)
            else:
                training_index_list.append(index)

        train_data_container = self.__SetNewData(data_container,
                                                 training_index_list)
        test_data_container = self.__SetNewData(data_container,
                                                testing_index_list)

        if store_folder:
            train_data_container.Save(
                os.path.join(store_folder, 'train_numeric_feature.csv'))
            test_data_container.Save(
                os.path.join(store_folder, 'test_numeric_feature.csv'))

        return train_data_container, test_data_container
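
A minimal usage sketch (an assumption, not taken from the FAE source): RunByTestingReference appears to sit on the DataSeparate splitter used in the later examples, and the CSV paths below are hypothetical.

# Hypothetical usage; the paths and the DataSeparate owner class are assumptions.
all_dc = DataContainer()
all_dc.Load(r'..\..\Demo\all_numeric_feature.csv')

test_ref_dc = DataContainer()
test_ref_dc.Load(r'..\..\Demo\existing_test_numeric_feature.csv')

separator = DataSeparate()
train_dc, test_dc = separator.RunByTestingReference(all_dc, test_ref_dc,
                                                    store_folder=r'..\..\Demo\output')
print(train_dc.GetArray().shape, test_dc.GetArray().shape)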
Example #2
def DrawFeatureRelationshipAccordingToCsvFile(file_path,
                                              selected_feature_name_list,
                                              label_name_list,
                                              store_path=''):
    '''
    Draw the relationship among the selected features according to the csv file of a data container.
    :param file_path: the file path of the csv file
    :param selected_feature_name_list: the features that will be drawn
    :param label_name_list: the names of the labels, e.g. ['non-cancer', 'cancer']
    :param store_path: the store path, supporting jpg and eps formats
    :return:
    '''
    data_container = DataContainer()
    data_container.Load(file_path)
    data_container.UsualNormalize()
    data, label, feature_name, case_name = data_container.GetData()

    if len(selected_feature_name_list) > 3 or len(selected_feature_name_list) < 1:
        print('Please check the length of the feature list. '
              'It can only show the relationship among 1, 2, or 3 features.')
        return

    try:
        index = [feature_name.index(t) for t in selected_feature_name_list]
    except ValueError:
        print('The selected feature is not in the data container.')
        return
    result_data = []
    for sub_index in index:
        result_data.append(data[:, sub_index])
    DrawValueRelationship(result_data, selected_feature_name_list, label,
                          label_name_list, store_path)
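
A hedged usage sketch of the function above; the CSV path, feature names, and store path are hypothetical and only illustrate the expected argument types.

# Hypothetical call; replace the path and feature names with real ones.
DrawFeatureRelationshipAccordingToCsvFile(
    r'..\..\Example\numeric_feature.csv',
    selected_feature_name_list=['feature_A', 'feature_B'],
    label_name_list=['non-cancer', 'cancer'],
    store_path=r'..\..\Example\feature_relationship.jpg')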
Example #3
    def __init__(self, parts=30, repeat_times=100, test_ratio=0.3, random_seed=10):
        self.parts = parts
        self.repeat_times = repeat_times
        self.test_ratio = test_ratio
        self.random_seed = random_seed

        self.feature_labels = []
        self.current_dc = DataContainer()
Example #4
File: Description.py Project: salan668/FAE
def GenerateDescription():
    training_data_container = DataContainer()
    training_data_container.Load(r'..\..\Example\numeric_feature.csv')

    one_pipeline = OnePipeline()
    one_pipeline.LoadPipeline(r'C:\MyCode\FAEGitHub\FAE\Example\report_temp\NormUnit_Cos_ANOVA_5_SVM\pipeline_info.csv')

    description = Description()
    description.Run(training_data_container, one_pipeline, r'..\..\Example\report_temp', r'..\..\Example\report')
Example #5
    def OneHotOneColumn(self, data_container, feature_list):
        info = data_container.GetFrame()
        feature_name = data_container.GetFeatureName()
        for feature in feature_list:
            assert (feature in feature_name)

        new_info = pd.get_dummies(info, columns=feature_list)
        new_data = DataContainer()
        new_data.SetFrame(new_info)
        return new_data
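
The encoding itself is delegated to pandas. A standalone illustration of what pd.get_dummies does to a categorical column (the column names and values here are made up for demonstration only):

import pandas as pd

# One-hot encode the 'stage' column; 'label' is left untouched.
demo = pd.DataFrame({'label': [0, 1, 0], 'stage': ['I', 'II', 'I']})
print(pd.get_dummies(demo, columns=['stage']))
# Produces indicator columns 'stage_I' and 'stage_II' next to 'label'
# (their dtype is bool or uint8 depending on the pandas version).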
Example #6
    def __SetNewData(self, data_container, case_index):
        array, label, feature_name, case_name = data_container.GetData()

        new_array = array[case_index, :]
        new_label = label[case_index]
        new_case_name = [case_name[i] for i in case_index]

        new_data_container = DataContainer(array=new_array, label=new_label, case_name=new_case_name, feature_name=feature_name)
        new_data_container.UpdateFrameByData()
        return new_data_container
Example #7
 def _MergeClinical(self, dc, cli_df):
     # Merge the DataContainer with a clinical-feature dataframe.
     if 'label' in cli_df.columns.tolist():
         del cli_df['label']
     elif 'Label' in cli_df.columns.tolist():
         del cli_df['Label']
     df = pd.merge(dc.GetFrame(), cli_df, how='left', left_index=True, right_index=True)
     merge_dc = DataContainer()
     merge_dc.SetFrame(df)
     merge_dc.UpdateFrameByData()
     return merge_dc
Example #8
    def Generate(self, data_container):
        array, label = data_container.GetArray(), data_container.GetLabel()
        feature_name, case_name = data_container.GetFeatureName(), data_container.GetCaseName()
        for train_index, val_index in self._cv.split(array, label):
            train_array, train_label = array[train_index, :], label[train_index]
            val_array, val_label = array[val_index, :], label[val_index]

            sub_train_container = DataContainer(array=train_array, label=train_label, feature_name=feature_name,
                                                case_name=[case_name[index] for index in train_index])
            sub_val_container = DataContainer(array=val_array, label=val_label, feature_name=feature_name,
                                              case_name=[case_name[index] for index in val_index])
            yield (sub_train_container, sub_val_container)
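
The _cv attribute is not shown in this snippet. A minimal sketch assuming it behaves like scikit-learn's StratifiedKFold, which yields index arrays compatible with the slicing above:

import numpy as np
from sklearn.model_selection import StratifiedKFold

# Assumed stand-in for self._cv: each split yields train/validation index arrays.
array = np.random.rand(10, 4)      # 10 cases, 4 features
label = np.array([0, 1] * 5)       # binary labels
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train_index, val_index in cv.split(array, label):
    print(train_index.shape, val_index.shape)   # e.g. (8,) (2,)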
Example #9
    def Run(self, data_container, store_folder='', store_key=''):
        temp_frame = data_container.GetFrame().select_dtypes(include=None, exclude=['object'])
        new_data_container = DataContainer()
        new_data_container.SetFrame(temp_frame)
        if store_folder and os.path.isdir(store_folder):
            feature_store_path = os.path.join(store_folder, 'numeric_feature.csv')
            featureinfo_store_path = os.path.join(store_folder, 'feature_select_info.csv')

            new_data_container.Save(feature_store_path)
            SaveSelectInfo(new_data_container.GetFeatureName(), featureinfo_store_path, is_merge=False)

        return new_data_container
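
A standalone illustration of the select_dtypes call used above: excluding the 'object' dtype keeps only numeric columns, which is how string-valued features are dropped (the frame here is made up):

import pandas as pd

frame = pd.DataFrame({'label': [0, 1],
                      'radiomics_1': [0.3, 0.7],
                      'scanner': ['GE', 'Siemens']})
# Only the numeric columns survive.
print(frame.select_dtypes(include=None, exclude=['object']).columns.tolist())
# ['label', 'radiomics_1']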
Example #10
    def VisualizePartsVariance(self,
                               dc: DataContainer,
                               max_k=None,
                               method='SSE',
                               store_folder=None,
                               is_show=True):
        # method must be one of 'SSE' or 'SC'. SSE denotes the within-cluster Sum of Squared Errors (inertia), SC denotes the Silhouette Coefficient.

        # TODO: Normalize the train_data
        data = dc.GetArray().transpose()

        if max_k is None:
            max_k = min(data.shape[0], 50)

        assert (method in ['SSE', 'SC'])

        #TODO: plot
        score = []
        for k in range(2, max_k):
            if method == 'SSE':
                pass
            elif method == 'SC':
                pass

        if store_folder and os.path.isdir(store_folder):
            plt.savefig(os.path.join(store_folder, 'ClusteringPlot.jpg'))

        if is_show:
            plt.show()
Example #11
    def VisualizePartsVariance(self, dc: DataContainer, max_k=None, method='SSE',
                               store_folder=None, is_show=True):
        # method must be one of 'SSE' or 'SC'. SSE denotes the within-cluster Sum of Squared Errors (inertia), SC denotes the Silhouette Coefficient.

        data = dc.GetArray()  # get train data
        processed_data = self._DataPreProcess(dc)

        if max_k is None:
            max_k = min(data.shape[0], 50)

        assert(method in ['SSE', 'SC'])

        score = []
        for k in range(2, max_k):
            print('make cluster k=', k)
            estimator = KMeans(n_clusters=k) 
            estimator.fit(processed_data)
            if method == 'SSE':
                score.append(estimator.inertia_)
            elif method == 'SC':
                score.append(silhouette_score(processed_data, estimator.labels_, metric='euclidean'))
        X = range(2, max_k)
        plt.xlabel('k')
        plt.ylabel(method)
        plt.plot(X, score, 'o-')

        if store_folder and os.path.isdir(store_folder):
            plt.savefig(os.path.join(store_folder, 'ClusteringParameterPlot.jpg'))

        if is_show:
            plt.show()
Example #12
    def Run(self,
            dc: DataContainer,
            output_folder: str,
            clinical_feature=None):
        self.current_dc = dc
        selected_feature_names, self.feature_labels = self._Cluster(dc)

        fs = FeatureSelector()
        selected_dc = fs.SelectFeatureByName(dc, selected_feature_names)

        if clinical_feature is not None:
            if isinstance(clinical_feature, str):
                clinical_feature = pd.read_csv(clinical_feature, index_col=0)
            assert (isinstance(clinical_feature, pd.DataFrame))

            merge_dc = self._MergeClinical(selected_dc, clinical_feature)
        else:
            merge_dc = selected_dc

        feature_distribution_type = self._EstimateAllFeatureDistribution(
            merge_dc)  # a dict

        splitter = DataSeparate()

        output_train_dc, output_test_dc = DataContainer(), DataContainer()
        output_p_value = []
        mean_p_value = -1

        for _ in range(self.repeat_times):
            train_dc, test_dc = splitter.RunByTestingPercentage(
                merge_dc, testing_data_percentage=self.test_ratio)
            feature_p_value = self._EstimateDcFeaturePvalue(
                train_dc, test_dc, feature_distribution_type)
            if np.mean(list(feature_p_value.values())) > mean_p_value:
                mean_p_value = np.mean(list(feature_p_value.values()))
                output_train_dc, output_test_dc = train_dc, test_dc
                output_p_value = feature_p_value

        if output_folder is not None and os.path.isdir(output_folder):
            output_train_dc.Save(os.path.join(output_folder, 'train.csv'))
            output_test_dc.Save(os.path.join(output_folder, 'test.csv'))

            p_value_df = pd.DataFrame(output_p_value, index=['P Value'])
            distribute_df = pd.DataFrame(feature_distribution_type,
                                         index=['Distribution'])
            store_df = pd.concat((p_value_df, distribute_df), axis=0)
            store_df.to_csv(os.path.join(output_folder, 'split_info.csv'))
Example #13
    def Run(self, data_container, store_path=''):
        data, label, feature_name, label_name = data_container.GetData()
        data_resampled, label_resampled = self._model.fit_sample(data, label)

        new_case_name = [
            'Generate' + str(index) for index in range(data_resampled.shape[0])
        ]
        new_data_container = DataContainer(data_resampled, label_resampled,
                                           data_container.GetFeatureName(),
                                           new_case_name)
        if store_path != '':
            if os.path.isdir(store_path):
                new_data_container.Save(
                    os.path.join(store_path,
                                 '{}_features.csv'.format(self._name)))
            else:
                new_data_container.Save(store_path)
        return new_data_container
Example #14
    def Run(self, data_container, store_path=''):
        data, label, feature_name, label_name = data_container.GetData()
        data_resampled, label_resampled = self._model.fit_sample(data, label)

        new_case_name = []
        for index in range(data_resampled.shape[0]):
            new_case_name.append(
                self.GetCaseNameFromAllCaseNames(data_container,
                                                 data_resampled[index, :]))

        new_data_container = DataContainer(data_resampled, label_resampled,
                                           data_container.GetFeatureName(),
                                           new_case_name)
        if store_path != '':
            if os.path.isdir(store_path):
                new_data_container.Save(
                    os.path.join(store_path,
                                 '{}_features.csv'.format(self._name)))
            else:
                new_data_container.Save(store_path)
        return new_data_container
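
Both resampling examples call fit_sample, which was the imbalanced-learn API in older releases; newer versions renamed it to fit_resample. A hedged sketch of the equivalent call, assuming self._model is an imbalanced-learn sampler such as RandomOverSampler:

import numpy as np
from imblearn.over_sampling import RandomOverSampler

# Hypothetical stand-in for self._model; recent imbalanced-learn uses fit_resample.
data = np.random.rand(20, 3)
label = np.array([0] * 15 + [1] * 5)
sampler = RandomOverSampler(random_state=0)
data_resampled, label_resampled = sampler.fit_resample(data, label)
print(data_resampled.shape, label_resampled.shape)   # (30, 3) (30,)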
Example #15
    def Run(self,
            train_data_container,
            test_data_container=DataContainer(),
            store_folder='',
            is_hyper_parameter=False):
        raw_train_data_container = deepcopy(train_data_container)
        raw_test_data_conainer = deepcopy(test_data_container)

        if store_folder:
            if not os.path.exists(store_folder):
                os.mkdir(store_folder)

        if not (self.__cv and self.__classifier):
            print('Please set both the CV method and the classifier.')

        if self.__normalizer:
            raw_train_data_container = self.__normalizer.Run(
                raw_train_data_container, store_folder)
            if not test_data_container.IsEmpty():
                raw_test_data_conainer = self.__normalizer.Run(
                    raw_test_data_conainer, store_folder, is_test=True)

        if self.__dimension_reduction:
            raw_train_data_container = self.__dimension_reduction.Run(
                raw_train_data_container, store_folder)
            if not test_data_container.IsEmpty():
                raw_test_data_conainer = self.__dimension_reduction.Transform(
                    raw_test_data_conainer)

        if self.__feature_selector:
            raw_train_data_container = self.__feature_selector.Run(
                raw_train_data_container, store_folder)
            if not test_data_container.IsEmpty():
                selected_feature_name = raw_train_data_container.GetFeatureName()
                fs = FeatureSelector()
                raw_test_data_conainer = fs.SelectFeatureByName(
                    raw_test_data_conainer, selected_feature_name)

        self.__cv.SetClassifier(self.__classifier)
        train_cv_metric, val_cv_metric, test_metric, all_train_metric = self.__cv.Run(
            raw_train_data_container, raw_test_data_conainer, store_folder,
            is_hyper_parameter)

        if store_folder:
            self.SavePipeline(len(raw_train_data_container.GetFeatureName()),
                              os.path.join(store_folder, 'pipeline_info.csv'))

        return train_cv_metric, val_cv_metric, test_metric, all_train_metric
Example #16
    def __init__(self, parent=None):
        super(PrepareConnection, self).__init__(parent)
        self.setupUi(self)
        self.data_container = DataContainer()
        self._filename = os.path.split(__file__)[-1]

        self.buttonLoad.clicked.connect(self.LoadData)
        self.buttonRemoveAndExport.clicked.connect(self.RemoveInvalidValue)

        self.__testing_ref_data_container = DataContainer()
        self.__clinical_ref = pd.DataFrame()

        self.radioSplitRandom.clicked.connect(self.ChangeSeparateMethod)
        self.radioSplitRef.clicked.connect(self.ChangeSeparateMethod)
        self.checkUseClinicRef.clicked.connect(
            self.RandomSeparateButtonUpdates)
        self.loadTestingReference.clicked.connect(
            self.LoadTestingReferenceDataContainer)
        self.clearTestingReference.clicked.connect(
            self.ClearTestingReferenceDataContainer)
        self.loadClinicRef.clicked.connect(self.LoadClinicalRef)
        self.clearClinicRef.clicked.connect(self.ClearClinicalRef)

        self.buttonSave.clicked.connect(self.CheckAndSave)
Example #17
def TestNewData(NewDataCsv, model_folder, result_save_path=''):
    '''
    :param NewDataCsv: the csv file path of the new radiomics feature matrix
    :param model_folder: the folder of the trained model
    :param result_save_path: the folder used to store the prediction result (optional)
    :return: the classification metrics
    '''
    train_info = LoadTrainInfo(model_folder)
    new_data_container = DataContainer()

    # Normalization

    new_data_container.Load(NewDataCsv)

    # feature_selector = FeatureSelector()
    # feature_selector.SelectFeatureByName(new_data_container, train_info['selected_features'], is_replace=True)

    new_data_container = train_info['normalizer'].Transform(new_data_container)

    # data_frame = new_data_container.GetFrame()
    # data_frame = data_frame[train_info['selected_features']]
    # new_data_container.SetFrame(data_frame)
    # new_data_container.UpdateDataByFrame()

    # Model
    train_info['classifier'].SetDataContainer(new_data_container)
    model = train_info['classifier'].GetModel()
    predict = model.predict_proba(new_data_container.GetArray())[:, 1]

    label = new_data_container.GetLabel()
    case_name = new_data_container.GetCaseName()

    test_result_info = [['CaseName', 'Pred', 'Label']]
    for index in range(len(label)):
        test_result_info.append(
            [case_name[index], predict[index], label[index]])

    metric = EstimateMetirc(predict, label)
    info = {}
    info.update(metric)
    cv = CrossValidation()

    print(metric)
    print('\t')

    if result_save_path:
        cv.SaveResult(info, result_save_path)
        np.save(os.path.join(result_save_path, 'test_predict.npy'), predict)
        np.save(os.path.join(result_save_path, 'test_label.npy'), label)
        with open(os.path.join(result_save_path, 'test_info.csv'),
                  'w',
                  newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerows(test_result_info)

    return metric
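
A hedged usage sketch of TestNewData; both paths are assumptions about the local folder layout and only show the expected argument order.

# Hypothetical call; the feature csv and the trained model folder are placeholders.
metric = TestNewData(r'..\..\Example\new_numeric_feature.csv',
                     r'..\..\Example\report_temp\NormUnit_Cos_ANOVA_5_SVM',
                     result_save_path=r'..\..\Example\new_data_result')
print(metric)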
Example #18
            feature_pvalue = self._EstimateDcFeaturePvalue(
                train_dc, test_dc, feature_distribution_type)
            if np.mean(list(feature_pvalue.values())) > mean_pvalue:
                mean_pvalue = np.mean(list(feature_pvalue.values()))
                output_train_dc, output_test_dc = train_dc, test_dc
                output_pvalue = feature_pvalue

        if output_folder is not None and os.path.isdir(output_folder):
            output_train_dc.Save(os.path.join(output_folder, 'train.csv'))
            output_test_dc.Save(os.path.join(output_folder, 'test.csv'))

            pvalue_df = pd.DataFrame(output_pvalue, index=['P Value'])
            distibute_df = pd.DataFrame(feature_distribution_type,
                                        index=['Distribution'])
            store_df = pd.concat((pvalue_df, distibute_df), axis=0)
            store_df.to_csv(os.path.join(output_folder, 'split_info.csv'))


if __name__ == '__main__':
    clinics = pd.read_csv(r'..\..\Demo\simulated_clinics.csv', index_col=0)
    container = DataContainer()
    container.Load(r'..\..\Demo\simulated_feature.csv')

    separator = DataSeparate()
    train, test = separator.RunByTestingPercentage(container,
                                                   0.3,
                                                   clinic_df=clinics)

    print(train.GetArray().shape, test.GetArray().shape)
    print(separator.clinic_split_result)
Example #19
    def RunWithoutCV(self,
                     train_container,
                     test_container=DataContainer(),
                     store_folder=''):
        self.SavePipelineInfo(store_folder)
        num = 0

        # TODO: the balancing step could also be iterated over, like the steps below.
        balance_train_container = self.balance.Run(train_container,
                                                   store_folder)

        for norm_index, normalizer in enumerate(self.normalizer_list):
            norm_store_folder = MakeFolder(store_folder, normalizer.GetName())
            norm_balance_train_container = normalizer.Run(
                balance_train_container,
                norm_store_folder,
                store_key=BALANCE_TRAIN)
            norm_train_container = normalizer.Transform(train_container,
                                                        norm_store_folder,
                                                        store_key=TRAIN)
            norm_test_container = normalizer.Transform(test_container,
                                                       norm_store_folder,
                                                       store_key=TEST)

            for dr_index, dr in enumerate(self.dimension_reduction_list):
                dr_store_folder = MakeFolder(norm_store_folder, dr.GetName())
                if dr:
                    dr_balance_train_container = dr.Run(
                        norm_balance_train_container, dr_store_folder,
                        BALANCE_TRAIN)
                    dr_train_container = dr.Transform(norm_train_container,
                                                      dr_store_folder, TRAIN)
                    if not test_container.IsEmpty():
                        dr_test_container = dr.Transform(
                            norm_test_container, dr_store_folder, TEST)
                    else:
                        dr_test_container = norm_test_container
                else:
                    dr_balance_train_container = norm_balance_train_container
                    dr_train_container = norm_train_container
                    dr_test_container = norm_test_container

                for fs_index, fs in enumerate(self.feature_selector_list):
                    for fn_index, fn in enumerate(
                            self.feature_selector_num_list):
                        if fs:
                            fs_store_folder = MakeFolder(
                                dr_store_folder,
                                '{}_{}'.format(fs.GetName(), fn))
                            fs.SetSelectedFeatureNumber(fn)
                            fs_balance_train_container = fs.Run(
                                dr_balance_train_container, fs_store_folder,
                                BALANCE_TRAIN)
                            fs_train_container = fs.Transform(
                                dr_train_container, fs_store_folder, TRAIN)
                            fs_test_container = fs.Transform(
                                dr_test_container, fs_store_folder, TEST)
                        else:
                            fs_store_folder = dr_store_folder
                            fs_balance_train_container = dr_balance_train_container
                            fs_train_container = dr_train_container
                            fs_test_container = dr_test_container

                        for cls_index, cls in enumerate(self.classifier_list):
                            cls_store_folder = MakeFolder(
                                fs_store_folder, cls.GetName())
                            model_name = self.GetStoreName(
                                normalizer.GetName(), dr.GetName(),
                                fs.GetName(), str(fn), cls.GetName())
                            matrics_index = (norm_index, dr_index, fs_index,
                                             fn_index, cls_index)
                            num += 1
                            yield self.total_num, num

                            cls.SetDataContainer(fs_balance_train_container)
                            cls.Fit()
                            cls.Save(cls_store_folder)

                            balance_train_pred = cls.Predict(
                                fs_balance_train_container.GetArray())
                            balance_train_label = fs_balance_train_container.GetLabel()
                            self.SaveOneResult(
                                balance_train_pred, balance_train_label,
                                BALANCE_TRAIN,
                                fs_balance_train_container.GetCaseName(),
                                matrics_index, model_name, store_folder,
                                cls_store_folder)

                            train_data = fs_train_container.GetArray()
                            train_label = fs_train_container.GetLabel()
                            train_pred = cls.Predict(train_data)
                            self.SaveOneResult(
                                train_pred, train_label, TRAIN,
                                fs_train_container.GetCaseName(),
                                matrics_index, model_name, store_folder,
                                cls_store_folder)

                            if not test_container.IsEmpty():
                                test_data = fs_test_container.GetArray()
                                test_label = fs_test_container.GetLabel()
                                test_pred = cls.Predict(test_data)
                                self.SaveOneResult(
                                    test_pred, test_label, TEST,
                                    fs_test_container.GetCaseName(),
                                    matrics_index, model_name, store_folder,
                                    cls_store_folder)

        self.total_metric[BALANCE_TRAIN].to_csv(
            os.path.join(store_folder, '{}_results.csv'.format(BALANCE_TRAIN)))
        self.total_metric[TRAIN].to_csv(
            os.path.join(store_folder, '{}_results.csv'.format(TRAIN)))
        if not test_container.IsEmpty():
            self.total_metric[TEST].to_csv(
                os.path.join(store_folder, '{}_results.csv'.format(TEST)))
Example #20
                                os.path.join(cls_store_folder, 'metrics.csv'))
                            self._MergeOneMetric(cv_val_metric, CV_VAL,
                                                 model_name)

        self.total_metric[CV_TRAIN].to_csv(
            os.path.join(store_folder, '{}_results.csv'.format(CV_TRAIN)))
        self.total_metric[CV_VAL].to_csv(
            os.path.join(store_folder, '{}_results.csv'.format(CV_VAL)))


if __name__ == '__main__':
    manager = PipelinesManager()

    index_dict = Index2Dict()

    train = DataContainer()
    test = DataContainer()
    train.Load(r'C:\Users\yangs\Desktop\train_numeric_feature.csv')
    test.Load(r'C:\Users\yangs\Desktop\test_numeric_feature.csv')

    faps = PipelinesManager(
        balancer=index_dict.GetInstantByIndex('UpSampling'),
        normalizer_list=[index_dict.GetInstantByIndex('Mean')],
        dimension_reduction_list=[index_dict.GetInstantByIndex('PCC')],
        feature_selector_list=[index_dict.GetInstantByIndex('ANOVA')],
        feature_selector_num_list=list(np.arange(1, 18)),
        classifier_list=[index_dict.GetInstantByIndex('SVM')],
        cross_validation=index_dict.GetInstantByIndex('5-Fold'))

    # for total, num in faps.RunWithoutCV(train, store_folder=r'..\..\Demo\db2-1'):
    #     print(total, num)
Example #21
        self.__cv.SetClassifier(self.__classifier)
        train_cv_metric, val_cv_metric, test_metric, all_train_metric = self.__cv.Run(
            raw_train_data_container, raw_test_data_conainer, store_folder,
            is_hyper_parameter)

        if store_folder:
            self.SavePipeline(len(raw_train_data_container.GetFeatureName()),
                              os.path.join(store_folder, 'pipeline_info.csv'))

        return train_cv_metric, val_cv_metric, test_metric, all_train_metric


if __name__ == '__main__':
    index_dict = Index2Dict()

    train = DataContainer()
    test = DataContainer()
    train.Load(r'..\..\Demo\zero_center_normalized_training_feature.csv')
    test.Load(r'..\..\Demo\zero_center_normalized_testing_feature.csv')

    faps = FeatureAnalysisPipelines(
        balancer=index_dict.GetInstantByIndex('NoneBalance'),
        normalizer_list=[index_dict.GetInstantByIndex('None')],
        dimension_reduction_list=[index_dict.GetInstantByIndex('PCC')],
        feature_selector_list=[index_dict.GetInstantByIndex('RFE')],
        feature_selector_num_list=[15],
        classifier_list=[index_dict.GetInstantByIndex('LR')],
        cross_validation=index_dict.GetInstantByIndex('5-Folder'))

    for temp in faps.Run(train, test, store_folder=r'..\..\Demo\db2-2'):
        print(temp)
Example #22
            input_data_container = output
        return output

    def SaveInfo(self, store_folder, all_features):
        for fs in self.__selector_list:
            fs.SaveInfo(store_folder, all_features)

    def SaveDataContainer(self, data_container, store_folder, store_key):
        for fs in self.__selector_list:
            fs.SaveDataContainer(data_container, store_folder, store_key)


################################################################

if __name__ == '__main__':
    from BC.DataContainer.DataContainer import DataContainer
    from BC.FeatureAnalysis.Normalizer import NormalizerZeroCenter
    from BC.FeatureAnalysis.DimensionReduction import DimensionReductionByPCC

    dc = DataContainer()
    pcc = DimensionReductionByPCC()
    fs = FeatureSelectByKruskalWallis(selected_feature_number=5)

    dc.Load(r'..\..\Demo\train_numeric_feature.csv')

    dc = NormalizerZeroCenter.Run(dc)
    dc = pcc.Run(dc)
    print(dc.GetArray().shape)
    dc = fs.Run(dc)
    print(dc.GetArray().shape)
Example #23
 def __init__(self):
     self.__model = None
     self._x = np.array([])
     self._y = np.array([])
     self._data_container = DataContainer()
     self.logger = eclog(os.path.split(__file__)[-1]).GetLogger()
Example #24
    def Run(self,
            train_data_container,
            test_data_container=DataContainer(),
            store_folder='',
            is_hyper_parameter=False):
        column_list = [
            'sample_number', 'positive_number', 'negative_number', 'auc',
            'auc 95% CIs', 'auc std', 'accuracy', 'Youden Index',
            'sensitivity', 'specificity', 'positive predictive value',
            'negative predictive value'
        ]
        train_df = pd.DataFrame(columns=column_list)
        val_df = pd.DataFrame(columns=column_list)
        test_df = pd.DataFrame(columns=column_list)
        all_train_df = pd.DataFrame(columns=column_list)

        if not self.__normalizer_list:
            self.__normalizer_list = [NormalizerNone()]

        if not self._dimension_reduction_list:
            self._dimension_reduction_list = [DimensionReductionByPCC()]

        self.GenerateMetircDict()
        self.SavePipelineInfo(store_folder)

        num = 0
        total_num = len(self.__normalizer_list) * \
                    len(self._dimension_reduction_list) * \
                    len(self.__feature_selector_list) * \
                    len(self.__classifier_list) * \
                    len(self.__feature_selector_num_list)

        for normalizer_index, normalizer in enumerate(self.__normalizer_list):
            normalized_train_data_container = normalizer.Run(
                train_data_container)
            if not test_data_container.IsEmpty():
                normalized_test_data_container = normalizer.Run(
                    test_data_container, is_test=True)
            else:
                normalized_test_data_container = test_data_container

            for dimension_reductor_index, dimension_reducor in enumerate(
                    self._dimension_reduction_list):
                if dimension_reducor:
                    dr_train_data_container = dimension_reducor.Run(
                        normalized_train_data_container)
                    if not test_data_container.IsEmpty():
                        dr_test_data_container = dimension_reducor.Transform(
                            normalized_test_data_container)
                    else:
                        dr_test_data_container = normalized_test_data_container
                else:
                    dr_train_data_container = normalized_train_data_container
                    dr_test_data_container = normalized_test_data_container

                for feature_selector_index, feature_selector in enumerate(
                        self.__feature_selector_list):
                    for feature_num_index, feature_num in enumerate(
                            self.__feature_selector_num_list):
                        if feature_selector:
                            feature_selector.SetSelectedFeatureNumber(feature_num)
                            fs_train_data_container = feature_selector.Run(
                                dr_train_data_container)
                            if not test_data_container.IsEmpty():
                                selected_feature_name = fs_train_data_container.GetFeatureName()
                                fs = FeatureSelector()
                                fs_test_data_container = fs.SelectFeatureByName(
                                    dr_test_data_container,
                                    selected_feature_name)
                            else:
                                fs_test_data_container = dr_test_data_container
                        else:
                            fs_train_data_container = dr_train_data_container
                            fs_test_data_container = dr_test_data_container

                        for classifier_index, classifier in enumerate(
                                self.__classifier_list):
                            self.__cross_validation.SetClassifier(classifier)

                            num += 1
                            yield normalizer.GetName(), dimension_reducor.GetName(), feature_selector.GetName(), feature_num, \
                                  classifier.GetName(), num, total_num

                            case_name = self.GetStoreName(
                                normalizer.GetName(),
                                dimension_reducor.GetName(),
                                feature_selector.GetName(), str(feature_num),
                                classifier.GetName())
                            case_store_folder = os.path.join(
                                store_folder, case_name)
                            if not os.path.exists(case_store_folder):
                                os.mkdir(case_store_folder)

                            # Save
                            normalizer.SaveInfo(
                                case_store_folder,
                                normalized_train_data_container.GetFeatureName())
                            normalizer.SaveNormalDataContainer(
                                normalized_train_data_container,
                                case_store_folder,
                                is_test=False)
                            dimension_reducor.SaveInfo(case_store_folder)
                            dimension_reducor.SaveDataContainer(
                                dr_train_data_container,
                                case_store_folder,
                                is_test=False)
                            feature_selector.SaveInfo(
                                case_store_folder,
                                dr_train_data_container.GetFeatureName())
                            feature_selector.SaveDataContainer(
                                fs_train_data_container,
                                case_store_folder,
                                is_test=False)
                            if not test_data_container.IsEmpty():
                                normalizer.SaveNormalDataContainer(
                                    normalized_test_data_container,
                                    case_store_folder,
                                    is_test=True)
                                dimension_reducor.SaveDataContainer(
                                    dr_test_data_container,
                                    case_store_folder,
                                    is_test=True)
                                feature_selector.SaveDataContainer(
                                    fs_test_data_container,
                                    case_store_folder,
                                    is_test=True)

                            train_cv_metric, val_cv_metric, test_metric, all_train_metric = self.__cross_validation.Run(
                                fs_train_data_container,
                                fs_test_data_container, case_store_folder,
                                is_hyper_parameter, self.__balance)

                            self.SaveOnePipeline(
                                os.path.join(case_store_folder,
                                             'pipeline_info.csv'),
                                normalizer.GetName(),
                                dimension_reducor.GetName(),
                                feature_selector.GetName(), feature_num,
                                classifier.GetName(),
                                self.__cross_validation.GetName())

                            # Save Result
                            self.__auc_matrix_dict['train'][
                                normalizer_index, dimension_reductor_index,
                                feature_selector_index, feature_num_index,
                                classifier_index] = train_cv_metric[
                                    'train_auc']
                            self.__auc_std_matrix_dict['train'][
                                normalizer_index, dimension_reductor_index,
                                feature_selector_index, feature_num_index,
                                classifier_index] = train_cv_metric[
                                    'train_auc std']
                            self.__auc_matrix_dict['all_train'][
                                normalizer_index, dimension_reductor_index,
                                feature_selector_index, feature_num_index,
                                classifier_index] = all_train_metric[
                                    'all_train_auc']
                            self.__auc_std_matrix_dict['all_train'][
                                normalizer_index, dimension_reductor_index,
                                feature_selector_index, feature_num_index,
                                classifier_index] = all_train_metric[
                                    'all_train_auc std']
                            self.__auc_matrix_dict['val'][
                                normalizer_index, dimension_reductor_index,
                                feature_selector_index, feature_num_index,
                                classifier_index] = val_cv_metric['val_auc']
                            self.__auc_std_matrix_dict['val'][
                                normalizer_index, dimension_reductor_index,
                                feature_selector_index, feature_num_index,
                                classifier_index] = val_cv_metric[
                                    'val_auc std']

                            if store_folder and os.path.isdir(store_folder):
                                store_path = os.path.join(
                                    store_folder, 'train_result.csv')
                                save_info = [
                                    train_cv_metric['train_' + index]
                                    for index in column_list
                                ]
                                train_df.loc[case_name] = save_info
                                train_df.to_csv(store_path)

                                store_path = os.path.join(
                                    store_folder, 'all_train_result.csv')
                                save_info = [
                                    all_train_metric['all_train_' + index]
                                    for index in column_list
                                ]
                                all_train_df.loc[case_name] = save_info
                                all_train_df.to_csv(store_path)

                                store_path = os.path.join(
                                    store_folder, 'val_result.csv')
                                save_info = [
                                    val_cv_metric['val_' + index]
                                    for index in column_list
                                ]
                                val_df.loc[case_name] = save_info
                                val_df.to_csv(store_path)

                                if not test_data_container.IsEmpty():
                                    self.__auc_matrix_dict['test'][
                                        normalizer_index,
                                        dimension_reductor_index,
                                        feature_selector_index,
                                        feature_num_index,
                                        classifier_index] = test_metric[
                                            'test_auc']
                                    self.__auc_std_matrix_dict['test'][
                                        normalizer_index,
                                        dimension_reductor_index,
                                        feature_selector_index,
                                        feature_num_index,
                                        classifier_index] = test_metric[
                                            'test_auc std']

                                    store_path = os.path.join(
                                        store_folder, 'test_result.csv')
                                    save_info = [
                                        test_metric['test_' + index]
                                        for index in column_list
                                    ]
                                    test_df.loc[case_name] = save_info
                                    test_df.to_csv(store_path)

                                self.SaveMetricDict(store_folder)

        if store_folder:
            hidden_file_path = os.path.join(store_folder,
                                            '.FAEresult4129074093819729087')
            with open(hidden_file_path, 'wb') as file:
                pass
            file_hidden = os.popen('attrib +h ' + hidden_file_path)
            file_hidden.close()
Example #25
 def _MergeClinical(self, dc, cli_df):
     # Merge the DataContainer with a clinical-feature dataframe.
     return DataContainer()
Example #26
class DataSplitterByFeatureCluster(object):
    def __init__(self,
                 parts=30,
                 repeat_times=100,
                 test_ratio=0.3,
                 random_seed=10):
        self.parts = parts
        self.repeat_times = repeat_times
        self.test_ratio = test_ratio
        self.random_seed = random_seed

        self.feature_labels = []
        self.current_dc = DataContainer()

    #################################################
    def _DataPreProcess(self, dc):
        data = dc.GetArray()  # get train data
        # Min-max scale each feature column, then transpose so that each row is one feature.
        min_max_scaler = preprocessing.MinMaxScaler()
        processed_data = min_max_scaler.fit_transform(data).T
        return processed_data

    def _Cluster(self, dc):
        # Cluster the features and select one representative feature per cluster to build a DataContainer.
        processed_data = self._DataPreProcess(dc)
        feature_name_list = dc.GetFeatureName()
        k_means = KMeans(n_clusters=self.parts,
                         random_state=self.random_seed,
                         init='k-means++')
        k_means.fit(processed_data)  # training

        count_label = [0 for _ in range(self.parts)]
        count_feature = [[] for _ in range(self.parts)]
        count_distance = [[] for _ in range(self.parts)]

        feature_predict = k_means.labels_
        cluster_centers = k_means.cluster_centers_

        for j in range(len(feature_name_list)):
            count_label[feature_predict[j]] += 1
            count_feature[feature_predict[j]].append(feature_name_list[j])

            cluster_center = cluster_centers[feature_predict[j]]
            distance = np.square(processed_data[j] - cluster_center).sum()
            count_distance[feature_predict[j]].append(distance)

        print('The number of features in each class:\n', count_label)
        min_distance_feature = []
        for k in range(self.parts):
            k_feature = count_feature[k]
            k_distance = count_distance[k]
            idx = k_distance.index(min(k_distance))
            selected_feature = k_feature[idx]
            min_distance_feature.append(selected_feature)
            print('The min-distance feature in class {} is {}'.format(
                k, selected_feature))
            print('Its distance is', min(k_distance), 'while the max distance is',
                  max(k_distance))
        return min_distance_feature, feature_predict

    def _MergeClinical(self, dc, cli_df):
        # Merge the DataContainer with a clinical-feature dataframe.
        if 'label' in cli_df.columns.tolist():
            del cli_df['label']
        elif 'Label' in cli_df.columns.tolist():
            del cli_df['Label']
        df = pd.merge(dc.GetFrame(),
                      cli_df,
                      how='left',
                      left_index=True,
                      right_index=True)
        merge_dc = DataContainer()
        merge_dc.SetFrame(df)
        merge_dc.UpdateFrameByData()
        return merge_dc

    def _EstimateAllFeatureDistribution(self, dc):
        feature_name_list = dc.GetFeatureName()
        distribution = dict()
        for i in range(len(feature_name_list)):
            feature = feature_name_list[i]
            feature_data = dc.GetFrame()[feature]
            _, normal_p = normaltest(feature_data, axis=0)
            # TODO: find a better way to distinguish discrete numeric values
            if len(set(feature_data)) < 10:
                distribution[feature] = 'Category'
            elif normal_p > 0.05:
                distribution[feature] = 'Normal'
            else:
                distribution[feature] = 'Non-normal'
        # return a dict {"AGE": 'Normal', 'Gender': 'Category', ... }
        return distribution

    def _EstimateDcFeaturePvalue(self, dc1, dc2, feature_type):
        array1, array2 = dc1.GetArray(), dc2.GetArray()
        p_values = {}
        for index, feature in enumerate(dc1.GetFeatureName()):
            p_values[feature] = GetPvalue(array1[:, index], array2[:, index],
                                          feature_type[feature])

        return p_values

    #################################################
    def VisualizePartsVariance(self,
                               dc: DataContainer,
                               max_k=None,
                               method='SSE',
                               store_folder=None,
                               is_show=True):
        # method must be one of 'SSE' or 'SC'. SSE denotes the within-cluster Sum of Squared Errors (inertia), SC denotes the Silhouette Coefficient.

        data = dc.GetArray()  # get train data
        processed_data = self._DataPreProcess(dc)

        if max_k is None:
            max_k = min(data.shape[0], 50)

        assert (method in ['SSE', 'SC'])

        score = []
        for k in range(2, max_k):
            print('make cluster k=', k)
            estimator = KMeans(n_clusters=k)
            estimator.fit(processed_data)
            if method == 'SSE':
                score.append(estimator.inertia_)
            elif method == 'SC':
                score.append(
                    silhouette_score(processed_data,
                                     estimator.labels_,
                                     metric='euclidean'))
        X = range(2, max_k)
        plt.xlabel('k')
        plt.ylabel(method)
        plt.plot(X, score, 'o-')

        if store_folder and os.path.isdir(store_folder):
            plt.savefig(
                os.path.join(store_folder, 'ClusteringParameterPlot.jpg'))

        if is_show:
            plt.show()

    def VisualizeCluster(self,
                         dimension='2d',
                         select_feature=None,
                         store_folder=None,
                         is_show=True):
        if len(self.feature_labels) != 0 and self.current_dc.GetFrame().size != 0:
            processed_data = self._DataPreProcess(self.current_dc)

            if select_feature is None:
                select_feature = [0, 1, 2]

            assert dimension in ['2d', '3d']
            if dimension == '2d':
                plt.scatter(processed_data[:, select_feature[0]],
                            processed_data[:, select_feature[1]],
                            s=5,
                            c=self.feature_labels)
            elif dimension == '3d':
                ax = plt.figure().add_subplot(111, projection='3d')
                ax.scatter(processed_data[:, select_feature[0]],
                           processed_data[:, select_feature[1]],
                           processed_data[:, select_feature[2]],
                           s=10,
                           c=self.feature_labels,
                           marker='^')
                ax.set_title('Cluster Result 3D')

            if store_folder and os.path.isdir(store_folder):
                plt.savefig(
                    os.path.join(
                        store_folder,
                        'ClusteringProcessPlot{}.jpg'.format(dimension)))
            if is_show:
                plt.show()

    def Run(self,
            dc: DataContainer,
            output_folder: str,
            clinical_feature=None):
        self.current_dc = dc
        selected_feature_names, self.feature_labels = self._Cluster(dc)

        fs = FeatureSelector()
        selected_dc = fs.SelectFeatureByName(dc, selected_feature_names)

        if clinical_feature is not None:
            if isinstance(clinical_feature, str):
                clinical_feature = pd.read_csv(clinical_feature, index_col=0)
            assert (isinstance(clinical_feature, pd.DataFrame))

            merge_dc = self._MergeClinical(selected_dc, clinical_feature)
        else:
            merge_dc = selected_dc

        feature_distribution_type = self._EstimateAllFeatureDistribution(
            merge_dc)  # a dict

        splitter = DataSeparate()

        output_train_dc, output_test_dc = DataContainer(), DataContainer()
        output_p_value = []
        mean_p_value = -1

        for _ in range(self.repeat_times):
            train_dc, test_dc = splitter.RunByTestingPercentage(
                merge_dc, testing_data_percentage=self.test_ratio)
            feature_p_value = self._EstimateDcFeaturePvalue(
                train_dc, test_dc, feature_distribution_type)
            if np.mean(list(feature_p_value.values())) > mean_p_value:
                mean_p_value = np.mean(list(feature_p_value.values()))
                output_train_dc, output_test_dc = train_dc, test_dc
                output_p_value = feature_p_value

        if output_folder is not None and os.path.isdir(output_folder):
            output_train_dc.Save(os.path.join(output_folder, 'train.csv'))
            output_test_dc.Save(os.path.join(output_folder, 'test.csv'))

            p_value_df = pd.DataFrame(output_p_value, index=['P Value'])
            distribute_df = pd.DataFrame(feature_distribution_type,
                                         index=['Distribution'])
            store_df = pd.concat((p_value_df, distribute_df), axis=0)
            store_df.to_csv(os.path.join(output_folder, 'split_info.csv'))
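
_EstimateDcFeaturePvalue relies on a project helper GetPvalue that is not shown in these examples. A minimal sketch of how such a dispatch could look, assuming a chi-square test for 'Category', an independent t-test for 'Normal', and a Mann-Whitney U test for 'Non-normal' features; this is an illustration, not the FAE implementation.

import numpy as np
from scipy.stats import ttest_ind, mannwhitneyu, chi2_contingency

def GetPvalueSketch(group1, group2, feature_type):
    # Illustrative dispatch only; the real GetPvalue in FAE may differ.
    if feature_type == 'Normal':
        _, p = ttest_ind(group1, group2)
    elif feature_type == 'Non-normal':
        _, p = mannwhitneyu(group1, group2, alternative='two-sided')
    else:  # 'Category'
        values = np.unique(np.concatenate([group1, group2]))
        table = [[np.sum(group1 == v) for v in values],
                 [np.sum(group2 == v) for v in values]]
        _, p, _, _ = chi2_contingency(table)
    return p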
Example #27
        if output_folder is not None and os.path.isdir(output_folder):
            output_train_dc.Save(os.path.join(output_folder, 'train.csv'))
            output_test_dc.Save(os.path.join(output_folder, 'test.csv'))

            p_value_df = pd.DataFrame(output_p_value, index=['P Value'])
            distribute_df = pd.DataFrame(feature_distribution_type,
                                         index=['Distribution'])
            store_df = pd.concat((p_value_df, distribute_df), axis=0)
            store_df.to_csv(os.path.join(output_folder, 'split_info.csv'))


if __name__ == '__main__':
    # clinics = pd.read_csv(r'..\..\Demo\simulated_clinics.csv', index_col=0)
    # container = DataContainer()
    # container.Load(r'..\..\Demo\simulated_feature.csv')
    #
    # separator = DataSeparate()
    # train, test = separator.RunByTestingPercentage(container, 0.3, clinic_df=clinics)
    #
    # print(train.GetArray().shape, test.GetArray().shape)
    # print(separator.clinic_split_result)
    cluster_split = DataSplitterByFeatureCluster()
    container = DataContainer()
    container.Load(r'.\all_feature.csv')
    output_path = r'.\output'
    clinical_path = r'.\clinical.csv'
    cluster_split.VisualizePartsVariance(container, store_folder=output_path)
    cluster_split.Run(container, output_path, clinical_feature=clinical_path)
    cluster_split.VisualizeCluster(dimension='2d', store_folder=output_path)
    cluster_split.VisualizeCluster(dimension='3d', store_folder=output_path)
Example #28
class PrepareConnection(QWidget, Ui_Prepare):
    close_signal = pyqtSignal(bool)

    def __init__(self, parent=None):
        super(PrepareConnection, self).__init__(parent)
        self.setupUi(self)
        self.data_container = DataContainer()
        self._filename = os.path.split(__file__)[-1]

        self.buttonLoad.clicked.connect(self.LoadData)
        self.buttonRemoveAndExport.clicked.connect(self.RemoveInvalidValue)

        self.__testing_ref_data_container = DataContainer()
        self.__clinical_ref = pd.DataFrame()

        self.radioSplitRandom.clicked.connect(self.ChangeSeparateMethod)
        self.radioSplitRef.clicked.connect(self.ChangeSeparateMethod)
        self.checkUseClinicRef.clicked.connect(
            self.RandomSeparateButtonUpdates)
        self.loadTestingReference.clicked.connect(
            self.LoadTestingReferenceDataContainer)
        self.clearTestingReference.clicked.connect(
            self.ClearTestingReferenceDataContainer)
        self.loadClinicRef.clicked.connect(self.LoadClinicalRef)
        self.clearClinicRef.clicked.connect(self.ClearClinicalRef)

        self.buttonSave.clicked.connect(self.CheckAndSave)

    def closeEvent(self, QCloseEvent):
        self.close_signal.emit(True)
        QCloseEvent.accept()

    def UpdateTable(self):
        self.tableFeature.setRowCount(self.data_container.GetFrame().shape[0])
        header_name = deepcopy(list(self.data_container.GetFrame().columns))

        min_col = np.min([len(header_name), 100])
        if min_col == 100:
            header_name = header_name[:100]
            header_name[-1] = '...'

        self.tableFeature.setColumnCount(min_col)
        self.tableFeature.setHorizontalHeaderLabels(header_name)
        self.tableFeature.setVerticalHeaderLabels(
            list(map(str,
                     self.data_container.GetFrame().index)))

        for row_index in range(self.data_container.GetFrame().shape[0]):
            for col_index in range(min_col):
                if col_index < 99:
                    self.tableFeature.setItem(
                        row_index, col_index,
                        QTableWidgetItem(
                            str(self.data_container.GetFrame().iloc[
                                row_index, col_index])))
                else:
                    self.tableFeature.setItem(row_index, col_index,
                                              QTableWidgetItem('...'))

        text = "The number of cases: {:d}\n".format(
            self.data_container.GetFrame().shape[0])
        # To process Label temporally
        if 'label' in self.data_container.GetFrame().columns:
            label_name = 'label'
            text += "The number of features: {:d}\n".format(
                self.data_container.GetFrame().shape[1] - 1)
        elif 'Label' in self.data_container.GetFrame().columns:
            label_name = 'Label'
            text += "The number of features: {:d}\n".format(
                self.data_container.GetFrame().shape[1] - 1)
        else:
            label_name = ''
            text += "The number of features: {:d}\n".format(
                self.data_container.GetFrame().shape[1])
        if label_name:
            labels = np.asarray(
                self.data_container.GetFrame()[label_name].values,
                dtype=int)
            if len(np.unique(labels)) == 2:
                positive_number = len(np.where(labels == np.max(labels))[0])
                negative_number = len(labels) - positive_number
                assert (positive_number + negative_number == len(labels))
                text += "The number of positive samples: {:d}\n".format(
                    positive_number)
                text += "The number of negative samples: {:d}\n".format(
                    negative_number)
        self.textInformation.setText(text)

    def SetButtonsState(self, state):
        self.buttonRemoveAndExport.setEnabled(state)
        self.buttonSave.setEnabled(state)
        self.checkExport.setEnabled(state)
        self.radioRemoveNone.setEnabled(state)
        self.radioRemoveNonvalidCases.setEnabled(state)
        self.radioRemoveNonvalidFeatures.setEnabled(state)
        self.radioSplitRandom.setEnabled(state)
        self.radioSplitRef.setEnabled(state)

    def LoadData(self):
        dlg = QFileDialog()
        file_name, _ = dlg.getOpenFileName(self,
                                           'Open CSV file',
                                           filter="csv files (*.csv)")
        if file_name:
            try:
                if self.data_container.Load(file_name, is_update=False):
                    self.UpdateTable()
                    self.SetButtonsState(True)

            except OSError as reason:
                eclog(self._filename).GetLogger().error(
                    'Load CSV Error: {}'.format(reason))
                QMessageBox.about(self, 'Load data Error', reason.__str__())
                print('Error!' + str(reason))
            except ValueError:
                eclog(self._filename).GetLogger().error(
                    'Open CSV Error: {}'.format(file_name))
                QMessageBox.information(self, 'Error',
                                        'The selected data file does not match.')

    def LoadTestingReferenceDataContainer(self):
        dlg = QFileDialog()
        file_name, _ = dlg.getOpenFileName(self,
                                           'Open CSV file',
                                           filter="csv files (*.csv)")
        if file_name:
            try:
                self.__testing_ref_data_container.Load(file_name)
                self.loadTestingReference.setEnabled(False)
                self.clearTestingReference.setEnabled(True)
                self.spinBoxSeparate.setEnabled(False)
            except OSError as reason:
                eclog(self._filename).GetLogger().error(
                    'Load Testing Ref Error: {}'.format(reason))
                print('Error!' + str(reason))
            except ValueError:
                eclog(self._filename).GetLogger().error(
                    'Open CSV Error: {}'.format(file_name))
                QMessageBox.information(self, 'Error',
                                        'The selected data file does not match.')

    def ClearTestingReferenceDataContainer(self):
        del self.__testing_ref_data_container
        self.__testing_ref_data_container = DataContainer()
        self.loadTestingReference.setEnabled(True)
        self.clearTestingReference.setEnabled(False)
        self.spinBoxSeparate.setEnabled(False)

    def LoadClinicalRef(self):
        dlg = QFileDialog()
        file_name, _ = dlg.getOpenFileName(self,
                                           'Open CSV file',
                                           filter="csv files (*.csv)")
        if file_name:
            try:
                self.__clinical_ref = pd.read_csv(file_name, index_col=0)
                if list(self.__clinical_ref.index) != list(
                        self.data_container.GetFrame().index):
                    QMessageBox.information(
                        self, 'Error',
                        'The index of the clinical features is not consistent with the data'
                    )
                    return None
                self.loadClinicRef.setEnabled(False)
                self.clearClinicRef.setEnabled(True)
            except OSError as reason:
                eclog(self._filename).GetLogger().error(
                    'Load Clinical Ref Error: {}'.format(reason))
                QMessageBox.information(self, 'Error',
                                        'Cannot open the file')
            except ValueError:
                eclog(self._filename).GetLogger().error(
                    'Open CSV Error: {}'.format(file_name))
                QMessageBox.information(self, 'Error',
                                        'The selected data file does not match.')
            return None

    def ClearClinicalRef(self):
        del self.__clinical_ref
        self.__clinical_ref = pd.DataFrame()
        self.loadClinicRef.setEnabled(True)
        self.clearClinicRef.setEnabled(False)

    def RemoveInvalidValue(self):
        if not self.data_container.IsEmpty():
            if self.checkExport.isChecked():
                dlg = QFileDialog()
                store_path, _ = dlg.getSaveFileName(self,
                                                    'Save CSV feature files',
                                                    'features.csv',
                                                    filter="CSV files (*.csv)")

                # folder_name = QFileDialog.getExistingDirectory(self, "Save Invalid data")
                # store_path = os.path.join(folder_name, 'invalid_feature.csv')
            else:
                store_path = ''

            if self.radioRemoveNonvalidCases.isChecked():
                self.data_container.RemoveInvalid(store_path=store_path,
                                                  remove_index=REMOVE_CASE)
            elif self.radioRemoveNonvalidFeatures.isChecked():
                self.data_container.RemoveInvalid(store_path=store_path,
                                                  remove_index=REMOVE_FEATURE)
            self.UpdateTable()

    def ChangeSeparateMethod(self):
        if self.radioSplitRandom.isChecked():
            self.spinBoxSeparate.setEnabled(True)
            self.checkUseClinicRef.setEnabled(True)
            self.loadTestingReference.setEnabled(False)
            self.clearTestingReference.setEnabled(False)
        elif self.radioSplitRef.isChecked():
            self.spinBoxSeparate.setEnabled(False)
            self.checkUseClinicRef.setEnabled(False)
            if self.__testing_ref_data_container.IsEmpty():
                self.loadTestingReference.setEnabled(True)
                self.clearTestingReference.setEnabled(False)
            else:
                self.loadTestingReference.setEnabled(False)
                self.clearTestingReference.setEnabled(True)
        self.RandomSeparateButtonUpdates()

    def RandomSeparateButtonUpdates(self):
        if self.checkUseClinicRef.isChecked():
            if self.__clinical_ref.size > 0:
                self.loadClinicRef.setEnabled(False)
                self.clearClinicRef.setEnabled(True)
            else:
                self.loadClinicRef.setEnabled(True)
                self.clearClinicRef.setEnabled(False)
        else:
            self.loadClinicRef.setEnabled(False)
            self.clearClinicRef.setEnabled(False)

    def CheckAndSave(self):
        if self.data_container.IsEmpty():
            QMessageBox.warning(self, "Warning", "There is no data",
                                QMessageBox.Ok)
            return None

        if self.data_container.HasInvalidNumber():
            QMessageBox.warning(self, "Warning", "There are nan items",
                                QMessageBox.Ok)
            non_valid_number_index = self.data_container.FindInvalidNumberIndex()
            old_edit_triggers = self.tableFeature.editTriggers()
            self.tableFeature.setEditTriggers(QAbstractItemView.CurrentChanged)
            self.tableFeature.setCurrentCell(non_valid_number_index[0],
                                             non_valid_number_index[1])
            self.tableFeature.setEditTriggers(old_edit_triggers)
            return None

        self.data_container.UpdateDataByFrame()

        if not self.data_container.IsBinaryLabel():
            QMessageBox.warning(self, "Warning", "There are not 2 Labels",
                                QMessageBox.Ok)
            return None

        remove_features_with_same_value = RemoveSameFeatures()
        self.data_container = remove_features_with_same_value.Run(
            self.data_container)

        if self.radioSplitRandom.isChecked() or self.radioSplitRef.isChecked():
            folder_name = QFileDialog.getExistingDirectory(self, "Save data")
            if folder_name != '':
                data_separate = DataSeparate.DataSeparate()
                try:
                    if self.__testing_ref_data_container.IsEmpty():
                        testing_data_percentage = self.spinBoxSeparate.value()
                        if self.__clinical_ref.size == 0:
                            training_data_container, _ = \
                                data_separate.RunByTestingPercentage(self.data_container,
                                                                     testing_data_percentage,
                                                                     store_folder=folder_name)
                        else:
                            training_data_container, _ = \
                                data_separate.RunByTestingPercentage(self.data_container,
                                                                     testing_data_percentage,
                                                                     clinic_df=self.__clinical_ref,
                                                                     store_folder=folder_name)
                    else:
                        training_data_container, _ = \
                            data_separate.RunByTestingReference(self.data_container,
                                                                self.__testing_ref_data_container,
                                                                folder_name)
                        if training_data_container.IsEmpty():
                            QMessageBox.information(
                                self, 'Error',
                                'The testing reference does not match; please check that the '
                                'testing cases exist in the current data')
                            return None
                    os.system("explorer.exe {:s}".format(
                        os.path.normpath(folder_name)))
                except Exception as e:
                    content = 'PrepareConnection: splitting failed'
                    eclog(self._filename).GetLogger().error('Split Error:  ' +
                                                            e.__str__())
                    QMessageBox.about(self, content, e.__str__())

        else:
            file_name, _ = QFileDialog.getSaveFileName(
                self, "Save data", filter="csv files (*.csv)")
            if file_name:
                self.data_container.Save(file_name)
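
A minimal sketch of launching the widget above on its own for a quick check (this assumes PyQt5 is installed and that the module defining PrepareConnection also imports the generated Ui_Prepare form; the entry point below is illustrative and not part of the original source):

import sys
from PyQt5.QtWidgets import QApplication

if __name__ == '__main__':
    # Create the Qt event loop, show the data-preparation widget, and block
    # until the window is closed.
    app = QApplication(sys.argv)
    window = PrepareConnection()
    window.show()
    sys.exit(app.exec_())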
예제 #29
0
    def ClearTestingReferenceDataContainer(self):
        del self.__testing_ref_data_container
        self.__testing_ref_data_container = DataContainer()
        self.loadTestingReference.setEnabled(True)
        self.clearTestingReference.setEnabled(False)
        self.spinBoxSeparate.setEnabled(False)
예제 #30
0
            vif_dict[exog] = vif

            # calculate tolerance
            tolerance = 1 - r_squared
            tolerance_dict[exog] = tolerance

        # return VIF DataFrame
        df_vif = pd.DataFrame({'VIF': vif_dict, 'Tolerance': tolerance_dict})

        return df_vif


if __name__ == '__main__':
    data_path = r'..\..\Demo\train_numeric_feature.csv'
    from BC.DataContainer.DataContainer import DataContainer
    from BC.FeatureAnalysis.Normalizer import NormalizerZeroCenter
    pca = DimensionReductionByPCA()

    dc = DataContainer()
    dc.Load(data_path)
    dc = NormalizerZeroCenter.Run(dc)
    # dc = pca.Run(dc)

    df = pd.DataFrame(dc.GetArray(),
                      index=dc.GetCaseName(),
                      columns=dc.GetFeatureName())
    dr = DimensionReductionByVIF()

    new_df = dr.CalculateVIF(df)

    print(dc.GetArray().shape, new_df.shape)
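
The fragment above shows only the tail of CalculateVIF: for each feature it stores the variance-inflation factor and the tolerance (1 - R^2), where R^2 comes from regressing that feature on all the others and VIF = 1 / (1 - R^2). The following is a standalone sketch of that computation with scikit-learn, given as an illustration under those assumptions rather than the original implementation, whose regression step is elided above:

import pandas as pd
from sklearn.linear_model import LinearRegression


def calculate_vif_sketch(df: pd.DataFrame) -> pd.DataFrame:
    """Compute VIF and tolerance for every column of a numeric DataFrame."""
    vif_dict, tolerance_dict = {}, {}
    for exog in df.columns:
        # Regress the current feature on all remaining features and take R^2.
        others = df.columns.drop(exog)
        model = LinearRegression().fit(df[others], df[exog])
        r_squared = model.score(df[others], df[exog])
        vif_dict[exog] = 1.0 / (1.0 - r_squared)   # VIF = 1 / (1 - R^2)
        tolerance_dict[exog] = 1.0 - r_squared     # Tolerance = 1 - R^2
    return pd.DataFrame({'VIF': vif_dict, 'Tolerance': tolerance_dict})

For example, calling calculate_vif_sketch(df) on the DataFrame built in the __main__ block above should produce one VIF/Tolerance row per feature, mirroring the shape of the result printed there.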