コード例 #1
0
ファイル: FeatureRelationship.py プロジェクト: zhukequan/FAE
def DrawFeatureRelationshipAccordingToCsvFile(file_path,
                                              selected_feature_name_list,
                                              label_name_list,
                                              store_path=''):
    '''
    Draw the relationship among 1, 2 or 3 features stored in a csv file.

    The csv file is loaded into a DataContainer, normalized with
    UsualNormalize(), and the columns of the selected features are handed to
    DrawValueRelationship together with the labels.

    :param file_path: the file path of the csv file
    :param selected_feature_name_list: the names of the features that would be drawn (1, 2 or 3 names)
    :param label_name_list: the name of the label. e.g. ['non-cancer', 'cancer']
    :param store_path: The store path, supporting jpg and eps format.
    :return: None. Nothing is drawn when the number of selected features is
        unsupported or when a selected feature is not in the data container.
    '''
    data_container = DataContainer()
    data_container.Load(file_path)
    data_container.UsualNormalize()
    data, label, feature_name, case_name = data_container.GetData()

    if not 1 <= len(selected_feature_name_list) <= 3:
        print(
            "Please check the length of the feature list. It can only show the relationship of the 1, 2, or 3 features"
        )
        # Stop here: the message above says that other lengths cannot be shown.
        return

    try:
        index = [feature_name.index(t) for t in selected_feature_name_list]
    except ValueError:
        # index() raises ValueError for a name that is not present.
        print('The selected feature is not in the data container.')
        return

    # One column of the data matrix per selected feature.
    result_data = [data[:, sub_index] for sub_index in index]
    DrawValueRelationship(result_data, selected_feature_name_list, label,
                          label_name_list, store_path)
コード例 #2
0
ファイル: Description.py プロジェクト: Eggiverse/FAE
def GenerateDescription():
    """Run Description on the example data set and one stored pipeline.

    Loads the example feature csv into a DataContainer, loads a pipeline
    description into a OnePipeline, and passes both to Description.Run
    together with two example folders. All paths are hard-coded.
    """
    feature_csv = r'..\..\Example\numeric_feature.csv'
    pipeline_csv = r'C:\MyCode\FAEGitHub\FAE\Example\report_temp\NormUnit_Cos_ANOVA_5_SVM\pipeline_info.csv'
    temp_folder = r'..\..\Example\report_temp'
    report_folder = r'..\..\Example\report'

    train_dc = DataContainer()
    train_dc.Load(feature_csv)

    pipeline = OnePipeline()
    pipeline.LoadPipeline(pipeline_csv)

    Description().Run(train_dc, pipeline, temp_folder, report_folder)
コード例 #3
0
def TestNewData(NewDataCsv, model_folder, result_save_path=''):
    '''
    Apply the stored training information to a new feature matrix and report the metrics.

    The training information is read with LoadTrainInfo(model_folder). Its
    'normalizer' entry transforms the new data, and its 'classifier' entry
    provides the model whose predict_proba output (column 1) is used as the
    prediction. No feature selection step is applied here.

    :param NewDataCsv: New radiomics feature matrix csv file path
    :param model_folder: The trained model path
    :param result_save_path: Optional folder. When non-empty, the metrics,
        'test_predict.npy', 'test_label.npy' and 'test_info.csv' are written there.
    :return: the metric returned by EstimateMetirc(predict, label)
    '''
    trained = LoadTrainInfo(model_folder)

    container = DataContainer()
    container.Load(NewDataCsv)

    # Normalization with the normalizer stored in the training information.
    container = trained['normalizer'].Transform(container)

    # Model: prediction is column 1 of predict_proba.
    classifier = trained['classifier']
    classifier.SetDataContainer(container)
    predict = classifier.GetModel().predict_proba(container.GetArray())[:, 1]

    label = container.GetLabel()
    case_name = container.GetCaseName()

    # Header row followed by one row per case.
    rows = [['CaseName', 'Pred', 'Label']]
    rows.extend([case_name[i], predict[i], label[i]]
                for i in range(len(label)))

    metric = EstimateMetirc(predict, label)
    info = dict(metric)
    cv = CrossValidation()

    print(metric)
    print('\t')

    if not result_save_path:
        return metric

    cv.SaveResult(info, result_save_path)
    np.save(os.path.join(result_save_path, 'test_predict.npy'), predict)
    np.save(os.path.join(result_save_path, 'test_label.npy'), label)

    info_path = os.path.join(result_save_path, 'test_info.csv')
    with open(info_path, 'w', newline='') as csvfile:
        csv.writer(csvfile).writerows(rows)

    return metric
コード例 #4
0
ファイル: DataSeparate.py プロジェクト: yetong170728/FAE
                os.path.join(store_folder, 'train_numeric_feature.csv'))
            df_training = pd.DataFrame(
                data=self._training_index,
                columns=['training_index'],
                index=[case_name[index] for index in self._training_index])

            df_training.to_csv(os.path.join(store_folder,
                                            'training_index.csv'),
                               sep=',',
                               quotechar='"')

            test_data_container.Save(
                os.path.join(store_folder, 'test_numeric_feature.csv'))
            df_testing = pd.DataFrame(
                data=testing_index_list,
                columns=['training_index'],
                index=[case_name[index] for index in testing_index_list])
            df_testing.to_csv(os.path.join(store_folder, 'testing_index.csv'),
                              sep=',',
                              quotechar='"')

        return train_data_container, test_data_container


if __name__ == '__main__':
    # Manual demo: load the example feature table and run DataSeparate on it,
    # storing the output in the example folder.
    example_container = DataContainer()
    example_container.Load(r'..\..\Example\numeric_feature.csv')

    DataSeparate().Run(example_container, store_folder=r'..\..\Example')
コード例 #5
0
ファイル: Normalizer.py プロジェクト: Yukyin/FAE
            if not is_test:
                data_container.Save(
                    os.path.join(store_folder,
                                 'zero_center_unit_training_feature.csv'))
                self.Save(store_path=os.path.join(
                    store_folder,
                    'zero_center_unit_normalization_training.csv'),
                          feature_name=data_container.GetFeatureName())
            else:
                data_container.Save(
                    os.path.join(
                        store_folder,
                        'zero_center_unit_normalized_testing_feature.csv'))
        return data_container

    def GetDescription(self):
        """Return a fixed English sentence describing this normalization."""
        return ("We applied the normalization on the feature matrix.  Each feature vector was subtracted by the mean "
                "value of the vector and was divided by the length of it. ")


if __name__ == '__main__':
    # Manual demo: normalize the example feature table and store the result
    # in the one_pipeline example folder.
    from FAE.DataContainer.DataContainer import DataContainer

    example_container = DataContainer()
    csv_path = os.path.abspath(r'..\..\Example\numeric_feature.csv')
    print(csv_path)
    example_container.Load(csv_path)

    NormalizerZeroCenterAndUnit().Run(
        example_container, store_folder=r'..\..\Example\one_pipeline')
コード例 #6
0
ファイル: PrepareConnection.py プロジェクト: zhukequan/FAE
class PrepareConnection(QWidget, Ui_Prepare):
    """Widget that connects the Ui_Prepare form to a DataContainer.

    The form's buttons load a csv file into the container, remove invalid
    cases or features, and save the data either as one csv file or split
    through DataSeparate.
    """

    def __init__(self, parent=None):
        super(PrepareConnection, self).__init__(parent)
        self.setupUi(self)
        self.data_container = DataContainer()

        self.buttonLoad.clicked.connect(self.LoadData)
        self.buttonRemove.clicked.connect(self.RemoveNonValidValue)
        self.checkSeparate.clicked.connect(self.SetSeparateStatus)
        self.spinBoxSeparate.setEnabled(False)

        self.buttonSave.clicked.connect(self.CheckAndSave)

    def UpdateTable(self):
        """Refresh the feature table and the summary text from the container.

        Does nothing when the container's array is empty. Column 0 shows the
        label; the remaining columns show the feature values.
        """
        # Fetch every view once instead of calling the getters again for each
        # table cell.
        array = self.data_container.GetArray()
        if array.size == 0:
            return

        case_names = self.data_container.GetCaseName()
        feature_names = self.data_container.GetFeatureName()
        labels = self.data_container.GetLabel()

        # Build a new header list so the container's own list is not modified.
        header_name = ['Label'] + list(feature_names)

        self.tableFeature.setRowCount(len(case_names))
        self.tableFeature.setColumnCount(len(header_name))
        self.tableFeature.setHorizontalHeaderLabels(header_name)
        self.tableFeature.setVerticalHeaderLabels(list(map(str, case_names)))

        for row_index in range(len(case_names)):
            self.tableFeature.setItem(
                row_index, 0, QTableWidgetItem(str(labels[row_index])))
            for col_index in range(1, len(header_name)):
                # Table column k shows array column k - 1 (column 0 is the label).
                self.tableFeature.setItem(
                    row_index, col_index,
                    QTableWidgetItem(str(array[row_index, col_index - 1])))

        text = "The number of cases: {:d}\n".format(len(case_names))
        text += "The number of features: {:d}\n".format(len(feature_names))
        if len(np.unique(labels)) == 2:
            # With two distinct labels, the larger value counts as positive.
            positive_number = len(np.where(labels == np.max(labels))[0])
            negative_number = len(labels) - positive_number
            text += "The number of positive samples: {:d}\n".format(
                positive_number)
            text += "The number of negative samples: {:d}\n".format(
                negative_number)
        self.textInformation.setText(text)

    def LoadData(self):
        """Ask for a csv file, load it into the container and refresh the table."""
        dlg = QFileDialog()
        file_name, _ = dlg.getOpenFileName(self,
                                           'Open SCV file',
                                           filter="csv files (*.csv)")
        if not file_name:
            # The dialog was cancelled; there is nothing to load.
            return

        try:
            self.data_container.Load(file_name)
        except Exception as reason:
            # Keep the widget alive on a bad file, but report why it failed.
            print('Error: {}'.format(reason))

        self.UpdateTable()

    def RemoveNonValidValue(self):
        """Remove invalid cases or features, depending on the checked radio button."""
        if self.radioRemoveNonvalidCases.isChecked():
            self.data_container.RemoveUneffectiveCases()
        elif self.radioRemoveNonvalidFeatures.isChecked():
            self.data_container.RemoveUneffectiveFeatures()

        self.UpdateTable()

    def SetSeparateStatus(self):
        """Enable the separation spin box only while the separate box is checked."""
        self.spinBoxSeparate.setEnabled(self.checkSeparate.isChecked())

    def CheckAndSave(self):
        """Validate the loaded data and save it.

        Warns when there is no data or when non-valid numbers are present (and
        selects the offending table cell). Otherwise the data is either split
        with DataSeparate into a chosen folder or saved to a chosen csv file.
        """
        if self.data_container.IsEmpty():
            QMessageBox.warning(self, "Warning", "There is no data",
                                QMessageBox.Ok)
        elif self.data_container.HasNonValidNumber():
            QMessageBox.warning(self, "Warning", "There are nan items",
                                QMessageBox.Ok)
            non_valid_number_Index = self.data_container.FindNonValidNumberIndex(
            )
            # Temporarily switch the edit trigger while moving the current cell
            # to the non-valid entry, then restore the previous setting.
            old_edit_triggers = self.tableFeature.editTriggers()
            self.tableFeature.setEditTriggers(QAbstractItemView.CurrentChanged)
            # +1 because table column 0 holds the label.
            self.tableFeature.setCurrentCell(non_valid_number_Index[0],
                                             non_valid_number_Index[1] + 1)
            self.tableFeature.setEditTriggers(old_edit_triggers)
        elif self.checkSeparate.isChecked():
            percentage_testing_data = self.spinBoxSeparate.value()
            folder_name = QFileDialog.getExistingDirectory(self, "Save data")
            if folder_name != '':
                data_separate = DataSeparate.DataSeparate(
                    percentage_testing_data)
                data_separate.Run(self.data_container, folder_name)
        else:
            file_name, _ = QFileDialog.getSaveFileName(
                self, "Save data", filter="csv files (*.csv)")
            if file_name != '':
                self.data_container.Save(file_name)
コード例 #7
0
ファイル: CrossValidation.py プロジェクト: amberjiest/FAE
            self.classifier.Save(store_folder)
            self.SaveResult(info, store_folder)

        return train_cv_metric, val_cv_metric, test_metric, all_train_metric


if __name__ == '__main__':
    from FAE.DataContainer.DataContainer import DataContainer
    from FAE.FeatureAnalysis.Normalizer import NormalizerZeroCenter
    from FAE.FeatureAnalysis.Classifier import SVM, LR, LDA, LRLasso, GaussianProcess, NaiveBayes, DecisionTree, RandomForest, AE, AdaBoost
    import numpy as np

    train_data_container = DataContainer()
    train_data_container.Load(
        r'C:\MyCode\FAEGitHub\FAE\Example\withoutshape\non_balance_features.csv'
    )

    normalizer = NormalizerZeroCenter()
    train_data_container = normalizer.Run(train_data_container)

    data = train_data_container.GetArray()
    label = np.asarray(train_data_container.GetLabel())

    #     param_list = [
    # {"hidden_layer_sizes": [(30,), (100,)],
    # "solver": ["adam"],
    # "alpha": [0.0001, 0.001],
    # "learning_rate_init": [0.001, 0.01]}
    # ]
    #     from sklearn.model_selection import ParameterGrid
コード例 #8
0
class PrepareConnection(QWidget, Ui_Prepare):
    """Widget that connects the Ui_Prepare form to a DataContainer.

    The form's buttons load a csv file, remove invalid cases or features,
    optionally load a testing reference, and save the data after removing
    identical features and running the selected data balance method.
    """

    def __init__(self, parent=None):
        super(PrepareConnection, self).__init__(parent)
        self.setupUi(self)
        self.data_container = DataContainer()

        self.buttonLoad.clicked.connect(self.LoadData)
        self.buttonRemove.clicked.connect(self.RemoveNonValidValue)
        self.loadTestingReference.clicked.connect(
            self.LoadTestingReferenceDataContainer)
        self.clearTestingReference.clicked.connect(
            self.ClearTestingReferenceDataContainer)
        self.__testing_ref_data_container = DataContainer()
        self.checkSeparate.clicked.connect(self.SetSeparateStatus)

        self.spinBoxSeparate.setEnabled(False)
        self.logger = eclog(os.path.split(__file__)[-1]).GetLogger()

        self.loadTestingReference.setEnabled(False)
        self.clearTestingReference.setEnabled(False)

        self.buttonSave.clicked.connect(self.CheckAndSave)

    def UpdateTable(self):
        """Refresh the feature table and the summary text from the container.

        Does nothing when the container's array is empty. Column 0 shows the
        label; the remaining columns show the feature values.
        """
        # Fetch every view once instead of calling the getters again for each
        # table cell.
        array = self.data_container.GetArray()
        if array.size == 0:
            return

        case_names = self.data_container.GetCaseName()
        feature_names = self.data_container.GetFeatureName()
        labels = self.data_container.GetLabel()

        # Build a new header list so the container's own list is not modified.
        header_name = ['Label'] + list(feature_names)

        self.tableFeature.setRowCount(len(case_names))
        self.tableFeature.setColumnCount(len(header_name))
        self.tableFeature.setHorizontalHeaderLabels(header_name)
        self.tableFeature.setVerticalHeaderLabels(list(map(str, case_names)))

        for row_index in range(len(case_names)):
            self.tableFeature.setItem(
                row_index, 0, QTableWidgetItem(str(labels[row_index])))
            for col_index in range(1, len(header_name)):
                # Table column k shows array column k - 1 (column 0 is the label).
                self.tableFeature.setItem(
                    row_index, col_index,
                    QTableWidgetItem(str(array[row_index, col_index - 1])))

        text = "The number of cases: {:d}\n".format(len(case_names))
        text += "The number of features: {:d}\n".format(len(feature_names))
        if len(np.unique(labels)) == 2:
            # With two distinct labels, the larger value counts as positive.
            positive_number = len(np.where(labels == np.max(labels))[0])
            negative_number = len(labels) - positive_number
            text += "The number of positive samples: {:d}\n".format(
                positive_number)
            text += "The number of negative samples: {:d}\n".format(
                negative_number)
        self.textInformation.setText(text)

    def LoadData(self):
        """Ask for a csv file, load it and refresh the table.

        Returns without any change when the dialog is cancelled. After a load
        attempt (successful or not) the remove and save buttons are enabled.
        """
        dlg = QFileDialog()
        file_name, _ = dlg.getOpenFileName(self,
                                           'Open SCV file',
                                           filter="csv files (*.csv)")
        if not file_name:
            # The dialog was cancelled; do not try to load an empty path.
            return

        try:
            self.data_container.Load(file_name)
            self.logger.info('Open the file ' + file_name + ' Succeed.')
        except OSError as reason:
            # error() instead of log(): log() needs a level as its first
            # argument (assuming a standard logging.Logger), and error() is
            # what the ValueError branch already uses.
            self.logger.error('Open SCV file Error, The reason is ' +
                              str(reason))
            QMessageBox.about(self, 'Load data Error', str(reason))
            print('Error!' + str(reason))
        except ValueError:
            self.logger.error('Open SCV file ' + file_name +
                              ' Failed. because of value error.')
            QMessageBox.information(self, 'Error',
                                    'The selected data file mismatch.')
        self.UpdateTable()

        self.buttonRemove.setEnabled(True)
        self.buttonSave.setEnabled(True)

    def LoadTestingReferenceDataContainer(self):
        """Ask for a csv file and load it as the testing reference.

        On success the load button and the percentage spin box are disabled
        and the clear button is enabled. Nothing changes when the dialog is
        cancelled.
        """
        dlg = QFileDialog()
        file_name, _ = dlg.getOpenFileName(self,
                                           'Open SCV file',
                                           filter="csv files (*.csv)")
        if not file_name:
            # The dialog was cancelled; keep the current state.
            return

        try:
            self.__testing_ref_data_container.Load(file_name)
            self.loadTestingReference.setEnabled(False)
            self.clearTestingReference.setEnabled(True)
            self.spinBoxSeparate.setEnabled(False)
        except OSError as reason:
            # See LoadData: error() rather than log() without a level.
            self.logger.error('Load Testing Reference Error: ' + str(reason))
            print('Error!' + str(reason))
        except ValueError:
            self.logger.error('Open SCV file ' + file_name +
                              ' Failed. because of value error.')
            QMessageBox.information(self, 'Error',
                                    'The selected data file mismatch.')

    def ClearTestingReferenceDataContainer(self):
        """Replace the testing reference with an empty DataContainer."""
        del self.__testing_ref_data_container
        self.__testing_ref_data_container = DataContainer()
        self.loadTestingReference.setEnabled(True)
        self.clearTestingReference.setEnabled(False)
        # NOTE(review): the spin box stays disabled here although an empty
        # reference makes CheckAndSave use its value - confirm this is intended.
        self.spinBoxSeparate.setEnabled(False)

    def RemoveNonValidValue(self):
        """Remove invalid cases or features, depending on the checked radio button."""
        if self.radioRemoveNonvalidCases.isChecked():
            self.data_container.RemoveUneffectiveCases()
        elif self.radioRemoveNonvalidFeatures.isChecked():
            self.data_container.RemoveUneffectiveFeatures()

        self.UpdateTable()

    def SetSeparateStatus(self):
        """Enable or disable the separation controls with the separate check box."""
        separate = self.checkSeparate.isChecked()
        self.spinBoxSeparate.setEnabled(separate)
        self.loadTestingReference.setEnabled(separate)
        # The clear button is only enabled once a reference has been loaded.
        self.clearTestingReference.setEnabled(False)

    def CheckAndSave(self):
        """Validate the loaded data, then balance and save it.

        Warns (and selects the offending table cell) when the label is not
        binary or non-valid numbers are present. Otherwise identical features
        are removed, a data balance method is chosen from the radio buttons,
        and the data is either separated into a chosen folder (by percentage,
        or by the testing reference when one is loaded) or stored in a chosen
        csv file.
        """
        if self.data_container.IsEmpty():
            QMessageBox.warning(self, "Warning", "There is no data",
                                QMessageBox.Ok)
        elif not self.data_container.IsBinaryLabel():
            QMessageBox.warning(self, "Warning", "There are not 2 Labels",
                                QMessageBox.Ok)
            non_valid_number_Index = self.data_container.FindNonValidLabelIndex(
            )
            # Temporarily switch the edit trigger while moving the current cell
            # to the offending label (column 0), then restore the setting.
            old_edit_triggers = self.tableFeature.editTriggers()
            self.tableFeature.setEditTriggers(QAbstractItemView.CurrentChanged)
            self.tableFeature.setCurrentCell(non_valid_number_Index, 0)
            self.tableFeature.setEditTriggers(old_edit_triggers)
        elif self.data_container.HasNonValidNumber():
            QMessageBox.warning(self, "Warning", "There are nan items",
                                QMessageBox.Ok)
            non_valid_number_Index = self.data_container.FindNonValidNumberIndex(
            )
            old_edit_triggers = self.tableFeature.editTriggers()
            self.tableFeature.setEditTriggers(QAbstractItemView.CurrentChanged)
            # +1 because table column 0 holds the label.
            self.tableFeature.setCurrentCell(non_valid_number_Index[0],
                                             non_valid_number_Index[1] + 1)
            self.tableFeature.setEditTriggers(old_edit_triggers)
        else:
            remove_features_with_same_value = RemoveSameFeatures()
            self.data_container = remove_features_with_same_value.Run(
                self.data_container)

            # Plain DataBalance unless one of the sampling radios is checked.
            data_balance = DataBalance()
            if self.radioDownSampling.isChecked():
                data_balance = DownSampling()
            elif self.radioUpSampling.isChecked():
                data_balance = UpSampling()
            elif self.radioSmote.isChecked():
                data_balance = SmoteSampling()

            if self.checkSeparate.isChecked():
                folder_name = QFileDialog.getExistingDirectory(
                    self, "Save data")
                if folder_name != '':
                    data_separate = DataSeparate.DataSeparate()
                    try:
                        if self.__testing_ref_data_container.IsEmpty():
                            # No reference loaded: split by the spin box value.
                            testing_data_percentage = self.spinBoxSeparate.value(
                            )
                            training_data_container, _, = data_separate.RunByTestingPercentage(
                                self.data_container, testing_data_percentage,
                                folder_name)
                        else:
                            training_data_container, _, = data_separate.RunByTestingReference(
                                self.data_container,
                                self.__testing_ref_data_container, folder_name)
                            if training_data_container.IsEmpty():
                                QMessageBox.information(
                                    self, 'Error',
                                    'The testing data does not mismatch, please check the testing data '
                                    'really exists in current data')
                                return None
                        # Only the training part is balanced.
                        data_balance.Run(training_data_container,
                                         store_path=folder_name)
                    except Exception as e:
                        content = 'PrepareConnection, splitting failed: '
                        self.logger.error('{}{}'.format(content, str(e)))
                        QMessageBox.about(self, content, str(e))

            else:
                file_name, _ = QFileDialog.getSaveFileName(
                    self, "Save data", filter="csv files (*.csv)")
                if file_name != '':
                    data_balance.Run(self.data_container, store_path=file_name)
コード例 #9
0
        input_data_container = data_container
        for fs in self.__selector_list:
            output = fs.Run(input_data_container, store_folder)
            input_data_container = output
        return output


################################################################

if __name__ == '__main__':
    import os
    print(os.getcwd())
    from FAE.DataContainer.DataContainer import DataContainer
    data_container = DataContainer()
    print(os.path.abspath(r'..\..\Example\numeric_feature.csv'))
    data_container.Load(r'..\..\Example\numeric_feature.csv')
    # data_container.UsualNormalize()

    print(data_container.GetArray().shape)
    print(data_container.GetFeatureName())

    fs = FeatureSelectBySubName(['shape', 'ADC'])

    output = fs.Run(data_container)
    print(output.GetFeatureName())

    # fs1 = RemoveNonNumericFeature()
    # fs1.SetDataContainer(data_container)
    # non_number_data_container = fs1.Run()
    #
    # fs2 = FeatureSelectByANOVA(10)
コード例 #10
0
ファイル: FeatureSelector.py プロジェクト: zhongyi80/FAE
            input_data_container = output
        return output

    def SaveInfo(self, store_folder, all_features):
        """Forward SaveInfo(store_folder, all_features) to every selector in the list, in order."""
        for selector in self.__selector_list:
            selector.SaveInfo(store_folder, all_features)

    def SaveDataContainer(self, data_container, store_folder, store_key):
        """Forward SaveDataContainer(...) with the same arguments to every selector in the list, in order."""
        for selector in self.__selector_list:
            selector.SaveDataContainer(data_container, store_folder, store_key)


################################################################

if __name__ == '__main__':
    # Manual demo: normalize the demo training table, run the PCC dimension
    # reduction and the Kruskal-Wallis selector (5 features), printing the
    # array shape after each of the last two steps.
    from FAE.DataContainer.DataContainer import DataContainer
    from FAE.FeatureAnalysis.Normalizer import NormalizerZeroCenter
    from FAE.FeatureAnalysis.DimensionReduction import DimensionReductionByPCC

    raw_container = DataContainer()
    pcc_reducer = DimensionReductionByPCC()
    kw_selector = FeatureSelectByKruskalWallis(selected_feature_number=5)

    raw_container.Load(r'..\..\Demo\train_numeric_feature.csv')

    normalized = NormalizerZeroCenter.Run(raw_container)

    reduced = pcc_reducer.Run(normalized)
    print(reduced.GetArray().shape)

    selected = kw_selector.Run(reduced)
    print(selected.GetArray().shape)
コード例 #11
0
ファイル: DimensionReduction.py プロジェクト: ttddtd/FAE
            vif_dict[exog] = vif

            # calculate tolerance
            tolerance = 1 - r_squared
            tolerance_dict[exog] = tolerance

        # return VIF DataFrame
        df_vif = pd.DataFrame({'VIF': vif_dict, 'Tolerance': tolerance_dict})

        return df_vif


if __name__ == '__main__':
    # Manual demo: normalize the demo training table and compute the VIF
    # table of its features, then print both shapes.
    data_path = r'..\..\Demo\train_numeric_feature.csv'
    from FAE.DataContainer.DataContainer import DataContainer
    from FAE.FeatureAnalysis.Normalizer import NormalizerZeroCenter
    pca = DimensionReductionByPCA()  # constructed as in the demo; the PCA step itself is not run

    container = DataContainer()
    container.Load(data_path)
    container = NormalizerZeroCenter.Run(container)

    feature_frame = pd.DataFrame(container.GetArray(),
                                 index=container.GetCaseName(),
                                 columns=container.GetFeatureName())

    vif_frame = DimensionReductionByVIF().CalculateVIF(feature_frame)

    print(container.GetArray().shape, vif_frame.shape)
コード例 #12
0
ファイル: OneHotFeature.py プロジェクト: yuruiqi/FAE
from copy import deepcopy
from FAE.DataContainer.DataContainer import DataContainer


class FeatureEncodingOneHot():
    """One-hot encode selected feature columns of a DataContainer."""

    def __init__(self):
        pass

    def OneHotOneColumn(self, data_container, feature_list):
        """Return a new DataContainer with the given features one-hot encoded.

        :param data_container: container providing GetFrame() and GetFeatureName()
        :param feature_list: names of the feature columns to encode; each must
            be one of the container's feature names
        :return: a new DataContainer whose frame is the result of
            pandas.get_dummies(frame, columns=feature_list)
        :raises AssertionError: when a requested feature is not in the container
        """
        # Imported here because this module only imports pandas inside its
        # __main__ guard, so `pd` is undefined when the class is imported
        # from another module.
        import pandas as pd

        info = data_container.GetFrame()
        feature_name = data_container.GetFeatureName()
        for feature in feature_list:
            assert feature in feature_name, \
                'Feature {} is not in the data container.'.format(feature)

        new_info = pd.get_dummies(info, columns=feature_list)
        new_data = DataContainer()
        new_data.SetFrame(new_info)
        return new_data



if __name__ == '__main__':
    # Manual demo: one-hot encode several columns of a local test csv and
    # write the encoded frame next to it.
    import pandas as pd

    source_container = DataContainer()
    source_container.Load(r'c:\Users\yangs\Desktop\test.csv')
    source_frame = source_container.GetFrame()

    encoded_columns = ['bGs', 'PIRADS', 't2score', 'DWIscore', 'MR_stage']
    encoded_frame = pd.get_dummies(source_frame, columns=encoded_columns)
    encoded_frame.to_csv(r'c:\Users\yangs\Desktop\test_onehot.csv')
コード例 #13
0
                            test_auc_info[index] for index in column_list
                        ]
                        test_df.loc[feature_selector.GetName() + '-' +
                                    classifier.GetName()] = test_save_info
                        test_df.to_csv(test_store_path)

                # return val_return_list, test_return_list


if __name__ == '__main__':
    print(os.getcwd())
    from DataContainer.DataContainer import DataContainer
    import pandas as pd

    data_container = DataContainer()
    data_container.Load(r'..\tempResult\NumericFeature.csv')
    data_container.UsualNormalize()

    df = pd.DataFrame(columns=column_list)

    # Set Feature Selector List
    feature_selector_list = []
    feature_selector_list.append(
        FeatureSelectPipeline([
            RemoveSameFeatures(),
            RemoveCosSimilarityFeatures(),
            FeatureSelectByANOVA()
        ]))
    feature_selector_list.append(
        FeatureSelectPipeline([
            RemoveSameFeatures(),
コード例 #14
0
ファイル: Pipelines.py プロジェクト: zhongyi80/FAE
                                cv_val_metric['{}_{}'.format(CV_VAL, AUC_STD)]
                            self._AddOneMetric(cv_val_metric, os.path.join(cls_store_folder, 'metrics.csv'))
                            self._MergeOneMetric(cv_val_metric, CV_VAL, model_name)

        self.total_metric[CV_TRAIN].to_csv(os.path.join(store_folder, '{}_results.csv'.format(CV_TRAIN)))
        self.total_metric[CV_VAL].to_csv(os.path.join(store_folder, '{}_results.csv'.format(CV_VAL)))


if __name__ == '__main__':
    manager = PipelinesManager()

    index_dict = Index2Dict()

    train = DataContainer()
    test = DataContainer()
    train.Load(r'C:\Users\yangs\Desktop\train_numeric_feature.csv')
    test.Load(r'C:\Users\yangs\Desktop\test_numeric_feature.csv')

    faps = PipelinesManager(balancer=index_dict.GetInstantByIndex('UpSampling'),
                            normalizer_list=[index_dict.GetInstantByIndex('Mean')],
                            dimension_reduction_list=[index_dict.GetInstantByIndex('PCC')],
                            feature_selector_list=[index_dict.GetInstantByIndex('ANOVA')],
                            feature_selector_num_list=list(np.arange(1, 18)),
                            classifier_list=[index_dict.GetInstantByIndex('SVM')],
                            cross_validation=index_dict.GetInstantByIndex('5-Fold'))

    # for total, num in faps.RunWithoutCV(train, store_folder=r'..\..\Demo\db2-1'):
    #     print(total, num)
    for total, num, group in faps.RunWithCV(train, store_folder=r'..\..\Demo\db1'):
        print(total, num, group)
    for total, num in faps.MergeCvResult(store_folder=r'..\..\Demo\db2-1'):
コード例 #15
0
ファイル: PrepareForm.py プロジェクト: ttddtd/FAE
class PrepareConnection(QWidget, Ui_Prepare):
    """Widget that loads a feature table, cleans it, and saves or splits it.

    The widget keeps three pieces of state:

    * ``data_container`` - the feature table currently shown in the preview.
    * a testing reference container - when not empty, the split follows the
      cases of that reference instead of a random percentage.
    * a clinical reference frame - optional, forwarded to the random split
      as ``clinic_df``.

    ``close_signal`` is emitted with ``True`` when the widget is closed.
    """
    close_signal = pyqtSignal(bool)

    # Upper bound of columns rendered in the preview table. Wider tables show
    # the first (limit - 1) columns followed by a '...' placeholder column.
    _MAX_DISPLAY_COLUMNS = 100

    def __init__(self, parent=None):
        """Build the UI and connect every button to its handler."""
        super(PrepareConnection, self).__init__(parent)
        self.setupUi(self)
        self.data_container = DataContainer()
        # Used as the logger name for every error reported by this widget.
        self._filename = os.path.split(__file__)[-1]

        self.buttonLoad.clicked.connect(self.LoadData)
        self.buttonRemoveAndExport.clicked.connect(self.RemoveInvalidValue)

        self.__testing_ref_data_container = DataContainer()
        self.__clinical_ref = pd.DataFrame()

        self.radioSplitRandom.clicked.connect(self.ChangeSeparateMethod)
        self.radioSplitRef.clicked.connect(self.ChangeSeparateMethod)
        self.checkUseClinicRef.clicked.connect(
            self.RandomSeparateButtonUpdates)
        self.loadTestingReference.clicked.connect(
            self.LoadTestingReferenceDataContainer)
        self.clearTestingReference.clicked.connect(
            self.ClearTestingReferenceDataContainer)
        self.loadClinicRef.clicked.connect(self.LoadClinicalRef)
        self.clearClinicRef.clicked.connect(self.ClearClinicalRef)

        self.buttonSave.clicked.connect(self.CheckAndSave)

    def closeEvent(self, QCloseEvent):
        """Notify listeners through ``close_signal`` and accept the close."""
        self.close_signal.emit(True)
        QCloseEvent.accept()

    def _SelectCsvFile(self):
        """Ask the user for a csv file; return its path ('' when cancelled)."""
        file_name, _ = QFileDialog.getOpenFileName(self,
                                                   'Open SCV file',
                                                   filter="csv files (*.csv)")
        return file_name

    def _FocusCell(self, row, column):
        """Move the table cursor to (row, column).

        The edit triggers are switched to ``CurrentChanged`` only for the
        duration of the move and restored afterwards.
        """
        old_edit_triggers = self.tableFeature.editTriggers()
        self.tableFeature.setEditTriggers(QAbstractItemView.CurrentChanged)
        self.tableFeature.setCurrentCell(row, column)
        self.tableFeature.setEditTriggers(old_edit_triggers)

    def _OpenFolder(self, folder_name):
        """Best-effort: show ``folder_name`` in the Windows file explorer.

        The path is passed as a separate argument (no shell), so spaces and
        shell metacharacters in the folder name cannot break or alter the
        command. A missing explorer.exe is logged, not raised, because the
        data has already been written at this point.
        """
        import subprocess
        try:
            subprocess.run(['explorer.exe', os.path.normpath(folder_name)],
                           check=False)
        except OSError as reason:
            eclog(self._filename).GetLogger().error(
                'Open folder Error: {}'.format(reason))

    def UpdateTable(self):
        """Refresh the preview table and the summary text from the data.

        Tables wider than ``_MAX_DISPLAY_COLUMNS`` are truncated: the last
        displayed column is a '...' placeholder. The summary lists the number
        of cases and features and, when a 'label'/'Label' column with exactly
        two distinct values exists, the positive/negative sample counts (the
        larger label value counts as positive).
        """
        frame = self.data_container.GetFrame()
        row_count, total_columns = frame.shape

        header_name = list(frame.columns)
        # Only a table that really exceeds the limit is truncated; a table
        # with exactly the limit number of columns is displayed in full.
        is_truncated = total_columns > self._MAX_DISPLAY_COLUMNS
        if is_truncated:
            header_name = header_name[:self._MAX_DISPLAY_COLUMNS]
            header_name[-1] = '...'
        shown_columns = len(header_name)
        value_columns = shown_columns - 1 if is_truncated else shown_columns

        self.tableFeature.setRowCount(row_count)
        self.tableFeature.setColumnCount(shown_columns)
        self.tableFeature.setHorizontalHeaderLabels(header_name)
        self.tableFeature.setVerticalHeaderLabels(list(map(str, frame.index)))

        for row_index in range(row_count):
            for col_index in range(value_columns):
                self.tableFeature.setItem(
                    row_index, col_index,
                    QTableWidgetItem(str(frame.iloc[row_index, col_index])))
            if is_truncated:
                self.tableFeature.setItem(row_index, shown_columns - 1,
                                          QTableWidgetItem('...'))

        text = "The number of cases: {:d}\n".format(row_count)
        # To process Label temporally: accept either spelling of the column.
        label_name = ''
        for candidate in ('label', 'Label'):
            if candidate in frame.columns:
                label_name = candidate
                break
        feature_number = total_columns - 1 if label_name else total_columns
        text += "The number of features: {:d}\n".format(feature_number)

        if label_name:
            # The builtin int replaces the np.int alias, which no longer
            # exists in current NumPy releases.
            labels = np.asarray(frame[label_name].values, dtype=int)
            if len(np.unique(labels)) == 2:
                positive_number = int(np.sum(labels == np.max(labels)))
                negative_number = len(labels) - positive_number
                text += "The number of positive samples: {:d}\n".format(
                    positive_number)
                text += "The number of negative samples: {:d}\n".format(
                    negative_number)
        self.textInformation.setText(text)

    def SetButtonsState(self, state):
        """Enable or disable every control that needs loaded data."""
        for widget in (self.buttonRemoveAndExport, self.buttonSave,
                       self.checkExport, self.radioRemoveNone,
                       self.radioRemoveNonvalidCases,
                       self.radioRemoveNonvalidFeatures,
                       self.radioSplitRandom, self.radioSplitRef):
            widget.setEnabled(state)

    def LoadData(self):
        """Let the user pick a csv file and load it into ``data_container``.

        On success the preview is refreshed and the dependent controls are
        enabled. ``OSError`` and ``ValueError`` are logged and reported in a
        message box.
        """
        file_name = self._SelectCsvFile()
        if not file_name:
            return
        try:
            if self.data_container.Load(file_name, is_update=True):
                self.UpdateTable()
                self.SetButtonsState(True)
        except OSError as reason:
            eclog(self._filename).GetLogger().error(
                'Load CSV Error: {}'.format(reason))
            QMessageBox.about(self, 'Load data Error', reason.__str__())
            print('Error!' + str(reason))
        except ValueError:
            eclog(self._filename).GetLogger().error(
                'Open CSV Error: {}'.format(file_name))
            QMessageBox.information(self, 'Error',
                                    'The selected data file mismatch.')

    def LoadTestingReferenceDataContainer(self):
        """Let the user pick the csv file used as the testing reference."""
        file_name = self._SelectCsvFile()
        if not file_name:
            return
        try:
            self.__testing_ref_data_container.Load(file_name)
            self.loadTestingReference.setEnabled(False)
            self.clearTestingReference.setEnabled(True)
            self.spinBoxSeparate.setEnabled(False)
        except OSError as reason:
            eclog(self._filename).GetLogger().error(
                'Load Testing Ref Error: {}'.format(reason))
            # Report the failure in a dialog like the other loaders do.
            QMessageBox.information(self, 'Error', 'Can not Open the Files')
            print('Error!' + str(reason))
        except ValueError:
            eclog(self._filename).GetLogger().error(
                'Open CSV Error: {}'.format(file_name))
            QMessageBox.information(self, 'Error',
                                    'The selected data file mismatch.')

    def ClearTestingReferenceDataContainer(self):
        """Drop the testing reference and re-enable its load button."""
        self.__testing_ref_data_container = DataContainer()
        self.loadTestingReference.setEnabled(True)
        self.clearTestingReference.setEnabled(False)
        self.spinBoxSeparate.setEnabled(False)

    def LoadClinicalRef(self):
        """Let the user pick a clinical csv whose index matches the cases.

        The frame is stored only when its index equals the case names of the
        loaded data; a mismatching file is reported and discarded so it can
        never be used by a later split.
        """
        file_name = self._SelectCsvFile()
        if not file_name:
            return None
        try:
            clinical_ref = pd.read_csv(file_name, index_col=0)
            if list(clinical_ref.index) != self.data_container.GetCaseName():
                QMessageBox.information(
                    self, 'Error',
                    'The index of clinical features is not consistent to the data'
                )
                return None
            # Commit only after validation succeeded.
            self.__clinical_ref = clinical_ref
            self.loadClinicRef.setEnabled(False)
            self.clearClinicRef.setEnabled(True)
        except OSError as reason:
            eclog(self._filename).GetLogger().error(
                'Load Clinical Ref Error: {}'.format(reason))
            QMessageBox.information(self, 'Error',
                                    'Can not Open the Files')
        except ValueError:
            eclog(self._filename).GetLogger().error(
                'OpenCSV Error: {}'.format(file_name))
            QMessageBox.information(self, 'Error',
                                    'The selected data file mismatch.')
        return None

    def ClearClinicalRef(self):
        """Drop the clinical reference and re-enable its load button."""
        self.__clinical_ref = pd.DataFrame()
        self.loadClinicRef.setEnabled(True)
        self.clearClinicRef.setEnabled(False)

    def RemoveInvalidValue(self):
        """Apply the selected invalid-value policy and refresh the preview.

        When the export box is checked the user chooses a folder and the
        path ``<folder>/invalid_feature.csv`` is passed to ``RemoveInvalid``
        as ``store_path``. Cancelling that folder dialog aborts the whole
        operation instead of writing into the current working directory.
        """
        if self.data_container.IsEmpty():
            return

        store_path = ''
        if self.checkExport.isChecked():
            folder_name = QFileDialog.getExistingDirectory(
                self, "Save Invalid data")
            if not folder_name:
                return
            store_path = os.path.join(folder_name, 'invalid_feature.csv')

        # The three radio buttons are alternatives: apply the first checked
        # policy only.
        policies = (
            (self.radioRemoveNone, REMOVE_NONE),
            (self.radioRemoveNonvalidCases, REMOVE_CASE),
            (self.radioRemoveNonvalidFeatures, REMOVE_FEATURE),
        )
        for radio_button, remove_index in policies:
            if radio_button.isChecked():
                self.data_container.RemoveInvalid(store_path=store_path,
                                                  remove_index=remove_index)
                break
        self.UpdateTable()

    def ChangeSeparateMethod(self):
        """Enable the controls that belong to the chosen split method."""
        if self.radioSplitRandom.isChecked():
            self.spinBoxSeparate.setEnabled(True)
            self.checkUseClinicRef.setEnabled(True)
            self.loadTestingReference.setEnabled(False)
            self.clearTestingReference.setEnabled(False)
        elif self.radioSplitRef.isChecked():
            self.spinBoxSeparate.setEnabled(False)
            self.checkUseClinicRef.setEnabled(False)
            has_reference = not self.__testing_ref_data_container.IsEmpty()
            self.loadTestingReference.setEnabled(not has_reference)
            self.clearTestingReference.setEnabled(has_reference)
        self.RandomSeparateButtonUpdates()

    def RandomSeparateButtonUpdates(self):
        """Sync the clinical-reference load/clear buttons with the state."""
        use_clinic = self.checkUseClinicRef.isChecked()
        has_clinic = self.__clinical_ref.size > 0
        self.loadClinicRef.setEnabled(use_clinic and not has_clinic)
        self.clearClinicRef.setEnabled(use_clinic and has_clinic)

    def _SplitAndStore(self, folder_name):
        """Split ``data_container`` into ``folder_name`` and open the folder.

        A non-empty testing reference selects the reference based split;
        otherwise the spin box value is passed as the testing percentage,
        together with the clinical frame when one is loaded. Any exception is
        logged and shown to the user.
        """
        data_separate = DataSeparate.DataSeparate()
        try:
            # NOTE(review): a reference loaded earlier is still used when the
            # user has switched back to the random split - confirm intended.
            if self.__testing_ref_data_container.IsEmpty():
                split_options = {'store_folder': folder_name}
                if self.__clinical_ref.size != 0:
                    split_options['clinic_df'] = self.__clinical_ref
                training_data_container, _ = \
                    data_separate.RunByTestingPercentage(
                        self.data_container,
                        self.spinBoxSeparate.value(),
                        **split_options)
            else:
                training_data_container, _ = \
                    data_separate.RunByTestingReference(
                        self.data_container,
                        self.__testing_ref_data_container,
                        folder_name)
                if training_data_container.IsEmpty():
                    QMessageBox.information(
                        self, 'Error',
                        'The testing data does not match, please check the testing data '
                        'really exists in current data')
                    return None
            self._OpenFolder(folder_name)
        except Exception as e:
            content = 'PrepareConnection, splitting failed: '
            eclog(self._filename).GetLogger().error(
                'Split Error:  ' + e.__str__())
            QMessageBox.about(self, content, e.__str__())
        return None

    def CheckAndSave(self):
        """Validate the data, then save it or split it into a folder.

        Validation stops at the first problem found (no data, labels that are
        not binary, invalid numbers), warns the user and, where a position is
        known, moves the table cursor to the offending cell. Valid data is
        passed through ``RemoveSameFeatures`` and then either split (when one
        of the split radio buttons is checked) or saved as a single csv file.
        """
        if self.data_container.IsEmpty():
            QMessageBox.warning(self, "Warning", "There is no data",
                                QMessageBox.Ok)
            return None

        if not self.data_container.IsBinaryLabel():
            QMessageBox.warning(self, "Warning", "There are not 2 Labels",
                                QMessageBox.Ok)
            self._FocusCell(self.data_container.FindInvalidLabelIndex(), 0)
            return None

        if self.data_container.HasInvalidNumber():
            QMessageBox.warning(self, "Warning", "There are nan items",
                                QMessageBox.Ok)
            non_valid_number_index = \
                self.data_container.FindInvalidNumberIndex()
            # The reported column is shifted by one to address the table cell.
            self._FocusCell(non_valid_number_index[0],
                            non_valid_number_index[1] + 1)
            return None

        self.data_container = RemoveSameFeatures().Run(self.data_container)

        if self.radioSplitRandom.isChecked() or self.radioSplitRef.isChecked():
            folder_name = QFileDialog.getExistingDirectory(self, "Save data")
            if folder_name:
                self._SplitAndStore(folder_name)
        else:
            file_name, _ = QFileDialog.getSaveFileName(
                self, "Save data", filter="csv files (*.csv)")
            if file_name:
                self.data_container.Save(file_name)
        return None
コード例 #16
0
                training_index_list.append(index)

        train_data_container = self.__SetNewData(data_container,
                                                 training_index_list)
        test_data_container = self.__SetNewData(data_container,
                                                testing_index_list)

        if store_folder:
            train_data_container.Save(
                os.path.join(store_folder, 'train_numeric_feature.csv'))
            test_data_container.Save(
                os.path.join(store_folder, 'test_numeric_feature.csv'))

        return train_data_container, test_data_container


if __name__ == '__main__':
    # Demo: run the separator on the example feature table, then run it a
    # second time with a training reference container attached.
    source_container = DataContainer()
    source_container.Load(r'..\..\Example\numeric_feature.csv')

    splitter = DataSeparate()
    splitter.Run(source_container,
                 store_folder=r'..\..\Example\separate_test')

    # The reference is read from the folder used by the first run
    # (presumably the training table that run stored there).
    training_reference = DataContainer()
    training_reference.Load(
        r'..\..\Example\separate_test\train_numeric_feature.csv')
    splitter.training_ref_data_container = training_reference

    splitter.Run(source_container,
                 store_folder=r'..\..\Example\separate_test\reload')
コード例 #17
0
            FeatureSelectByPCA()
        ]))

    # Set Classifier List
    classifier_list = []
    classifier_list.append(SVM())
    classifier_list.append(AE(max_iter=1000))
    classifier_list.append(RandomForest())
    classifier_list.append(LDA())

    cv = CrossValidationOnFeatureNumber('5-folder')

    data_container = DataContainer()
    if os.path.exists(r'Example\numeric_feature.csv'):
        data_path = r'Example\numeric_feature.csv'  # Run by Console
    elif os.path.exists(r'numeric_feature.csv'):
        data_path = r'numeric_feature.csv'  # Run by PyCharm
    data_container.Load(data_path)
    data_container.UsualAndL2Normalize()

    fae = FeatureAnalysisExplore(feature_selector_list=feature_selector_list,
                                 classifier_list=classifier_list,
                                 cv=cv,
                                 max_feature_number=20)
    if os.path.exists(r'Result'):
        store_path = r'Result'  # Run By PyCharm
    elif os.path.exists(r'Example\Result'):
        store_path = r'Example\Result'  # Run By Console

    fae.Run(data_container, store_folder=store_path)
コード例 #18
0
                testing_index_list.append(index)
            else:
                training_index_list.append(index)

        train_data_container = self.__SetNewData(data_container,
                                                 training_index_list)
        test_data_container = self.__SetNewData(data_container,
                                                testing_index_list)

        if store_folder:
            train_data_container.Save(
                os.path.join(store_folder, 'train_numeric_feature.csv'))
            test_data_container.Save(
                os.path.join(store_folder, 'test_numeric_feature.csv'))

        return train_data_container, test_data_container


if __name__ == '__main__':
    # Demo: split the simulated feature table with a testing percentage of
    # 0.3, passing the clinical table as `clinic_df`.
    clinic_frame = pd.read_csv(r'..\..\Demo\simulated_clinics.csv',
                               index_col=0)

    feature_container = DataContainer()
    feature_container.Load(r'..\..\Demo\simulated_feature.csv')

    splitter = DataSeparate()
    split_result = splitter.RunByTestingPercentage(feature_container,
                                                   0.3,
                                                   clinic_df=clinic_frame)
    train_part, test_part = split_result

    # Report the array shapes of both parts and the separator's
    # `clinic_split_result` attribute.
    train_shape = train_part.GetArray().shape
    test_shape = test_part.GetArray().shape
    print(train_shape, test_shape)
    print(splitter.clinic_split_result)
コード例 #19
0
ファイル: FeaturePipeline.py プロジェクト: zhongyi80/FAE
        self.__cv.SetClassifier(self.__classifier)
        train_cv_metric, val_cv_metric, test_metric, all_train_metric = self.__cv.Run(raw_train_data_container,
                                                              raw_test_data_conainer,
                                                              store_folder,
                                                              is_hyper_parameter)

        if store_folder:
            self.SavePipeline(len(raw_train_data_container.GetFeatureName()), os.path.join(store_folder, 'pipeline_info.csv'))

        return train_cv_metric, val_cv_metric, test_metric, all_train_metric

if __name__ == '__main__':
    # Demo: run one fixed pipeline configuration on the demo feature tables
    # and print whatever the run yields.
    component_lookup = Index2Dict()

    train_container = DataContainer()
    test_container = DataContainer()
    train_container.Load(
        r'..\..\Demo\zero_center_normalized_training_feature.csv')
    test_container.Load(
        r'..\..\Demo\zero_center_normalized_testing_feature.csv')

    # Each component is looked up by its name; every stage has exactly one
    # candidate here.
    pipeline_options = {
        'balancer': component_lookup.GetInstantByIndex('NoneBalance'),
        'normalizer_list': [component_lookup.GetInstantByIndex('None')],
        'dimension_reduction_list': [component_lookup.GetInstantByIndex('PCC')],
        'feature_selector_list': [component_lookup.GetInstantByIndex('RFE')],
        'feature_selector_num_list': [15],
        'classifier_list': [component_lookup.GetInstantByIndex('LR')],
        'cross_validation': component_lookup.GetInstantByIndex('5-Folder'),
    }
    pipelines = FeatureAnalysisPipelines(**pipeline_options)

    progress_iter = pipelines.Run(train_container,
                                  test_container,
                                  store_folder=r'..\..\Demo\db2-2')
    for progress in progress_iter:
        print(progress)
    print('Done')