예제 #1
0
    def __init__(self,
                 balancer=None,
                 normalizer_list=None,
                 dimension_reduction_list=None,
                 feature_selector_list=None,
                 feature_selector_num_list=None,
                 classifier_list=None,
                 cross_validation=None,
                 is_hyper_parameter=False,
                 logger=None):
        """Store the candidate components of the pipeline and reset metrics.

        The list arguments default to ``None`` rather than to list literals:
        a list (or a ``DimensionReductionByPCC()`` instance) written in the
        signature is created once, when the function is defined, and would
        then be shared by every object built with the defaults.

        :param balancer: stored as ``self.balance``.
        :param normalizer_list: candidate normalizers; ``[NormalizerNone]``
            when omitted.
        :param dimension_reduction_list: candidate dimension reducers; a
            list holding one new ``DimensionReductionByPCC()`` when omitted.
        :param feature_selector_list: candidate feature selectors; an empty
            list when omitted.
        :param feature_selector_num_list: candidate feature counts; an empty
            list when omitted.
        :param classifier_list: candidate classifiers; an empty list when
            omitted.
        :param cross_validation: stored as ``self.cv``.
        :param is_hyper_parameter: stored unchanged on the instance.
        :param logger: stored in a private attribute.
        """
        self.balance = balancer
        self.normalizer_list = (
            [NormalizerNone] if normalizer_list is None else normalizer_list)
        self.dimension_reduction_list = (
            [DimensionReductionByPCC()]
            if dimension_reduction_list is None else dimension_reduction_list)
        self.feature_selector_list = (
            [] if feature_selector_list is None else feature_selector_list)
        self.feature_selector_num_list = (
            [] if feature_selector_num_list is None
            else feature_selector_num_list)
        self.classifier_list = (
            [] if classifier_list is None else classifier_list)
        self.cv = cross_validation
        self.is_hyper_parameter = is_hyper_parameter
        self.__logger = logger
        self.version = VERSION

        # One empty table, with the HEADER columns, per data split.
        self.total_metric = {
            TRAIN: pd.DataFrame(columns=HEADER),
            BALANCE_TRAIN: pd.DataFrame(columns=HEADER),
            TEST: pd.DataFrame(columns=HEADER),
            CV_TRAIN: pd.DataFrame(columns=HEADER),
            CV_VAL: pd.DataFrame(columns=HEADER)
        }

        # Called last, after all the component lists are set
        # (presumably it sizes the AUC storage from them -- confirm).
        self.GenerateAucDict()
예제 #2
0
    def Run(self,
            train_data_container,
            test_data_container=None,
            store_folder='',
            is_hyper_parameter=False):
        """Run every pipeline combination, yielding progress before each run.

        This is a generator. For each combination of normalizer, dimension
        reducer, feature selector, classifier and feature number it first
        yields ``(normalizer name, dimension reducer name, feature selector
        name, feature number, classifier name, running count, total count)``
        and only then builds and runs the corresponding ``OnePipeline``.

        :param train_data_container: training data handed to each pipeline.
        :param test_data_container: optional test data; a new empty
            ``DataContainer`` is used when omitted.
        :param store_folder: folder for the result files. The summary CSV
            files are written only when this names an existing directory.
        :param is_hyper_parameter: forwarded unchanged to ``OnePipeline.Run``.
        """
        # Create the default per call: a `DataContainer()` written in the
        # signature is built once and shared by every call.
        if test_data_container is None:
            test_data_container = DataContainer()

        column_list = [
            'sample_number', 'positive_number', 'negative_number', 'auc',
            'auc 95% CIs', 'auc std', 'accuracy', 'Youden Index',
            'sensitivity', 'specificity', 'positive predictive value',
            'negative predictive value'
        ]
        train_df = pd.DataFrame(columns=column_list)
        val_df = pd.DataFrame(columns=column_list)
        test_df = pd.DataFrame(columns=column_list)
        all_train_df = pd.DataFrame(columns=column_list)

        if self.__normalizer_list == []:
            self.__normalizer_list = [NormalizerNone()]

        if self._dimension_reduction_list == []:
            self._dimension_reduction_list = [DimensionReductionByPCC()]

        self.GenerateMetircDict()
        self.SavePipelineInfo(store_folder)

        num = 0
        total_num = (len(self.__normalizer_list) *
                     len(self._dimension_reduction_list) *
                     len(self.__feature_selector_list) *
                     len(self.__classifier_list) *
                     len(self.__feature_selector_num_list))

        for normalizer_index, normalizer in enumerate(self.__normalizer_list):
            for dimension_reductor_index, dimension_reductor in enumerate(
                    self._dimension_reduction_list):
                for feature_selector_index, feature_selector in enumerate(
                        self.__feature_selector_list):
                    for classifier_index, classifier in enumerate(
                            self.__classifier_list):
                        for feature_num_index, feature_num in enumerate(
                                self.__feature_selector_num_list):
                            num += 1
                            # Report progress before the (slow) run starts.
                            yield (normalizer.GetName(),
                                   dimension_reductor.GetName(),
                                   feature_selector.GetName(), feature_num,
                                   classifier.GetName(), num, total_num)

                            feature_selector.SetSelectedFeatureNumber(
                                feature_num)
                            one_pipeline = OnePipeline(
                                normalizer=normalizer,
                                dimension_reduction=dimension_reductor,
                                feature_selector=feature_selector,
                                classifier=classifier,
                                cross_validation=self.__cross_validation)
                            case_name = one_pipeline.GetStoreName()
                            case_store_folder = os.path.join(
                                store_folder, case_name)
                            (train_cv_metric, val_cv_metric, test_metric,
                             all_train_metric) = one_pipeline.Run(
                                 train_data_container, test_data_container,
                                 case_store_folder, is_hyper_parameter)

                            # Position of this combination in the AUC arrays.
                            cell = (normalizer_index,
                                    dimension_reductor_index,
                                    feature_selector_index,
                                    feature_num_index, classifier_index)
                            cv_results = (
                                ('train', train_df, train_cv_metric),
                                ('all_train', all_train_df,
                                 all_train_metric),
                                ('val', val_df, val_cv_metric),
                            )
                            for prefix, _, metric in cv_results:
                                self.__auc_matrix_dict[prefix][cell] = \
                                    metric[prefix + '_auc']
                                self.__auc_std_matrix_dict[prefix][cell] = \
                                    metric[prefix + '_auc std']

                            if store_folder and os.path.isdir(store_folder):
                                # Each summary file is rewritten in full
                                # after every combination.
                                for prefix, frame, metric in cv_results:
                                    frame.loc[case_name] = [
                                        metric[prefix + '_' + column]
                                        for column in column_list
                                    ]
                                    frame.to_csv(
                                        os.path.join(
                                            store_folder,
                                            prefix + '_result.csv'))

                                # NOTE(review): the test AUC arrays are only
                                # filled when results are being stored --
                                # confirm this is intended.
                                if not test_data_container.IsEmpty():
                                    self.__auc_matrix_dict['test'][cell] = \
                                        test_metric['test_auc']
                                    self.__auc_std_matrix_dict['test'][
                                        cell] = test_metric['test_auc std']

                                    test_df.loc[case_name] = [
                                        test_metric['test_' + column]
                                        for column in column_list
                                    ]
                                    test_df.to_csv(
                                        os.path.join(store_folder,
                                                     'test_result.csv'))

                                self.SaveMetricDict(store_folder)
예제 #3
0
            input_data_container = output
        return output

    def SaveInfo(self, store_folder, all_features):
        """Call ``SaveInfo`` on every selector held by this object, in order.

        :param store_folder: passed unchanged to each selector.
        :param all_features: passed unchanged to each selector.
        """
        for selector in self.__selector_list:
            selector.SaveInfo(store_folder, all_features)

    def SaveDataContainer(self, data_container, store_folder, store_key):
        """Call ``SaveDataContainer`` on every held selector, in order.

        :param data_container: passed unchanged to each selector.
        :param store_folder: passed unchanged to each selector.
        :param store_key: passed unchanged to each selector.
        """
        for selector in self.__selector_list:
            selector.SaveDataContainer(data_container, store_folder,
                                       store_key)


################################################################

if __name__ == '__main__':
    # Manual check: load the demo training features, normalize them, apply
    # PCC-based dimension reduction and Kruskal-Wallis feature selection,
    # printing the array shape after the last two steps.
    import os

    from FAE.DataContainer.DataContainer import DataContainer
    from FAE.FeatureAnalysis.Normalizer import NormalizerZeroCenter
    from FAE.FeatureAnalysis.DimensionReduction import DimensionReductionByPCC

    dc = DataContainer()
    pcc = DimensionReductionByPCC()
    fs = FeatureSelectByKruskalWallis(selected_feature_number=5)

    # Built with os.path.join so the relative path also resolves on
    # platforms whose path separator is not a backslash.
    dc.Load(os.path.join('..', '..', 'Demo', 'train_numeric_feature.csv'))

    dc = NormalizerZeroCenter.Run(dc)
    dc = pcc.Run(dc)
    print(dc.GetArray().shape)
    dc = fs.Run(dc)
    print(dc.GetArray().shape)
예제 #4
0
    def Run(self,
            train_data_container,
            test_data_container=None,
            store_folder='',
            is_hyper_parameter=False):
        """Run every pipeline combination, yielding progress before each run.

        This is a generator. The stages are nested so that each one is run
        once per combination of the stages above it: normalization, then
        dimension reduction, then feature selection for each feature number,
        then cross validation for each classifier. Before each classifier is
        evaluated the generator yields ``(normalizer name, dimension reducer
        name, feature selector name, feature number, classifier name,
        running count, total count)``.

        :param train_data_container: training data fed to the first stage.
        :param test_data_container: optional test data; a new empty
            ``DataContainer`` is used when omitted.
        :param store_folder: parent folder in which one sub-folder per
            combination is created. The summary CSV files are written only
            when this names an existing directory.
        :param is_hyper_parameter: forwarded unchanged to the cross
            validation's ``Run``.
        """
        # Create the default per call: a `DataContainer()` written in the
        # signature is built once and shared by every call.
        if test_data_container is None:
            test_data_container = DataContainer()

        column_list = [
            'sample_number', 'positive_number', 'negative_number', 'auc',
            'auc 95% CIs', 'auc std', 'accuracy', 'Youden Index',
            'sensitivity', 'specificity', 'positive predictive value',
            'negative predictive value'
        ]
        train_df = pd.DataFrame(columns=column_list)
        val_df = pd.DataFrame(columns=column_list)
        test_df = pd.DataFrame(columns=column_list)
        all_train_df = pd.DataFrame(columns=column_list)

        if self.__normalizer_list == []:
            self.__normalizer_list = [NormalizerNone()]

        if self._dimension_reduction_list == []:
            self._dimension_reduction_list = [DimensionReductionByPCC()]

        self.GenerateMetircDict()
        self.SavePipelineInfo(store_folder)

        num = 0
        total_num = (len(self.__normalizer_list) *
                     len(self._dimension_reduction_list) *
                     len(self.__feature_selector_list) *
                     len(self.__classifier_list) *
                     len(self.__feature_selector_num_list))

        for normalizer_index, normalizer in enumerate(self.__normalizer_list):
            normalized_train_data_container = normalizer.Run(
                train_data_container)
            if not test_data_container.IsEmpty():
                normalized_test_data_container = normalizer.Run(
                    test_data_container, is_test=True)
            else:
                normalized_test_data_container = test_data_container

            for dimension_reductor_index, dimension_reductor in enumerate(
                    self._dimension_reduction_list):
                # NOTE(review): a falsy reducer skips this stage here, but
                # its GetName()/SaveInfo() are still called further down.
                if dimension_reductor:
                    dr_train_data_container = dimension_reductor.Run(
                        normalized_train_data_container)
                    if not test_data_container.IsEmpty():
                        dr_test_data_container = dimension_reductor.Transform(
                            normalized_test_data_container)
                    else:
                        dr_test_data_container = normalized_test_data_container
                else:
                    dr_train_data_container = normalized_train_data_container
                    dr_test_data_container = normalized_test_data_container

                for feature_selector_index, feature_selector in enumerate(
                        self.__feature_selector_list):
                    for feature_num_index, feature_num in enumerate(
                            self.__feature_selector_num_list):
                        # NOTE(review): the selector is used before the
                        # truthiness check below -- confirm the check is
                        # still needed.
                        feature_selector.SetSelectedFeatureNumber(feature_num)
                        if feature_selector:
                            fs_train_data_container = feature_selector.Run(
                                dr_train_data_container)
                            if not test_data_container.IsEmpty():
                                # Keep in the test data exactly the features
                                # that were selected on the training data.
                                selected_feature_name = \
                                    fs_train_data_container.GetFeatureName()
                                name_selector = FeatureSelector()
                                fs_test_data_container = \
                                    name_selector.SelectFeatureByName(
                                        dr_test_data_container,
                                        selected_feature_name)
                            else:
                                fs_test_data_container = dr_test_data_container
                        else:
                            fs_train_data_container = dr_train_data_container
                            fs_test_data_container = dr_test_data_container

                        for classifier_index, classifier in enumerate(
                                self.__classifier_list):
                            self.__cross_validation.SetClassifier(classifier)

                            num += 1
                            # Report progress before the (slow) run starts.
                            yield (normalizer.GetName(),
                                   dimension_reductor.GetName(),
                                   feature_selector.GetName(), feature_num,
                                   classifier.GetName(), num, total_num)

                            case_name = self.GetStoreName(
                                normalizer.GetName(),
                                dimension_reductor.GetName(),
                                feature_selector.GetName(), str(feature_num),
                                classifier.GetName())
                            case_store_folder = os.path.join(
                                store_folder, case_name)
                            if not os.path.exists(case_store_folder):
                                os.mkdir(case_store_folder)

                            # Save what each stage produced for this case.
                            normalizer.SaveInfo(
                                case_store_folder,
                                normalized_train_data_container
                                .GetFeatureName())
                            normalizer.SaveNormalDataContainer(
                                normalized_train_data_container,
                                case_store_folder,
                                is_test=False)
                            dimension_reductor.SaveInfo(case_store_folder)
                            dimension_reductor.SaveDataContainer(
                                dr_train_data_container,
                                case_store_folder,
                                is_test=False)
                            feature_selector.SaveInfo(
                                case_store_folder,
                                dr_train_data_container.GetFeatureName())
                            feature_selector.SaveDataContainer(
                                fs_train_data_container,
                                case_store_folder,
                                is_test=False)
                            if not test_data_container.IsEmpty():
                                normalizer.SaveNormalDataContainer(
                                    normalized_test_data_container,
                                    case_store_folder,
                                    is_test=True)
                                dimension_reductor.SaveDataContainer(
                                    dr_test_data_container,
                                    case_store_folder,
                                    is_test=True)
                                feature_selector.SaveDataContainer(
                                    fs_test_data_container,
                                    case_store_folder,
                                    is_test=True)

                            (train_cv_metric, val_cv_metric, test_metric,
                             all_train_metric) = self.__cross_validation.Run(
                                 fs_train_data_container,
                                 fs_test_data_container, case_store_folder,
                                 is_hyper_parameter)

                            self.SaveOnePipeline(
                                os.path.join(case_store_folder,
                                             'pipeline_info.csv'),
                                normalizer.GetName(),
                                dimension_reductor.GetName(),
                                feature_selector.GetName(), feature_num,
                                classifier.GetName(),
                                self.__cross_validation.GetName())

                            # Save Result
                            # Position of this combination in the AUC arrays.
                            cell = (normalizer_index,
                                    dimension_reductor_index,
                                    feature_selector_index,
                                    feature_num_index, classifier_index)
                            cv_results = (
                                ('train', train_df, train_cv_metric),
                                ('all_train', all_train_df,
                                 all_train_metric),
                                ('val', val_df, val_cv_metric),
                            )
                            for prefix, _, metric in cv_results:
                                self.__auc_matrix_dict[prefix][cell] = \
                                    metric[prefix + '_auc']
                                self.__auc_std_matrix_dict[prefix][cell] = \
                                    metric[prefix + '_auc std']

                            if store_folder and os.path.isdir(store_folder):
                                # Each summary file is rewritten in full
                                # after every combination.
                                for prefix, frame, metric in cv_results:
                                    frame.loc[case_name] = [
                                        metric[prefix + '_' + column]
                                        for column in column_list
                                    ]
                                    frame.to_csv(
                                        os.path.join(
                                            store_folder,
                                            prefix + '_result.csv'))

                                # NOTE(review): the test AUC arrays are only
                                # filled when results are being stored --
                                # confirm this is intended.
                                if not test_data_container.IsEmpty():
                                    self.__auc_matrix_dict['test'][cell] = \
                                        test_metric['test_auc']
                                    self.__auc_std_matrix_dict['test'][
                                        cell] = test_metric['test_auc std']

                                    test_df.loc[case_name] = [
                                        test_metric['test_' + column]
                                        for column in column_list
                                    ]
                                    test_df.to_csv(
                                        os.path.join(store_folder,
                                                     'test_result.csv'))

                                self.SaveMetricDict(store_folder)