Example #1
0
def DrawFeatureRelationshipAccordingToCsvFile(file_path,
                                              selected_feature_name_list,
                                              label_name_list,
                                              store_path=''):
    '''
    Draw the relationship among 1, 2 or 3 features stored in a csv feature file.

    The csv file is loaded into a DataContainer, normalized with UsualNormalize, and the columns of
    the selected features are passed to DrawValueRelationship together with the labels.

    :param file_path: the file path of the csv file
    :param selected_feature_name_list: the names of the features that would be drawn (1, 2 or 3 names)
    :param label_name_list: the name of the label. e.g. ['non-cancer', 'cancer']
    :param store_path: The store path, supporting jpg and eps format.
    :return: None. A message is printed and nothing is drawn when the number of selected features is
             not 1, 2 or 3, or when a selected feature does not exist in the data container.
    '''
    # Validate the request before touching the file: only 1, 2 or 3 features can be shown.
    if not 1 <= len(selected_feature_name_list) <= 3:
        print(
            "Please check the length of the feature list. It can only show the relationship of the 1, 2, or 3 features"
        )
        return

    data_container = DataContainer()
    data_container.Load(file_path)
    data_container.UsualNormalize()
    data, label, feature_name, case_name = data_container.GetData()

    # list() keeps the lookup working whether the names come back as a list or as an array.
    all_feature_names = list(feature_name)
    try:
        index = [all_feature_names.index(t) for t in selected_feature_name_list]
    except ValueError:
        # list.index raises ValueError for a name that is not present.
        print('The selected feature is not in the data container.')
        return

    result_data = [data[:, sub_index] for sub_index in index]
    DrawValueRelationship(result_data, selected_feature_name_list, label,
                          label_name_list, store_path)
Example #2
0
def GenerateDescription():
    '''
    Build the description report for one stored pipeline of the example data.

    Loads the example numeric feature csv as the training data, loads the pipeline information of one
    stored pipeline, and runs Description on both with the example report folders.
    '''
    training_csv = r'..\..\Example\numeric_feature.csv'
    pipeline_info_csv = r'C:\MyCode\FAEGitHub\FAE\Example\report_temp\NormUnit_Cos_ANOVA_5_SVM\pipeline_info.csv'
    report_temp_folder = r'..\..\Example\report_temp'
    report_folder = r'..\..\Example\report'

    training_data_container = DataContainer()
    training_data_container.Load(training_csv)

    one_pipeline = OnePipeline()
    one_pipeline.LoadPipeline(pipeline_info_csv)

    Description().Run(training_data_container, one_pipeline,
                      report_temp_folder, report_folder)
Example #3
0
def TestNewData(NewDataCsv, model_folder, result_save_path=''):
    '''
    Apply a stored model to a new feature matrix and estimate its metrics.

    :param NewDataCsv: New radiomics feature matrix csv file path
    :param model_folder: The trained model path, read with LoadTrainInfo
    :param result_save_path: Optional folder. When it is not empty, the metrics, the predictions, the
                             labels and a per-case csv (CaseName, Pred, Label) are stored in it.
    :return: the metric returned by EstimateMetirc for the predictions and the labels
    '''
    train_info = LoadTrainInfo(model_folder)

    # Load the new cases and apply the normalizer stored with the model.
    # NOTE: selecting the stored 'selected_features' by name is currently disabled here.
    new_data_container = DataContainer()
    new_data_container.Load(NewDataCsv)
    new_data_container = train_info['normalizer'].Transform(new_data_container)

    # Predict with the stored classifier; column 1 of predict_proba is used as the prediction.
    train_info['classifier'].SetDataContainer(new_data_container)
    model = train_info['classifier'].GetModel()
    predict = model.predict_proba(new_data_container.GetArray())[:, 1]

    label = new_data_container.GetLabel()
    case_name = new_data_container.GetCaseName()

    # One header row followed by one row per case.
    test_result_info = [['CaseName', 'Pred', 'Label']]
    test_result_info.extend(
        [case_name[row], predict[row], label[row]]
        for row in range(len(label)))

    metric = EstimateMetirc(predict, label)
    info = dict(metric)
    cv = CrossValidation()

    print(metric)
    print('\t')

    if not result_save_path:
        return metric

    cv.SaveResult(info, result_save_path)
    np.save(os.path.join(result_save_path, 'test_predict.npy'), predict)
    np.save(os.path.join(result_save_path, 'test_label.npy'), label)
    info_csv_path = os.path.join(result_save_path, 'test_info.csv')
    with open(info_csv_path, 'w', newline='') as csvfile:
        csv.writer(csvfile).writerows(test_result_info)

    return metric
Example #4
0
            feature_pvalue = self._EstimateDcFeaturePvalue(
                train_dc, test_dc, feature_distribution_type)
            if np.mean(list(feature_pvalue.values())) > mean_pvalue:
                mean_pvalue = np.mean(list(feature_pvalue.values()))
                output_train_dc, output_test_dc = train_dc, test_dc
                output_pvalue = feature_pvalue

        if output_folder is not None and os.path.isdir(output_folder):
            output_train_dc.Save(os.path.join(output_folder, 'train.csv'))
            output_test_dc.Save(os.path.join(output_folder, 'test.csv'))

            pvalue_df = pd.DataFrame(output_pvalue, index=['P Value'])
            distibute_df = pd.DataFrame(feature_distribution_type,
                                        index=['Distribution'])
            store_df = pd.concat((pvalue_df, distibute_df), axis=0)
            store_df.to_csv(os.path.join(output_folder, 'split_info.csv'))


if __name__ == '__main__':
    # Demo: split the simulated features into training and testing parts (testing
    # percentage 0.3), passing the simulated clinical table as clinic_df.
    clinic_table = pd.read_csv(r'..\..\Demo\simulated_clinics.csv', index_col=0)

    feature_container = DataContainer()
    feature_container.Load(r'..\..\Demo\simulated_feature.csv')

    splitter = DataSeparate()
    train_part, test_part = splitter.RunByTestingPercentage(
        feature_container, 0.3, clinic_df=clinic_table)

    # Show the array shapes of both parts and the recorded clinical split result.
    print(train_part.GetArray().shape, test_part.GetArray().shape)
    print(splitter.clinic_split_result)
Example #5
0
                                                 model_name)

        self.total_metric[CV_TRAIN].to_csv(
            os.path.join(store_folder, '{}_results.csv'.format(CV_TRAIN)))
        self.total_metric[CV_VAL].to_csv(
            os.path.join(store_folder, '{}_results.csv'.format(CV_VAL)))


if __name__ == '__main__':
    manager = PipelinesManager()

    index_dict = Index2Dict()

    train = DataContainer()
    test = DataContainer()
    train.Load(r'C:\Users\yangs\Desktop\train_numeric_feature.csv')
    test.Load(r'C:\Users\yangs\Desktop\test_numeric_feature.csv')

    faps = PipelinesManager(
        balancer=index_dict.GetInstantByIndex('UpSampling'),
        normalizer_list=[index_dict.GetInstantByIndex('Mean')],
        dimension_reduction_list=[index_dict.GetInstantByIndex('PCC')],
        feature_selector_list=[index_dict.GetInstantByIndex('ANOVA')],
        feature_selector_num_list=list(np.arange(1, 18)),
        classifier_list=[index_dict.GetInstantByIndex('SVM')],
        cross_validation=index_dict.GetInstantByIndex('5-Fold'))

    # for total, num in faps.RunWithoutCV(train, store_folder=r'..\..\Demo\db2-1'):
    #     print(total, num)
    for total, num, group in faps.RunWithCV(train,
                                            store_folder=r'..\..\Demo\db1'):
Example #6
0
            input_data_container = output
        return output

    def SaveInfo(self, store_folder, all_features):
        '''
        Forward SaveInfo to every selector held in the selector list, in order.

        :param store_folder: passed unchanged to each selector
        :param all_features: passed unchanged to each selector
        '''
        for selector in self.__selector_list:
            selector.SaveInfo(store_folder, all_features)

    def SaveDataContainer(self, data_container, store_folder, store_key):
        '''
        Forward SaveDataContainer to every selector held in the selector list, in order.

        :param data_container: passed unchanged to each selector
        :param store_folder: passed unchanged to each selector
        :param store_key: passed unchanged to each selector
        '''
        for selector in self.__selector_list:
            selector.SaveDataContainer(data_container, store_folder, store_key)


################################################################

if __name__ == '__main__':
    from BC.DataContainer.DataContainer import DataContainer
    from BC.FeatureAnalysis.Normalizer import NormalizerZeroCenter
    from BC.FeatureAnalysis.DimensionReduction import DimensionReductionByPCC

    # Demo: normalize, reduce by PCC, then keep 5 features with the Kruskal-Wallis selector.
    container = DataContainer()
    pcc_reducer = DimensionReductionByPCC()
    kw_selector = FeatureSelectByKruskalWallis(selected_feature_number=5)

    container.Load(r'..\..\Demo\train_numeric_feature.csv')

    container = NormalizerZeroCenter.Run(container)
    container = pcc_reducer.Run(container)
    # Array shape before the feature selection ...
    print(container.GetArray().shape)
    container = kw_selector.Run(container)
    # ... and after it.
    print(container.GetArray().shape)
Example #7
0
            vif_dict[exog] = vif

            # calculate tolerance
            tolerance = 1 - r_squared
            tolerance_dict[exog] = tolerance

        # return VIF DataFrame
        df_vif = pd.DataFrame({'VIF': vif_dict, 'Tolerance': tolerance_dict})

        return df_vif


if __name__ == '__main__':
    # Demo: compute the VIF table of the zero-center normalized training features.
    data_path = r'..\..\Demo\train_numeric_feature.csv'
    from BC.DataContainer.DataContainer import DataContainer
    from BC.FeatureAnalysis.Normalizer import NormalizerZeroCenter
    pca = DimensionReductionByPCA()

    container = DataContainer()
    container.Load(data_path)
    container = NormalizerZeroCenter.Run(container)
    # container = pca.Run(container)

    # Frame with one row per case and one column per feature.
    feature_df = pd.DataFrame(container.GetArray(),
                              index=container.GetCaseName(),
                              columns=container.GetFeatureName())
    vif_reducer = DimensionReductionByVIF()

    vif_df = vif_reducer.CalculateVIF(feature_df)

    print(container.GetArray().shape, vif_df.shape)
Example #8
0
        if output_folder is not None and os.path.isdir(output_folder):
            output_train_dc.Save(os.path.join(output_folder, 'train.csv'))
            output_test_dc.Save(os.path.join(output_folder, 'test.csv'))

            p_value_df = pd.DataFrame(output_p_value, index=['P Value'])
            distribute_df = pd.DataFrame(feature_distribution_type,
                                         index=['Distribution'])
            store_df = pd.concat((p_value_df, distribute_df), axis=0)
            store_df.to_csv(os.path.join(output_folder, 'split_info.csv'))


if __name__ == '__main__':
    # Earlier demo (disabled): split by testing percentage with DataSeparate.
    # clinics = pd.read_csv(r'..\..\Demo\simulated_clinics.csv', index_col=0)
    # container = DataContainer()
    # container.Load(r'..\..\Demo\simulated_feature.csv')
    #
    # separator = DataSeparate()
    # train, test = separator.RunByTestingPercentage(container, 0.3, clinic_df=clinics)
    #
    # print(train.GetArray().shape, test.GetArray().shape)
    # print(separator.clinic_split_result)

    # Current demo: split by feature cluster and store the visualizations.
    cluster_split = DataSplitterByFeatureCluster()

    container = DataContainer()
    container.Load(r'.\all_feature.csv')

    output_path = r'.\output'
    clinical_path = r'.\clinical.csv'

    cluster_split.VisualizePartsVariance(container, store_folder=output_path)
    cluster_split.Run(container, output_path, clinical_feature=clinical_path)
    for plot_dimension in ('2d', '3d'):
        cluster_split.VisualizeCluster(dimension=plot_dimension,
                                       store_folder=output_path)
Example #9
0
class PrepareConnection(QWidget, Ui_Prepare):
    '''
    Widget for preparing a feature csv file.

    It loads a csv file into a DataContainer, shows it in a table, removes invalid values on request,
    and saves the data either as one csv file or split into training / testing parts (randomly,
    optionally with a clinical reference table, or according to a testing reference data container).
    '''
    # Emitted with True when the widget is closed.
    close_signal = pyqtSignal(bool)

    def __init__(self, parent=None):
        super(PrepareConnection, self).__init__(parent)
        self.setupUi(self)
        self.data_container = DataContainer()
        # File name of this module, used as the logger name.
        self._filename = os.path.split(__file__)[-1]

        self.buttonLoad.clicked.connect(self.LoadData)
        self.buttonRemoveAndExport.clicked.connect(self.RemoveInvalidValue)

        self.__testing_ref_data_container = DataContainer()
        self.__clinical_ref = pd.DataFrame()

        self.radioSplitRandom.clicked.connect(self.ChangeSeparateMethod)
        self.radioSplitRef.clicked.connect(self.ChangeSeparateMethod)
        self.checkUseClinicRef.clicked.connect(
            self.RandomSeparateButtonUpdates)
        self.loadTestingReference.clicked.connect(
            self.LoadTestingReferenceDataContainer)
        self.clearTestingReference.clicked.connect(
            self.ClearTestingReferenceDataContainer)
        self.loadClinicRef.clicked.connect(self.LoadClinicalRef)
        self.clearClinicRef.clicked.connect(self.ClearClinicalRef)

        self.buttonSave.clicked.connect(self.CheckAndSave)

    def closeEvent(self, QCloseEvent):
        '''Emit close_signal(True) and accept the close event.'''
        self.close_signal.emit(True)
        QCloseEvent.accept()

    def UpdateTable(self):
        '''
        Refresh the feature table and the information text from the current data frame.

        At most 100 columns are shown; when the frame has 100 or more columns the 100th shown column
        is replaced by a '...' placeholder.
        '''
        # Fetch the frame once instead of once per table cell.
        frame = self.data_container.GetFrame()
        case_number, column_number = frame.shape[0], frame.shape[1]

        self.tableFeature.setRowCount(case_number)
        header_name = deepcopy(list(frame.columns))

        min_col = min(len(header_name), 100)
        if min_col == 100:
            header_name = header_name[:100]
            header_name[-1] = '...'

        self.tableFeature.setColumnCount(min_col)
        self.tableFeature.setHorizontalHeaderLabels(header_name)
        self.tableFeature.setVerticalHeaderLabels(list(map(str, frame.index)))

        for row_index in range(case_number):
            for col_index in range(min_col):
                if col_index < 99:
                    cell_text = str(frame.iloc[row_index, col_index])
                else:
                    cell_text = '...'
                self.tableFeature.setItem(row_index, col_index,
                                          QTableWidgetItem(cell_text))

        text = "The number of cases: {:d}\n".format(case_number)
        # To process Label temporally: the label column may be named 'label' or 'Label'.
        label_name = ''
        for candidate in ('label', 'Label'):
            if candidate in frame.columns:
                label_name = candidate
                break
        # The label column, when present, is not counted as a feature.
        feature_number = column_number - 1 if label_name else column_number
        text += "The number of features: {:d}\n".format(feature_number)

        if label_name:
            # The builtin int replaces the np.int alias, which was removed from NumPy.
            labels = np.asarray(frame[label_name].values, dtype=int)
            if len(np.unique(labels)) == 2:
                # The larger of the two label values is counted as positive.
                positive_number = int(
                    np.count_nonzero(labels == np.max(labels)))
                negative_number = len(labels) - positive_number
                text += "The number of positive samples: {:d}\n".format(
                    positive_number)
                text += "The number of negative samples: {:d}\n".format(
                    negative_number)
        self.textInformation.setText(text)

    def SetButtonsState(self, state):
        '''Enable or disable the controls that need loaded data.'''
        self.buttonRemoveAndExport.setEnabled(state)
        self.buttonSave.setEnabled(state)
        self.checkExport.setEnabled(state)
        self.radioRemoveNone.setEnabled(state)
        self.radioRemoveNonvalidCases.setEnabled(state)
        self.radioRemoveNonvalidFeatures.setEnabled(state)
        self.radioSplitRandom.setEnabled(state)
        self.radioSplitRef.setEnabled(state)

    def LoadData(self):
        '''Ask for a csv file, load it into the data container and refresh the table.'''
        dlg = QFileDialog()
        file_name, _ = dlg.getOpenFileName(self,
                                           'Open SCV file',
                                           filter="csv files (*.csv)")
        if file_name:
            try:
                if self.data_container.Load(file_name, is_update=False):
                    self.UpdateTable()
                    self.SetButtonsState(True)

            except OSError as reason:
                eclog(self._filename).GetLogger().error(
                    'Load CSV Error: {}'.format(reason))
                QMessageBox.about(self, 'Load data Error', reason.__str__())
                print('Error!' + str(reason))
            except ValueError:
                eclog(self._filename).GetLogger().error(
                    'Open CSV Error: {}'.format(file_name))
                QMessageBox.information(self, 'Error',
                                        'The selected data file mismatch.')

    def LoadTestingReferenceDataContainer(self):
        '''Ask for a csv file and load it as the testing reference data container.'''
        dlg = QFileDialog()
        file_name, _ = dlg.getOpenFileName(self,
                                           'Open SCV file',
                                           filter="csv files (*.csv)")
        if file_name:
            try:
                self.__testing_ref_data_container.Load(file_name)
                self.loadTestingReference.setEnabled(False)
                self.clearTestingReference.setEnabled(True)
                self.spinBoxSeparate.setEnabled(False)
            except OSError as reason:
                eclog(self._filename).GetLogger().error(
                    'Load Testing Ref Error: {}'.format(reason))
                print('Error!' + str(reason))
            except ValueError:
                eclog(self._filename).GetLogger().error(
                    'Open CSV Error: {}'.format(file_name))
                QMessageBox.information(self, 'Error',
                                        'The selected data file mismatch.')

    def ClearTestingReferenceDataContainer(self):
        '''Drop the testing reference data container and reset the related buttons.'''
        self.__testing_ref_data_container = DataContainer()
        self.loadTestingReference.setEnabled(True)
        self.clearTestingReference.setEnabled(False)
        self.spinBoxSeparate.setEnabled(False)

    def LoadClinicalRef(self):
        '''
        Ask for a csv file and use it as the clinical reference table.

        The table is only stored when its index equals the index of the loaded data frame; a
        mismatching table is rejected and the previous clinical reference is kept.
        '''
        dlg = QFileDialog()
        file_name, _ = dlg.getOpenFileName(self,
                                           'Open SCV file',
                                           filter="csv files (*.csv)")
        if file_name:
            try:
                # Read into a local first so a rejected table never replaces the stored one.
                clinical_ref = pd.read_csv(file_name, index_col=0)
                if list(clinical_ref.index) != list(
                        self.data_container.GetFrame().index):
                    QMessageBox.information(
                        self, 'Error',
                        'The index of clinical features is not consistent to the data'
                    )
                    return None
                self.__clinical_ref = clinical_ref
                self.loadClinicRef.setEnabled(False)
                self.clearClinicRef.setEnabled(True)
            except OSError as reason:
                eclog(self._filename).GetLogger().error(
                    'Load Clinical Ref Error: {}'.format(reason))
                QMessageBox.information(self, 'Error',
                                        'Can not Open the Files')
            except ValueError:
                eclog(self._filename).GetLogger().error(
                    'OpenCSV Error: {}'.format(file_name))
                QMessageBox.information(self, 'Error',
                                        'The selected data file mismatch.')
            return None

    def ClearClinicalRef(self):
        '''Drop the clinical reference table and reset the related buttons.'''
        self.__clinical_ref = pd.DataFrame()
        self.loadClinicRef.setEnabled(True)
        self.clearClinicRef.setEnabled(False)

    def RemoveInvalidValue(self):
        '''
        Remove invalid cases or invalid features, depending on the checked radio button.

        When the export check box is checked, a save path is asked for and passed to RemoveInvalid
        as store_path; otherwise an empty store_path is passed.
        '''
        if not self.data_container.IsEmpty():
            if self.checkExport.isChecked():
                dlg = QFileDialog()
                store_path, _ = dlg.getSaveFileName(self,
                                                    'Save CSV feature files',
                                                    'features.csv',
                                                    filter="CSV files (*.csv)")
            else:
                store_path = ''

            if self.radioRemoveNonvalidCases.isChecked():
                self.data_container.RemoveInvalid(store_path=store_path,
                                                  remove_index=REMOVE_CASE)
            elif self.radioRemoveNonvalidFeatures.isChecked():
                self.data_container.RemoveInvalid(store_path=store_path,
                                                  remove_index=REMOVE_FEATURE)
            self.UpdateTable()

    def ChangeSeparateMethod(self):
        '''Enable the controls that belong to the checked split method.'''
        if self.radioSplitRandom.isChecked():
            self.spinBoxSeparate.setEnabled(True)
            self.checkUseClinicRef.setEnabled(True)
            self.loadTestingReference.setEnabled(False)
            self.clearTestingReference.setEnabled(False)
        elif self.radioSplitRef.isChecked():
            self.spinBoxSeparate.setEnabled(False)
            self.checkUseClinicRef.setEnabled(False)
            # Only one of load / clear is available, depending on whether a reference is loaded.
            has_reference = not self.__testing_ref_data_container.IsEmpty()
            self.loadTestingReference.setEnabled(not has_reference)
            self.clearTestingReference.setEnabled(has_reference)
        self.RandomSeparateButtonUpdates()

    def RandomSeparateButtonUpdates(self):
        '''Enable the load / clear buttons of the clinical reference according to the current state.'''
        if self.checkUseClinicRef.isChecked():
            has_clinical_ref = self.__clinical_ref.size > 0
            self.loadClinicRef.setEnabled(not has_clinical_ref)
            self.clearClinicRef.setEnabled(has_clinical_ref)
        else:
            self.loadClinicRef.setEnabled(False)
            self.clearClinicRef.setEnabled(False)

    def _OpenFolderInExplorer(self, folder_name):
        '''Open folder_name with explorer.exe; a failure to start it is logged, not raised.'''
        import subprocess
        try:
            # An argument list (no shell) keeps folder names with spaces or shell
            # metacharacters intact.
            subprocess.run(['explorer.exe', os.path.normpath(folder_name)])
        except OSError as reason:
            # The data has already been stored at this point, so this is not a split failure.
            eclog(self._filename).GetLogger().error(
                'Open folder Error: {}'.format(reason))

    def CheckAndSave(self):
        '''
        Check the loaded data and save it.

        The data must be non-empty, free of invalid numbers and have a binary label. Features
        are passed through RemoveSameFeatures. If a split method is checked, the data is split
        into a chosen folder; otherwise it is saved as one csv file.
        '''
        if self.data_container.IsEmpty():
            QMessageBox.warning(self, "Warning", "There is no data",
                                QMessageBox.Ok)
            return None

        if self.data_container.HasInvalidNumber():
            QMessageBox.warning(self, "Warning", "There are nan items",
                                QMessageBox.Ok)
            non_valid_number_index = self.data_container.FindInvalidNumberIndex(
            )
            # Move the table cursor to the reported invalid cell, then restore the edit triggers.
            old_edit_triggers = self.tableFeature.editTriggers()
            self.tableFeature.setEditTriggers(QAbstractItemView.CurrentChanged)
            self.tableFeature.setCurrentCell(non_valid_number_index[0],
                                             non_valid_number_index[1])
            self.tableFeature.setEditTriggers(old_edit_triggers)
            return None

        self.data_container.UpdateDataByFrame()

        if not self.data_container.IsBinaryLabel():
            QMessageBox.warning(self, "Warning", "There are not 2 Labels",
                                QMessageBox.Ok)
            return None

        remove_features_with_same_value = RemoveSameFeatures()
        self.data_container = remove_features_with_same_value.Run(
            self.data_container)

        if self.radioSplitRandom.isChecked() or self.radioSplitRef.isChecked():
            folder_name = QFileDialog.getExistingDirectory(self, "Save data")
            if folder_name != '':
                data_separate = DataSeparate.DataSeparate()
                try:
                    if self.__testing_ref_data_container.IsEmpty():
                        # No testing reference: split by the testing percentage of the spin box.
                        testing_data_percentage = self.spinBoxSeparate.value()
                        if self.__clinical_ref.size == 0:
                            training_data_container, _, = \
                                data_separate.RunByTestingPercentage(self.data_container,
                                                                     testing_data_percentage,
                                                                     store_folder=folder_name)
                        else:
                            training_data_container, _, = \
                                data_separate.RunByTestingPercentage(self.data_container,
                                                                     testing_data_percentage,
                                                                     clinic_df=self.__clinical_ref,
                                                                     store_folder=folder_name)
                    else:
                        training_data_container, _, = \
                            data_separate.RunByTestingReference(self.data_container,
                                                                self.__testing_ref_data_container,
                                                                folder_name)
                        if training_data_container.IsEmpty():
                            QMessageBox.information(
                                self, 'Error',
                                'The testing data does not match, please check the testing data '
                                'really exists in current data')
                            return None
                    self._OpenFolderInExplorer(folder_name)
                except Exception as e:
                    content = 'PrepareConnection, splitting failed: '
                    eclog(self._filename).GetLogger().error('Split Error:  ' +
                                                            e.__str__())
                    QMessageBox.about(self, content, e.__str__())

        else:
            file_name, _ = QFileDialog.getSaveFileName(
                self, "Save data", filter="csv files (*.csv)")
            if file_name:
                self.data_container.Save(file_name)
Example #10
0
            raw_train_data_container, raw_test_data_conainer, store_folder,
            is_hyper_parameter)

        if store_folder:
            self.SavePipeline(len(raw_train_data_container.GetFeatureName()),
                              os.path.join(store_folder, 'pipeline_info.csv'))

        return train_cv_metric, val_cv_metric, test_metric, all_train_metric


if __name__ == '__main__':
    # Demo: run one pipeline combination on the zero-center normalized demo features.
    index_dict = Index2Dict()

    train = DataContainer()
    train.Load(r'..\..\Demo\zero_center_normalized_training_feature.csv')
    test = DataContainer()
    test.Load(r'..\..\Demo\zero_center_normalized_testing_feature.csv')

    get_instance = index_dict.GetInstantByIndex
    faps = FeatureAnalysisPipelines(
        balancer=get_instance('NoneBalance'),
        normalizer_list=[get_instance('None')],
        dimension_reduction_list=[get_instance('PCC')],
        feature_selector_list=[get_instance('RFE')],
        feature_selector_num_list=[15],
        classifier_list=[get_instance('LR')],
        cross_validation=get_instance('5-Folder'))

    # Print every item produced by the run.
    for progress in faps.Run(train, test, store_folder=r'..\..\Demo\db2-2'):
        print(progress)
    print('Done')
Example #11
0
        return "To Remove the unbalance of the training data set, we applied an Tomek link after the " \
               "Synthetic Minority Oversampling TEchnique (SMOTE) to make positive/negative samples balance. "

    def Run(self, data_container, store_path=''):
        '''
        Resample the data of data_container with the balancing model and return a new container.

        :param data_container: the container whose data and label are resampled
        :param store_path: optional; a folder (the file '<name>_features.csv' is stored in it) or a
                           file path. Nothing is stored when it is empty.
        :return: a new DataContainer with the resampled data and label, the feature names of
                 data_container, and generated case names 'Generate0', 'Generate1', ...
        '''
        data, label, _, _ = data_container.GetData()

        # NOTE(review): self._model is presumably an imbalanced-learn sampler - confirm. Those
        # samplers renamed fit_sample to fit_resample, so use the new name when the model has it
        # and keep the old name as fallback.
        resample = getattr(self._model, 'fit_resample', None)
        if resample is None:
            resample = self._model.fit_sample
        data_resampled, label_resampled = resample(data, label)

        # The resampled rows no longer map to the original cases, so new case names are generated.
        new_case_name = [
            'Generate' + str(index) for index in range(data_resampled.shape[0])
        ]
        new_data_container = DataContainer(data_resampled, label_resampled,
                                           data_container.GetFeatureName(),
                                           new_case_name)
        if store_path != '':
            if os.path.isdir(store_path):
                new_data_container.Save(
                    os.path.join(store_path,
                                 '{}_features.csv'.format(self._name)))
            else:
                new_data_container.Save(store_path)
        return new_data_container


if __name__ == '__main__':
    # Demo: print the array shape and the label sum before and after SmoteTomekSampling.
    original = DataContainer()
    original.Load(r'..\..\Example\numeric_feature.csv')
    print(original.GetArray().shape, np.sum(original.GetLabel()))

    balancer = SmoteTomekSampling()
    balanced = balancer.Run(original)
    print(balanced.GetArray().shape, np.sum(balanced.GetLabel()))
Example #12
0
'''

import pandas as pd
from copy import deepcopy
from BC.DataContainer.DataContainer import DataContainer


class FeatureEncodingOneHot():
    '''One-hot encode selected feature columns of a DataContainer.'''

    def __init__(self):
        pass

    def OneHotOneColumn(self, data_container, feature_list):
        '''
        Replace the given feature columns by their dummy (one-hot) columns.

        :param data_container: the container providing the frame and the feature names
        :param feature_list: the names of the columns to encode; each must be one of the feature
                             names of data_container (checked with assert)
        :return: a new DataContainer whose frame is the result of pd.get_dummies
        '''
        frame = data_container.GetFrame()
        known_features = data_container.GetFeatureName()
        for candidate in feature_list:
            assert candidate in known_features

        encoded_frame = pd.get_dummies(frame, columns=feature_list)
        encoded_container = DataContainer()
        encoded_container.SetFrame(encoded_frame)
        return encoded_container


if __name__ == '__main__':
    # Demo: one-hot encode five columns of a local csv file and store the result.
    columns_to_encode = ['bGs', 'PIRADS', 't2score', 'DWIscore', 'MR_stage']

    source = DataContainer()
    source.Load(r'c:\Users\yangs\Desktop\test.csv')
    source_frame = source.GetFrame()

    encoded_frame = pd.get_dummies(source_frame, columns=columns_to_encode)
    encoded_frame.to_csv(r'c:\Users\yangs\Desktop\test_onehot.csv')