Ejemplo n.º 1
0
    def VisualizePartsVariance(self, dc: DataContainer, max_k=None, method='SSE',
                               store_folder=None, is_show=True):
        """Skeleton for plotting a clustering score over a range of cluster counts.

        The per-k scoring is not implemented yet: the loop below only walks the
        candidate k values, so no score is collected and nothing is plotted
        before the current figure is saved and/or shown.

        :param dc: container whose array, transposed, is the data to cluster
        :param max_k: exclusive upper bound of the k range; when None it
            defaults to min(row count of the transposed array, 50)
        :param method: 'SSE' or 'SC'. SC denotes Silhouette Coefficient; SSE
            presumably denotes sum of squared errors -- TODO confirm
        :param store_folder: existing directory that receives
            'ClusteringPlot.jpg'; skipped when empty or not a directory
        :param is_show: when true, plt.show() is called at the end
        """
        # TODO: Normalize the train_data
        samples = dc.GetArray().transpose()

        upper_k = min(samples.shape[0], 50) if max_k is None else max_k

        assert method in ('SSE', 'SC')

        # TODO: plot
        score = []
        for k in range(2, upper_k):
            # Neither scoring branch is implemented yet.
            if method == 'SC':
                pass
            elif method == 'SSE':
                pass

        if store_folder:
            if os.path.isdir(store_folder):
                plt.savefig(os.path.join(store_folder, 'ClusteringPlot.jpg'))

        if is_show:
            plt.show()
Ejemplo n.º 2
0
    def VisualizePartsVariance(self,
                               dc: DataContainer,
                               max_k=None,
                               method='SSE',
                               store_folder=None,
                               is_show=True):
        """Plot a KMeans clustering score against the number of clusters k.

        For every k in range(2, max_k) a KMeans estimator is fitted on the
        pre-processed data and one score is recorded; the scores are then
        drawn as a line plot with k on the x axis.

        :param dc: data container; pre-processed through self._DataPreProcess
        :param max_k: exclusive upper bound of the k range; when None it
            defaults to min(dc.GetArray().shape[0], 50)
        :param method: 'SSE' records the estimator's inertia_, 'SC' records
            the silhouette score (euclidean metric) of the fitted labels
        :param store_folder: existing directory that receives
            'ClusteringParameterPlot.jpg'; skipped when empty or not a directory
        :param is_show: when true, plt.show() is called at the end
        """
        raw_array = dc.GetArray()  # only its row count is used, for the default bound
        processed_data = self._DataPreProcess(dc)

        # NOTE(review): the default bound is taken from the raw array while the
        # clustering runs on processed_data -- confirm both have the same rows.
        upper_k = min(raw_array.shape[0], 50) if max_k is None else max_k

        assert method in ('SSE', 'SC')

        candidate_ks = range(2, upper_k)
        scores = []
        for k in candidate_ks:
            print('make cluster k=', k)
            estimator = KMeans(n_clusters=k)
            estimator.fit(processed_data)
            if method == 'SC':
                scores.append(
                    silhouette_score(processed_data, estimator.labels_, metric='euclidean'))
            elif method == 'SSE':
                scores.append(estimator.inertia_)

        plt.xlabel('k')
        plt.ylabel(method)
        plt.plot(candidate_ks, scores, 'o-')

        if store_folder:
            if os.path.isdir(store_folder):
                plt.savefig(os.path.join(store_folder, 'ClusteringParameterPlot.jpg'))

        if is_show:
            plt.show()
Ejemplo n.º 3
0
def TestNewData(NewDataCsv, model_folder, result_save_path=''):
    '''
    Apply a previously trained model to a new feature matrix and evaluate it.

    :param NewDataCsv: New radiomics feature matrix csv file path
    :param model_folder: The trained model path
    :param result_save_path: Folder receiving the metric, the prediction and
        label arrays and a per-case csv; nothing is written when it is empty
    :return: the metric returned by EstimateMetirc for the predictions and labels
    '''
    train_info = LoadTrainInfo(model_folder)

    # Load the new cases, then apply the normalizer stored with the model.
    container = DataContainer()
    container.Load(NewDataCsv)

    # NOTE: restricting the container to train_info['selected_features']
    # (through FeatureSelector.SelectFeatureByName, or by subsetting the
    # frame) is disabled in this function.
    container = train_info['normalizer'].Transform(container)

    # Model: column 1 of predict_proba is used as the prediction.
    train_info['classifier'].SetDataContainer(container)
    model = train_info['classifier'].GetModel()
    predict = model.predict_proba(container.GetArray())[:, 1]

    label = container.GetLabel()
    case_name = container.GetCaseName()

    # Header row followed by one row per labelled case.
    test_result_info = [['CaseName', 'Pred', 'Label']]
    test_result_info.extend(
        [case_name[index], predict[index], label[index]]
        for index in range(len(label)))

    metric = EstimateMetirc(predict, label)
    info = dict(metric)
    cv = CrossValidation()

    print(metric)
    print('\t')

    if not result_save_path:
        return metric

    cv.SaveResult(info, result_save_path)
    np.save(os.path.join(result_save_path, 'test_predict.npy'), predict)
    np.save(os.path.join(result_save_path, 'test_label.npy'), label)
    info_csv_path = os.path.join(result_save_path, 'test_info.csv')
    with open(info_csv_path, 'w', newline='') as csvfile:
        csv.writer(csvfile).writerows(test_result_info)

    return metric
Ejemplo n.º 4
0
            input_data_container = output
        return output

    def SaveInfo(self, store_folder, all_features):
        """Forward SaveInfo to every selector in the pipeline, in list order.

        :param store_folder: passed unchanged to each selector's SaveInfo
        :param all_features: passed unchanged to each selector's SaveInfo
        """
        for selector in self.__selector_list:
            selector.SaveInfo(store_folder, all_features)

    def SaveDataContainer(self, data_container, store_folder, store_key):
        """Forward SaveDataContainer to every selector in the pipeline, in list order.

        :param data_container: passed unchanged to each selector
        :param store_folder: passed unchanged to each selector
        :param store_key: passed unchanged to each selector
        """
        for selector in self.__selector_list:
            selector.SaveDataContainer(data_container, store_folder, store_key)


################################################################

if __name__ == '__main__':
    # Manual smoke test: load the demo features, zero-center them, reduce
    # them with PCC, then keep 5 features with the Kruskal-Wallis selector,
    # printing the array shape before and after the selection.
    import os

    from BC.DataContainer.DataContainer import DataContainer
    from BC.FeatureAnalysis.Normalizer import NormalizerZeroCenter
    from BC.FeatureAnalysis.DimensionReduction import DimensionReductionByPCC

    dc = DataContainer()
    pcc = DimensionReductionByPCC()
    fs = FeatureSelectByKruskalWallis(selected_feature_number=5)

    # Build the relative path with os.path.join so the demo also resolves on
    # POSIX systems; on Windows this yields the same '..\..\Demo\...' string.
    dc.Load(os.path.join('..', '..', 'Demo', 'train_numeric_feature.csv'))

    dc = NormalizerZeroCenter.Run(dc)
    dc = pcc.Run(dc)
    print(dc.GetArray().shape)
    dc = fs.Run(dc)
    print(dc.GetArray().shape)
Ejemplo n.º 5
0
            vif_dict[exog] = vif

            # calculate tolerance
            tolerance = 1 - r_squared
            tolerance_dict[exog] = tolerance

        # return VIF DataFrame
        df_vif = pd.DataFrame({'VIF': vif_dict, 'Tolerance': tolerance_dict})

        return df_vif


if __name__ == '__main__':
    # Manual smoke test: load the demo features, zero-center them, wrap them
    # in a DataFrame and print the array shape next to the shape of the
    # CalculateVIF result.
    import os

    # Build the relative path with os.path.join so the demo also resolves on
    # POSIX systems; on Windows this yields the same '..\..\Demo\...' string.
    data_path = os.path.join('..', '..', 'Demo', 'train_numeric_feature.csv')
    from BC.DataContainer.DataContainer import DataContainer
    from BC.FeatureAnalysis.Normalizer import NormalizerZeroCenter
    pca = DimensionReductionByPCA()

    dc = DataContainer()
    dc.Load(data_path)
    dc = NormalizerZeroCenter.Run(dc)
    # dc = pca.Run(dc)

    # Rows are indexed by case name, columns by feature name.
    df = pd.DataFrame(dc.GetArray(),
                      index=dc.GetCaseName(),
                      columns=dc.GetFeatureName())
    dr = DimensionReductionByVIF()

    new_df = dr.CalculateVIF(df)

    print(dc.GetArray().shape, new_df.shape)
Ejemplo n.º 6
0
        return "To Remove the unbalance of the training data set, we applied an Tomek link after the " \
               "Synthetic Minority Oversampling TEchnique (SMOTE) to make positive/negative samples balance. "

    def Run(self, data_container, store_path=''):
        """Resample the container's data with self._model and return a new container.

        :param data_container: source container; its GetData() supplies the
            data and label arrays and GetFeatureName() the feature names
        :param store_path: where to save the resampled container. An existing
            directory receives '<self._name>_features.csv'; any other
            non-empty value is passed to Save as given; '' or None saves nothing
        :return: a new DataContainer holding the resampled data and labels,
            with every case renamed to 'Generate<index>'
        """
        data, label, _, _ = data_container.GetData()

        # imbalanced-learn samplers (which self._model presumably is) renamed
        # fit_sample to fit_resample and later removed the old name, so prefer
        # the new method and fall back to the old one for older releases.
        resample = getattr(self._model, 'fit_resample', None)
        if resample is None:
            resample = self._model.fit_sample
        data_resampled, label_resampled = resample(data, label)

        new_case_name = [
            'Generate' + str(index) for index in range(data_resampled.shape[0])
        ]
        new_data_container = DataContainer(data_resampled, label_resampled,
                                           data_container.GetFeatureName(),
                                           new_case_name)

        # Truthiness check so that None, like '', means "do not save" instead
        # of reaching os.path.isdir(None).
        if store_path:
            if os.path.isdir(store_path):
                target = os.path.join(store_path,
                                      '{}_features.csv'.format(self._name))
            else:
                target = store_path
            new_data_container.Save(target)
        return new_data_container


if __name__ == '__main__':
    # Manual smoke test: print the array shape and label sum of the example
    # data before and after SmoteTomekSampling.
    import os

    dc = DataContainer()
    # Build the relative path with os.path.join so the demo also resolves on
    # POSIX systems; on Windows this yields the same '..\..\Example\...' string.
    dc.Load(os.path.join('..', '..', 'Example', 'numeric_feature.csv'))
    print(dc.GetArray().shape, np.sum(dc.GetLabel()))
    b = SmoteTomekSampling()
    new = b.Run(dc)
    print(new.GetArray().shape, np.sum(new.GetLabel()))