Example #1
def clustering(feature_tsv,
               output_dir,
               k_min,
               k_max,
               cv_repetition,
               covariate_tsv=None,
               cv_strategy='hold_out',
               save_models=False,
               cluster_predefined_c=0.25,
               class_weight_balanced=True,
               weight_initialization_type='DPP',
               num_iteration=50,
               num_consensus=20,
               tol=1e-8,
               n_threads=8,
               verbose=False):
    """
    MLNI core function for clustering
    Args:
        feature_tsv: str, path to the tsv containing the extracted features, following the BIDS convention. The tsv
                     contains the following columns:
                     i) the first column is the participant_id;
                     ii) the second column should be the session_id;
                     iii) the third column should be the diagnosis;
                     the following columns should be the extracted features, e.g., the ROI features.
        output_dir: str, path to store the clustering results
        k_min: int, minimum k (number of clusters)
        k_max: int, maximum k (number of clusters)
        cv_repetition: int, number of repetitions for cross-validation (CV)
        covariate_tsv: str, path to the tsv containing the covariates, e.g., age or sex. The header (first 3 columns)
                       of the tsv file is the same as in feature_tsv, following the BIDS convention.
        cv_strategy: str, cross-validation strategy. Default is hold_out. choices=['k_fold', 'hold_out']
        save_models: Bool, default is False (to save disk space). Whether to save all models trained during CV.
                     Set to True only if you are going to apply the trained models to unseen data.
        cluster_predefined_c: float, default is 0.25. The predefined hyperparameter C, used if you do not want to
                              perform a nested CV to find the best C.
        class_weight_balanced: Bool, default is True. Whether to balance the class weights, useful when the two
                               groups are imbalanced.
        weight_initialization_type: str, default is DPP. The strategy for initializing the weights that control the
                                    hyperplanes and the subpopulations of patients.
                                    choices=["random_hyperplane", "random_assign", "k_means", "DPP"]
        num_iteration: int, default is 50. The number of iterations for iteratively optimizing the polytope.
        num_consensus: int, default is 20. The number of repetitions of consensus clustering, used to eliminate
                       unstable clustering results.
        tol: float, default is 1e-8. Stopping criterion for the clustering.
        n_threads: int, default is 8. The number of threads used to run the model in parallel.
        verbose: Bool, default is False. Whether to print verbose output messages.

    Returns: clustering outputs.

    """
    print('MLNI for semi-supervised clustering...')
    input_data = RB_Input(feature_tsv, covariate_tsv=covariate_tsv)

    ## data split
    print('Performing data split based on validation strategy: %s...\n' %
          cv_strategy)
    if cv_strategy == "hold_out":
        ## check if data split has been done, if yes, the pickle file is there
        split_pkl = os.path.join(
            output_dir,
            'data_split_stratified_' + str(cv_repetition) + '-holdout.pkl')
        if os.path.isfile(split_pkl):
            with open(split_pkl, 'rb') as f:
                split_index = pickle.load(f)
        else:
            split_index, _ = make_cv_partition(input_data.get_y(), cv_strategy,
                                               output_dir, cv_repetition)
    elif cv_strategy == "k_fold":
        ## check if data split has been done, if yes, the pickle file is there
        split_pkl = os.path.join(
            output_dir,
            'data_split_stratified_' + str(cv_repetition) + '-fold.pkl')
        if os.path.isfile(split_pkl):
            with open(split_pkl, 'rb') as f:
                split_index = pickle.load(f)
        else:
            split_index, _ = make_cv_partition(input_data.get_y(), cv_strategy,
                                               output_dir, cv_repetition)

    print('Data split has been done!\n')

    print('Starting semi-supervised clustering...')
    ## Here, we perform semi-supervised clustering
    wf_clustering = RB_DualSVM_Subtype(
        input_data,
        feature_tsv,
        split_index,
        cv_repetition,
        k_min,
        k_max,
        os.path.join(output_dir, 'clustering'),
        balanced=class_weight_balanced,
        num_consensus=num_consensus,
        num_iteration=num_iteration,
        tol=tol,
        predefined_c=cluster_predefined_c,
        weight_initialization_type=weight_initialization_type,
        n_threads=n_threads,
        save_models=save_models,
        verbose=verbose)

    wf_clustering.run()
    print('Finished...')
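
For orientation, here is a minimal, hypothetical invocation; the paths and the k range are placeholders rather than values taken from the source.

# Hypothetical usage sketch for clustering(); paths and the k range are placeholders.
# feature.tsv is expected to be tab-separated with the layout:
#   participant_id   session_id   diagnosis   ROI_1   ROI_2   ...
clustering(feature_tsv='/path/to/feature.tsv',
           output_dir='/path/to/output',
           k_min=2,
           k_max=8,
           cv_repetition=100)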
Example #2
def classification_roi_feature_selection(feature_tsv,
                                         output_dir,
                                         cv_repetition,
                                         cv_strategy='hold_out',
                                         class_weight_balanced=True,
                                         feature_selection_method='RFE',
                                         top_k=50,
                                         n_threads=8,
                                         seed=None,
                                         verbose=False):
    """
    MLNI core function for classification for ROI-based features with nested feature selection

    Args:
        feature_tsv: str, path to the tsv containing the extracted features, following the BIDS convention. The tsv
                     contains the following columns:
                     i) the first column is the participant_id;
                     ii) the second column should be the session_id;
                     iii) the third column should be the diagnosis;
                     the following columns should be the extracted features, e.g., the ROI features.
        output_dir: str, path to store the classification results.
        cv_repetition: int, number of repetitions for cross-validation (CV)
        cv_strategy: str, cross-validation strategy. Default is hold_out. choices=['k_fold', 'hold_out']
        class_weight_balanced: Bool, default is True. Whether to balance the class weights, useful when the two
                               groups are imbalanced.
        feature_selection_method: str, default is RFE. choices=['ANOVA', 'RF', 'PCA', 'RFE'].
        top_k: int, default is 50 (i.e., 50%). Percentage of the original features to select.
        n_threads: int, default is 8. The number of threads used to run the model in parallel.
        seed: int, default is None. Random seed used for the data split, for reproducibility.
        verbose: Bool, default is False. Whether to print verbose output messages.

    Returns: classification outputs.

    """
    print(
        'MLNI for a binary classification with nested CV and nested feature selection method...'
    )
    input_data = RB_Input(feature_tsv, standardization_method="minmax")

    ## data split
    print('Performing data split based on validation strategy: %s...\n' %
          cv_strategy)
    ## check if data split has been done, if yes, the pickle file is there
    split_pkl = os.path.join(
        output_dir,
        'data_split_stratified_' + str(cv_repetition) + '-holdout.pkl')
    if os.path.isfile(split_pkl):
        with open(split_pkl, 'rb') as f:
            split_index = pickle.load(f)
    else:
        split_index, _ = make_cv_partition(input_data.get_y(),
                                           cv_strategy,
                                           output_dir,
                                           cv_repetition,
                                           seed=seed)
    print('Data split has been done!\n')

    print('Starting binary classification...')
    ## Here, we perform a nested CV (outer CV with defined CV method, inner CV with 10-fold grid search) for classification.
    if cv_strategy == 'hold_out':
        wf_classification = RB_RepeatedHoldOut_DualSVM_Classification_Nested_Feature_Selection(
            input_data,
            split_index,
            os.path.join(output_dir, 'classification'),
            n_threads=n_threads,
            n_iterations=cv_repetition,
            balanced=class_weight_balanced,
            feature_selection_method=feature_selection_method,
            top_k=top_k,
            verbose=verbose)
        wf_classification.run()
    elif cv_strategy == 'k_fold':
        raise Exception(
            "Nested feature selection is currently only supported for repeated hold-out CV"
        )
    else:
        raise Exception("CV methods have not been implemented")

    print('Finished...')
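
A minimal, hypothetical invocation with placeholder paths; top_k=20 asks the nested RFE to keep the top 20% of the ROI features inside each CV fold.

# Hypothetical usage sketch; all paths are placeholders.
classification_roi_feature_selection(feature_tsv='/path/to/feature.tsv',
                                     output_dir='/path/to/output',
                                     cv_repetition=250,
                                     feature_selection_method='RFE',
                                     top_k=20,
                                     seed=42)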
Example #3
def classification_voxel(participant_tsv,
                         output_dir,
                         cv_repetition,
                         cv_strategy='hold_out',
                         class_weight_balanced=True,
                         n_threads=8,
                         seed=None,
                         verbose=False):
    """
    MLNI core function for classification with voxel-wise features

    Args:
        participant_tsv: str, path to the tsv containing participant information, following the BIDS convention. The
                         tsv contains the following columns:
                         i) the first column is the participant_id;
                         ii) the second column should be the session_id;
                         iii) the third column should be the diagnosis;
                         iv) the fourth column should be the path to each image.
        output_dir: str, path to store the classification results.
        cv_repetition: int, number of repetitions for cross-validation (CV)
        cv_strategy: str, cross-validation strategy. Default is hold_out. choices=['k_fold', 'hold_out']
        class_weight_balanced: Bool, default is True. Whether to balance the class weights, useful when the two
                               groups are imbalanced.
        n_threads: int, default is 8. The number of threads used to run the model in parallel.
        seed: int, default is None. Random seed used for the data split, for reproducibility.
        verbose: Bool, default is False. Whether to print verbose output messages.

    Returns: classification outputs.

    """
    print('MLNI for a binary classification with nested CV...')
    input_data = VB_Input(participant_tsv)

    ## data split
    print('Performing data split based on validation strategy: %s...\n' %
          cv_strategy)
    ## check if data split has been done, if yes, the pickle file is there
    split_pkl = os.path.join(
        output_dir,
        'data_split_stratified_' + str(cv_repetition) + '-holdout.pkl')
    if os.path.isfile(split_pkl):
        with open(split_pkl, 'rb') as f:
            split_index = pickle.load(f)
    else:
        split_index, _ = make_cv_partition(input_data.get_y(),
                                           cv_strategy,
                                           output_dir,
                                           cv_repetition,
                                           seed=seed)
    print('Data split has been done!\n')

    print('Starting binary classification...')
    ## Here, we perform a nested CV (outer CV with defined CV method, inner CV with 10-fold grid search) for classification.
    if cv_strategy == 'hold_out':
        wf_classification = VB_RepeatedHoldOut_DualSVM_Classification(
            input_data,
            split_index,
            os.path.join(output_dir, 'classification'),
            n_threads=n_threads,
            n_iterations=cv_repetition,
            balanced=class_weight_balanced,
            verbose=verbose)
        wf_classification.run()
    elif cv_strategy == 'k_fold':
        wf_classification = VB_KFold_DualSVM_Classification(
            input_data,
            split_index,
            os.path.join(output_dir, 'classification'),
            cv_repetition,
            n_threads=n_threads,
            balanced=class_weight_balanced,
            verbose=verbose)
        wf_classification.run()
    else:
        raise Exception("CV methods have not been implemented")

    print('Finished...')
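
Again a minimal, hypothetical invocation with placeholder paths; the fourth column of participant.tsv must point at the image files, as the docstring describes.

# Hypothetical usage sketch; paths are placeholders.
classification_voxel(participant_tsv='/path/to/participant.tsv',
                     output_dir='/path/to/output',
                     cv_repetition=250,
                     cv_strategy='hold_out',
                     seed=42)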
Example #4
def classification_multiscale_opnmf_multikernel(
        participant_tsv,
        opnmf_dir,
        output_dir,
        components_list,
        cv_repetition,
        cv_strategy='hold_out',
        multikernel_method='AverageMKL',
        class_weight_balanced=True,
        n_threads=8,
        verbose=False):
    """
    Classification based on the multi-scale features extracted from opNMF and different multi-kernel learning (MKL)
    strategies.
    Args:
        participant_tsv: str, path to the tsv containing participant information, following the BIDS convention. The
                         tsv contains the following columns:
                         i) the first column is the participant_id;
                         ii) the second column should be the session_id;
                         iii) the third column should be the diagnosis.
        opnmf_dir: str, path to the output_dir of opNMF
        output_dir: str, path to store the classification results.
        components_list: list, a list containing all the Cs (numbers of components)
        cv_repetition: int, number of repetitions for cross-validation (CV)
        cv_strategy: str, cross-validation strategy. Currently only hold_out is supported. choices=['hold_out']
        multikernel_method: str, method for the MKL. choices=['AverageMKL']
        class_weight_balanced: Bool, default is True. Whether to balance the class weights, useful when the two
                               groups are imbalanced.
        n_threads: int, default is 8. The number of threads used to run the model in parallel.
        verbose: Bool, default is False. Whether to print verbose output messages.
    Returns: classification outputs.

    """
    if cv_strategy != 'hold_out':
        raise Exception("Only repeated hold-out CV is currently supported!")
    ### For the multi-scale opNMF approach
    print('Multi-scale ensemble classification...')
    print('Starting classification for each specific scale...')

    ## read the participant tsv
    df_participant = pd.read_csv(participant_tsv, sep='\t')

    ## create a temp file in the output_dir to save the intermediate tsv files
    output_dir_multikernel = os.path.join(output_dir, 'multikernel')
    output_dir_intermediate = os.path.join(output_dir, 'intermediate')
    if not os.path.exists(output_dir_intermediate):
        os.makedirs(output_dir_intermediate)

    ## make the final results folder
    if not os.path.exists(output_dir_multikernel):
        os.makedirs(output_dir_multikernel)

    def prepare_opnmf_tsv_multikernel(components_list, output_dir, opnmf_dir,
                                      df_participant):
        """
        This is the function to calculate the multi-kernel for classification.
        Args:
            components_list:
            output_dir:
            opnmf_dir:
            df_participant:

        Returns:

        """
        kernel_list = []
        ## loop over the different numbers of components C
        for i in components_list:
            ## create a temp file in the output_dir to save the intermediate tsv files
            component_output_dir = os.path.join(output_dir,
                                                'component_' + str(i))
            if not os.path.exists(component_output_dir):
                os.makedirs(component_output_dir)
            ### grab the output tsv of each C from opNMF
            opnmf_tsv = os.path.join(opnmf_dir, 'NMF', 'component_' + str(i),
                                     'atlas_components_signal.tsv')
            df_opnmf = pd.read_csv(opnmf_tsv, sep='\t')
            ### keep only the rows of opnmf_tsv that also appear in participant_tsv
            df_opnmf = df_opnmf.loc[df_opnmf['participant_id'].isin(
                df_participant['participant_id'])]
            ## now check the dimensions
            if df_participant.shape[0] != df_opnmf.shape[0]:
                raise Exception(
                    "The dimension of the participant_tsv and opNMF are not consistent!"
                )
            ### make sure the row order is consistent with the participant_tsv
            df_opnmf = df_opnmf.set_index('participant_id')
            df_opnmf = df_opnmf.reindex(index=df_participant['participant_id'])
            df_opnmf = df_opnmf.reset_index()
            ## replace the path column in df_opnmf with the diagnosis, and save it to a temporary path for pyHYDRA classification
            diagnosis_list = list(df_participant['diagnosis'])
            df_opnmf["path"] = diagnosis_list
            df_opnmf.rename(columns={'path': 'diagnosis'}, inplace=True)
            ## save to tsv in a temporary folder
            opnmf_component_tsv = os.path.join(
                output_dir, 'intermediate',
                'opnmf_component_' + str(i) + '.tsv')
            df_opnmf.to_csv(opnmf_component_tsv,
                            index=False,
                            sep='\t',
                            encoding='utf-8')
            ## Calculate the linear kernel for each C
            input_data = RB_Input(opnmf_component_tsv,
                                  standardization_method="minmax")
            kernel = input_data.get_kernel()
            kernel_list.append(kernel)

        ## merge the kernels: weight each scale by its number of components
        components_list_weight = [
            i / sum(components_list) for i in components_list
        ]
        kernel_final = sum(
            w * k for w, k in zip(components_list_weight, kernel_list))
        return kernel_final, input_data

    if multikernel_method == 'AverageMKL':
        kernel_final, input_data = prepare_opnmf_tsv_multikernel(
            components_list, output_dir, opnmf_dir, df_participant)
        ## data split
        print(
            'Performing data split based on validation strategy: %s...\n' %
            cv_strategy)
        ## check if data split has been done, if yes, the pickle file is there
        split_pkl = os.path.join(
            output_dir,
            'data_split_stratified_' + str(cv_repetition) + '-holdout.pkl')
        if os.path.isfile(split_pkl):
            with open(split_pkl, 'rb') as f:
                split_index = pickle.load(f)
        else:
            split_index, _ = make_cv_partition(input_data.get_y(), cv_strategy,
                                               output_dir, cv_repetition)
        print('Data split has been done!\n')

        print('Starting binary classification...')
        ## Here, we perform a nested CV (outer CV with defined CV method, inner CV with 10-fold grid search) for classification.
        if cv_strategy == 'hold_out':
            wf_classification = RB_RepeatedHoldOut_DualSVM_Classification(
                input_data,
                split_index,
                os.path.join(output_dir, 'multikernel'),
                n_threads=n_threads,
                n_iterations=cv_repetition,
                balanced=class_weight_balanced,
                kernel=kernel_final,
                verbose=verbose)
            wf_classification.run()
        elif cv_strategy == 'k_fold':
            wf_classification = RB_KFold_DualSVM_Classification(
                input_data,
                split_index,
                os.path.join(output_dir, 'multikernel'),
                cv_repetition,
                n_threads=n_threads,
                kernel=kernel_final,
                balanced=class_weight_balanced,
                verbose=verbose)
            wf_classification.run()
        else:
            raise Exception("CV methods have not been implemented")
    else:
        raise Exception("Other MKL methods have not been implemented yet...")
    print('Finished...')
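
The AverageMKL branch boils down to a weighted average of per-scale Gram matrices, computed inside prepare_opnmf_tsv_multikernel. Below is a self-contained sketch of just that step, assuming linear kernels; average_kernels and the toy feature matrices are hypothetical names introduced only for illustration.

import numpy as np

def average_kernels(kernel_list, components_list):
    # Weighted average of Gram matrices: each kernel is weighted in
    # proportion to its number of opNMF components.
    weights = [c / sum(components_list) for c in components_list]
    return sum(w * k for w, k in zip(weights, kernel_list))

# Toy check: linear kernels from two random feature matrices
# with 5 and 10 components respectively.
rng = np.random.default_rng(0)
X5 = rng.standard_normal((4, 5))
X10 = rng.standard_normal((4, 10))
kernel_final = average_kernels([X5 @ X5.T, X10 @ X10.T],
                               components_list=[5, 10])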
Example #5
def regression_roi(feature_tsv,
                   output_dir,
                   cv_repetition,
                   cv_strategy='hold_out',
                   n_threads=8,
                   seed=None,
                   verbose=False):
    """
    Core function for regression with ROI-based features

    Args:
        feature_tsv: str, path to the tsv containing the extracted features, following the BIDS convention. The tsv
                     contains the following columns:
                     i) the first column is the participant_id;
                     ii) the second column should be the session_id;
                     iii) the third column should be the diagnosis;
                     the following columns should be the extracted features, e.g., the ROI features.
        output_dir: str, path to store the regression results.
        cv_repetition: int, number of repetitions for cross-validation (CV)
        cv_strategy: str, cross-validation strategy. Default is hold_out. choices=['k_fold', 'hold_out']
        n_threads: int, default is 8. The number of threads used to run the model in parallel.
        seed: int, default is None. Random seed used for the data split, for reproducibility.
        verbose: Bool, default is False. Whether to print verbose output messages.

    Returns: regression outputs.

    """
    print('MLNI for a regression with nested CV...')
    input_data = RB_Input(feature_tsv, standardization_method="minmax")

    ## data split
    print('Performing data split based on validation strategy: %s...\n' %
          cv_strategy)
    ## check if data split has been done, if yes, the pickle file is there
    split_pkl = os.path.join(
        output_dir,
        'data_split_stratified_' + str(cv_repetition) + '-holdout.pkl')
    if os.path.isfile(split_pkl):
        with open(split_pkl, 'rb') as f:
            split_index = pickle.load(f)
    else:
        split_index, _ = make_cv_partition(input_data.get_y(),
                                           cv_strategy,
                                           output_dir,
                                           cv_repetition,
                                           seed=seed)
    print('Data split has been done!\n')

    print('Starting regression with SVR...')
    ## Here, we perform a nested CV (outer CV with defined CV method, inner CV with 10-fold grid search) for regression.
    if cv_strategy == 'hold_out':
        wf_regression = RB_RepeatedHoldOut_DualSVM_Regression(
            input_data,
            split_index,
            os.path.join(output_dir, 'regression'),
            n_threads=n_threads,
            n_iterations=cv_repetition,
            verbose=verbose)
        wf_regression.run()
    elif cv_strategy == 'k_fold':
        wf_regression = RB_KFold_DualSVM_Regression(input_data,
                                                    split_index,
                                                    os.path.join(
                                                        output_dir,
                                                        'regression'),
                                                    cv_repetition,
                                                    n_threads=n_threads,
                                                    verbose=verbose)
        wf_regression.run()
    else:
        raise Exception("CV methods have not been implemented")

    print('Finished...')
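
Finally, a minimal, hypothetical invocation with placeholder paths; it is assumed here (not stated in the source) that the 'diagnosis' column of feature.tsv carries the continuous target, e.g., age, for the SVR.

# Hypothetical usage sketch; paths are placeholders, and the 'diagnosis'
# column is assumed to hold the continuous regression target.
regression_roi(feature_tsv='/path/to/feature.tsv',
               output_dir='/path/to/output',
               cv_repetition=250,
               seed=42)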