Example #1
def process_arff(feature_path, subjects, classes, out_dir):
    """Processes the given dataset to return a clean name and path."""

    loaded_dataset = load_arff_dataset(feature_path)
    if len(loaded_dataset.description) > 1:
        method_name = loaded_dataset.description
    else:
        method_name = basename(feature_path)

    out_name = make_dataset_filename(method_name)
    out_path_cur_dataset = pjoin(out_dir, out_name)
    loaded_dataset.save(out_path_cur_dataset)

    if not saved_dataset_matches(loaded_dataset, subjects, classes):
        raise ValueError('supplied ARFF dataset does not match '
                         'samples in the meta data.')

    return method_name, out_path_cur_dataset
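A minimal usage sketch for process_arff as defined above, assuming the function and its helpers are importable from the surrounding neuropredict module; the path, subject ids, and class labels below are hypothetical:

# Hypothetical inputs, for illustration only
subjects = ['sub001', 'sub002']
classes = {'sub001': 'patient', 'sub002': 'control'}

method_name, dataset_path = process_arff('/tmp/features/thickness.arff',
                                         subjects, classes, '/tmp/np_out')
print(method_name, dataset_path)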
Example #2
def import_datasets(method_list,
                    out_dir,
                    subjects,
                    classes,
                    feature_path,
                    feature_type='dir_of_dirs'):
    """
    Imports all the specified feature sets and organizes them into datasets.

    Parameters
    ----------
    method_list : list of callables
        Set of predefined methods returning a vector of features
        for a given sample id and location

    out_dir : str
        Path to the output folder

    subjects : list of str
        List of sample ids

    classes : dict
        Dict identifying the class for each sample id in the dataset.

    feature_path : list of str
        List of paths to the root folder containing features (pre- or user-defined).
        Must be of same length as method_list

    feature_type : str
        A string identifying the structure of the feature set.
        Choices = ('dir_of_dirs', 'data_matrix')

    Returns
    -------
    method_names : list of str
        List of method names used for annotation

    outpath_list : list of str
        List of paths to the imported feature sets

    """
    def clean_str(string):
        return ' '.join(string.strip().split(' _-:\n\r\t'))

    method_names = list()
    outpath_list = list()

    for mm, cur_method in enumerate(method_list):
        if cur_method in [get_pyradigm]:

            method_name, out_path_cur_dataset = process_pyradigm(
                feature_path[mm], subjects, classes)
        elif cur_method in [get_arff]:
            method_name, out_path_cur_dataset = process_arff(
                feature_path[mm], subjects, classes, out_dir)
        else:
            if cur_method in [get_dir_of_dirs]:
                method_name = basename(feature_path[mm])

            elif cur_method in [get_data_matrix]:
                method_name = os.path.splitext(basename(feature_path[mm]))[0]

            else:
                method_name = cur_method.__name__

            out_name = make_dataset_filename(method_name)
            out_path_cur_dataset = pjoin(out_dir, out_name)
            if not saved_dataset_matches(out_path_cur_dataset, subjects,
                                         classes):
                # noinspection PyTypeChecker
                out_path_cur_dataset = get_features(subjects, classes,
                                                    feature_path[mm], out_dir,
                                                    out_name, cur_method,
                                                    feature_type)

        method_names.append(clean_str(method_name))
        outpath_list.append(out_path_cur_dataset)

    # checking if there are any duplicates
    if len(set(outpath_list)) < len(outpath_list):
        raise RuntimeError(
            'Duplicate paths to input dataset found!\n'
            'Try to distinguish the inputs further. Otherwise report this '
            'bug @ github.com/raamana/neuropredict/issues/new')

    print('\nData import is done.\n\n')

    return method_names, outpath_list
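A usage sketch for this variant, which returns the cleaned method names alongside the list of per-dataset output paths; the inputs are hypothetical and get_dir_of_dirs is assumed to be in scope in the surrounding module:

subjects = ['sub001', 'sub002']
classes = {'sub001': 'patient', 'sub002': 'control'}

method_names, outpath_list = import_datasets(
    method_list=[get_dir_of_dirs],
    out_dir='/tmp/np_out',
    subjects=subjects,
    classes=classes,
    feature_path=['/tmp/features/freesurfer'],
    feature_type='dir_of_dirs')

for name, path in zip(method_names, outpath_list):
    print(name, '->', path)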
Example #3
def import_datasets(method_list,
                    out_dir,
                    subjects,
                    classes,
                    feature_path,
                    feature_type='dir_of_dirs'):
    """
    Imports all the specified feature sets and organizes them into datasets.

    Parameters
    ----------
    method_list : list of callables
        Set of predefined methods returning a vector of features for a given sample id and location
    out_dir : str
        Path to the output folder

    subjects : list of str
        List of sample ids
    classes : dict
        Dict identifying the class for each sample id in the dataset.
    feature_path : list of str
        List of paths to the root directory containing the features (pre- or user-defined).
        Must be of same length as method_list
    feature_type : str
        A string identifying the structure of the feature set.
        Choices = ('dir_of_dirs', 'data_matrix')

    Returns
    -------
    method_names : list of str
        List of method names used for annotation.
    dataset_paths_file : str
        Path to the file containing paths to imported feature sets.

    """
    def clean_str(string):
        return ' '.join(string.strip().split(' _-:\n\r\t'))

    method_names = list()
    outpath_list = list()
    for mm, cur_method in enumerate(method_list):
        if cur_method in [get_dir_of_dirs]:
            method_name = basename(feature_path[mm])

        elif cur_method in [get_data_matrix]:
            method_name = os.path.splitext(basename(feature_path[mm]))[0]

        elif cur_method in [get_pyradigm]:

            if feature_type in ['pyradigm']:
                loaded_dataset = MLDataset(filepath=feature_path[mm])
            else:
                raise ValueError('Invalid state of the program!')

            if len(loaded_dataset.description) > 1:
                method_name = loaded_dataset.description
            else:
                method_name = basename(feature_path[mm])

            method_names.append(clean_str(method_name))
            if saved_dataset_matches(loaded_dataset, subjects, classes):
                outpath_list.append(feature_path[mm])
                continue
            else:
                raise ValueError(
                    'supplied pyradigm dataset does not match samples in the meta data.'
                )

        elif cur_method in [get_arff]:

            loaded_dataset = MLDataset(arff_path=feature_path[mm])
            if len(loaded_dataset.description) > 1:
                method_name = loaded_dataset.description
            else:
                method_name = basename(feature_path[mm])

            method_names.append(clean_str(method_name))
            out_name = make_dataset_filename(method_name)
            outpath_dataset = pjoin(out_dir, out_name)
            loaded_dataset.save(outpath_dataset)
            outpath_list.append(outpath_dataset)
            continue
        else:
            # adding an index for an even more unique identification
            # method_name = '{}_{}'.format(cur_method.__name__,mm)
            method_name = cur_method.__name__

        method_names.append(clean_str(method_name))
        out_name = make_dataset_filename(method_name)

        outpath_dataset = pjoin(out_dir, out_name)
        if not saved_dataset_matches(outpath_dataset, subjects, classes):
            # noinspection PyTypeChecker
            outpath_dataset = get_features(subjects, classes, feature_path[mm],
                                           out_dir, out_name, cur_method,
                                           feature_type)

        outpath_list.append(outpath_dataset)

    combined_name = uniq_combined_name(method_names)

    dataset_paths_file = pjoin(out_dir,
                               'datasetlist.' + combined_name + '.txt')
    with open(dataset_paths_file, 'w') as dpf:
        dpf.writelines('\n'.join(outpath_list))

    return method_names, dataset_paths_file
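Unlike the previous variant, this one writes the imported paths to a datasetlist text file. Below is a small sketch of how a caller might read that file back; the inputs are again hypothetical and get_data_matrix is assumed to be in scope:

subjects = ['sub001', 'sub002']
classes = {'sub001': 'patient', 'sub002': 'control'}

method_names, dataset_paths_file = import_datasets(
    [get_data_matrix], '/tmp/np_out', subjects, classes,
    ['/tmp/features/features.csv'], feature_type='data_matrix')

with open(dataset_paths_file) as dpf:
    dataset_paths = [line.strip() for line in dpf if line.strip()]
print('{} feature set(s) imported.'.format(len(dataset_paths)))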
Example #4
def import_datasets(method_list, out_dir, subjects, classes,
                    feature_path, feature_type='dir_of_dirs',
                    user_impute_strategy=cfg.default_imputation_strategy):
    """
    Imports all the specified feature sets and organizes them into datasets.

    Parameters
    ----------
    method_list : list of callables
        Set of predefined methods returning a vector of features for a given sample id and location

    out_dir : str
        Path to the output folder

    subjects : list of str
        List of sample ids

    classes : dict
        Dict identifying the class for each sample id in the dataset.

    feature_path : list of str
        List of paths to the root directory containing the features (pre- or user-defined).
        Must be of same length as method_list

    feature_type : str
        A string identifying the structure of the feature set.
        Choices = ('dir_of_dirs', 'data_matrix')

    user_impute_strategy : str
        Strategy to handle missing data: whether to raise an error if data is
        missing, or to impute it using the method chosen here.

    Returns
    -------
    method_names : list of str
        List of method names used for annotation

    dataset_paths_file : str
        Path to the file containing paths to imported feature sets

    missing_data_flag : list
        List of boolean flags indicating whether data is missing in each of the input datasets.

    """

    def clean_str(string): return ' '.join(string.strip().split(' _-:\n\r\t'))

    from neuropredict.io import process_pyradigm, process_arff

    method_names = list()
    outpath_list = list()
    missing_data_flag = list() # boolean flag for each dataset

    for mm, cur_method in enumerate(method_list):
        if cur_method in [get_pyradigm]:

            method_name, out_path_cur_dataset = process_pyradigm(feature_path[mm], subjects, classes)

        elif cur_method in [get_arff]:

            method_name, out_path_cur_dataset = process_arff(feature_path[mm], subjects, classes,
                                                             out_dir)

        else:

            if cur_method in [get_dir_of_dirs]:
                method_name = basename(feature_path[mm])

            elif cur_method in [get_data_matrix]:
                method_name = os.path.splitext(basename(feature_path[mm]))[0]

            else:
                method_name = cur_method.__name__

            out_name = make_dataset_filename(method_name)

            out_path_cur_dataset = pjoin(out_dir, out_name)
            if not saved_dataset_matches(out_path_cur_dataset, subjects, classes):
                # noinspection PyTypeChecker
                out_path_cur_dataset = get_features(subjects, classes,
                                                    feature_path[mm],
                                                    out_dir, out_name,
                                                    cur_method, feature_type)

        # checking for presence of any missing data
        data_mat, targets, ids = MLDataset(filepath=out_path_cur_dataset).data_and_labels()
        is_nan = np.isnan(data_mat)
        if is_nan.any():
            data_missing_here = True
            num_sub_with_md = np.sum(is_nan.sum(axis=1) > 0)
            num_var_with_md = np.sum(is_nan.sum(axis=0) > 0)
            if user_impute_strategy == 'raise':
                raise MissingDataException(
                    '{}/{} subjects with missing data found in {}/{} features\n'
                    '\tin {} dataset at {}\n'
                    '\tFill them and rerun, '
                    'or choose one of the available imputation strategies: {}'
                    ''.format(num_sub_with_md, data_mat.shape[0],
                              num_var_with_md, data_mat.shape[1],
                              method_name, out_path_cur_dataset,
                              cfg.avail_imputation_strategies))
        else:
            data_missing_here = False

        method_names.append(clean_str(method_name))
        outpath_list.append(out_path_cur_dataset)
        missing_data_flag.append(data_missing_here)

    # finalizing the imputation strategy
    if any(missing_data_flag):
        print('\nOne or more of the input datasets have missing data!')
        if user_impute_strategy == 'raise':
            raise MissingDataException('Fill them and rerun, or choose one of the available '
                                       'imputation strategies: {}'
                                       ''.format(cfg.avail_imputation_strategies))
        else:
            impute_strategy = user_impute_strategy
            print('The imputation strategy chosen is: {}'.format(impute_strategy))
    else:
        # disabling the imputation altogether if there is no missing data
        impute_strategy = None
        if user_impute_strategy not in ('raise', None):
            print('Ignoring imputation strategy chosen, as no missing data were found!')

    combined_name = uniq_combined_name(method_names)

    # checking if there are any duplicates
    if len(set(outpath_list)) < len(outpath_list):
        raise RuntimeError('Duplicate paths to input dataset found!\n'
                           'Try to distinguish the inputs further. Otherwise report this bug '
                           '@ github.com/raamana/neuropredict/issues/new')

    dataset_paths_file = pjoin(out_dir, 'datasetlist.' + combined_name + '.txt')
    with open(dataset_paths_file, 'w') as dpf:
        dpf.writelines('\n'.join(outpath_list))

    print('\nData import is done.\n\n')

    return method_names, dataset_paths_file, missing_data_flag, impute_strategy
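A sketch of how the four return values of this final variant might be consumed; impute_strategy comes back as None when no missing data are found. The inputs are hypothetical, and 'median' is assumed here to be one of the strategies listed in cfg.avail_imputation_strategies:

subjects = ['sub001', 'sub002']
classes = {'sub001': 'patient', 'sub002': 'control'}

(method_names, dataset_paths_file,
 missing_data_flag, impute_strategy) = import_datasets(
    [get_data_matrix], '/tmp/np_out', subjects, classes,
    ['/tmp/features/features.csv'], feature_type='data_matrix',
    user_impute_strategy='median')

if impute_strategy is None:
    print('No missing data found; imputation disabled.')
else:
    with_missing = [name for name, flag
                    in zip(method_names, missing_data_flag) if flag]
    print('Imputing ({}) for datasets with missing data: {}'
          ''.format(impute_strategy, with_missing))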