Example #1
def new_dataset_with_same_ids_classes(in_ds):
    """Returns a new MLDataset with the same sample IDs and classes as in_ds, but random features."""
    # max_feat_dim is a module-level constant in the original test file
    feat_dim = np.random.randint(1, max_feat_dim)
    out_ds = MLDataset()
    for id_ in in_ds.keys:
        out_ds.add_sample(id_, np.random.rand(feat_dim),
                          class_id=in_ds.classes[id_], label=in_ds.labels[id_])

    return out_ds
Example #2
def make_fully_separable_classes(max_class_size=10, max_dim=22):

    from sklearn.datasets import make_blobs

    random_center = np.random.rand(max_dim)
    cluster_std = 1.5
    centers = [random_center, random_center + cluster_std * 6]
    blobs_X, blobs_y = make_blobs(n_samples=max_class_size, n_features=max_dim,
                                  centers=centers, cluster_std=cluster_std)

    unique_labels = np.unique(blobs_y)
    class_ids = {lbl: str(lbl) for lbl in unique_labels}

    new_ds = MLDataset()
    for index, row in enumerate(blobs_X):
        new_ds.add_sample('sub{}'.format(index), row, label=blobs_y[index],
                          class_id=class_ids[blobs_y[index]])

    return new_ds
def test_eq_copy():
    new_copy = MLDataset(in_dataset=copy_dataset)
    assert new_copy == copy_dataset
def test_init_with_dict():
    new_ds = MLDataset(data=test_dataset.data,
                       labels=test_dataset.labels,
                       classes=test_dataset.classes)
    assert new_ds == test_dataset
def test_cant_read_nonexisting_file():
    with raises(IOError):
        a = MLDataset('/nonexistentrandomdir/disofddlsfj/arbitrary.noname.pkl')
import os

import numpy as np
from pytest import raises, warns

from pyradigm import MLDataset

out_dir = '.'

num_classes = np.random.randint(2, 50)
class_sizes = np.random.randint(10, 100, num_classes)
num_features = np.random.randint(10, 100)
num_samples = sum(class_sizes)

class_set = np.array(['C{:05d}'.format(x) for x in range(num_classes)])
feat_names = np.array([str(x) for x in range(num_features)])

test_dataset = MLDataset()
for class_index, class_id in enumerate(class_set):
    for sub_ix in range(class_sizes[class_index]):
        subj_id = '{}_S{:05d}'.format(class_set[class_index], sub_ix)
        feat = np.random.random(num_features)
        test_dataset.add_sample(subj_id, feat, class_index, class_id,
                                feat_names)

out_file = os.path.join(out_dir, 'random_example_dataset.pkl')
test_dataset.save(out_file)

# same IDs, new features
same_ids_new_feat = MLDataset()
for sub_id in test_dataset.keys:
    feat = np.random.random(num_features)
    same_ids_new_feat.add_sample(sub_id, feat, test_dataset.labels[sub_id],
                                 test_dataset.classes[sub_id])
def import_datasets(method_list, out_dir, subjects, classes,
                    feature_path, feature_type='dir_of_dirs',
                    user_impute_strategy=cfg.default_imputation_strategy):
    """
    Imports all the specified feature sets and organizes them into datasets.

    Parameters
    ----------
    method_list : list of callables
        Set of predefined methods returning a vector of features for a given sample id and location

    out_dir : str
        Path to the output folder

    subjects : list of str
        List of sample ids

    classes : dict
        Dict identifying the class for each sample id in the dataset.

    feature_path : list of str
        List of paths to the root directory containing the features (pre- or user-defined).
        Must be of the same length as method_list.

    feature_type : str
        A string identifying the structure of the feature set.
        Choices: ('dir_of_dirs', 'data_matrix')

    user_impute_strategy : str
        Strategy to handle missing data: either raise an error when data is missing,
        or impute it using the method chosen here.

    Returns
    -------
    method_names : list of str
        List of method names used for annotation

    dataset_paths_file : str
        Path to the file containing paths to imported feature sets

    missing_data_flag : list
        List of boolean flags indicating whether data is missing in each of the input datasets.

    impute_strategy : str or None
        Finalized imputation strategy (None when no missing data is found).

    """

    def clean_str(string): return ' '.join(string.strip().split(' _-:\n\r\t'))

    from neuropredict.io import process_pyradigm, process_arff

    method_names = list()
    outpath_list = list()
    missing_data_flag = list() # boolean flag for each dataset

    for mm, cur_method in enumerate(method_list):
        if cur_method in [get_pyradigm]:

            method_name, out_path_cur_dataset = process_pyradigm(feature_path[mm], subjects, classes)

            # if feature_type in ['pyradigm']:
            #     loaded_dataset = MLDataset(filepath=feature_path[mm])
            # else:
            #     raise ValueError('Invalid state of the program!')
            #
            # if len(loaded_dataset.description) > 1:
            #     method_name = loaded_dataset.description
            # else:
            #     method_name = basename(feature_path[mm])
            #
            # method_names.append(clean_str(method_name))
            # if not saved_dataset_matches(loaded_dataset, subjects, classes):
            #     raise ValueError(
            #         'supplied pyradigm dataset does not match samples in the meta data.')
            # else:
            #     out_path_cur_dataset = feature_path[mm]

        elif cur_method in [get_arff]:

            method_name, out_path_cur_dataset = process_arff(feature_path[mm], subjects, classes,
                                                             out_dir)

            # loaded_dataset = MLDataset(arff_path=feature_path[mm])
            # if len(loaded_dataset.description) > 1:
            #     method_name = loaded_dataset.description
            # else:
            #     method_name = basename(feature_path[mm])
            #
            # method_names.append(clean_str(method_name))
            # out_name = make_dataset_filename(method_name)
            # out_path_cur_dataset = pjoin(out_dir, out_name)
            # loaded_dataset.save(out_path_cur_dataset)
        else:

            if cur_method in [get_dir_of_dirs]:
                method_name = basename(feature_path[mm])

            elif cur_method in [get_data_matrix]:
                method_name = os.path.splitext(basename(feature_path[mm]))[0]

            else:
                method_name = cur_method.__name__

            out_name = make_dataset_filename(method_name)

            out_path_cur_dataset = pjoin(out_dir, out_name)
            if not saved_dataset_matches(out_path_cur_dataset, subjects, classes):
                # noinspection PyTypeChecker
                out_path_cur_dataset = get_features(subjects, classes,
                                                    feature_path[mm],
                                                    out_dir, out_name,
                                                    cur_method, feature_type)

        # checking for presence of any missing data
        data_mat, targets, ids = MLDataset(filepath=out_path_cur_dataset).data_and_labels()
        is_nan = np.isnan(data_mat)
        if is_nan.any():
            data_missing_here = True
            num_sub_with_md = np.sum(is_nan.sum(axis=1) > 0)
            num_var_with_md = np.sum(is_nan.sum(axis=0) > 0)
            if user_impute_strategy == 'raise':
                raise MissingDataException(
                    '{}/{} subjects with missing data found in {}/{} features\n'
                    '\tin {} dataset at {}\n'
                    '\tFill them and rerun, '
                    'or choose one of the available imputation strategies: {}'
                    ''.format(num_sub_with_md, data_mat.shape[0],
                              num_var_with_md, data_mat.shape[1],
                              method_name, out_path_cur_dataset,
                              cfg.avail_imputation_strategies))
        else:
            data_missing_here = False

        method_names.append(clean_str(method_name))
        outpath_list.append(out_path_cur_dataset)
        missing_data_flag.append(data_missing_here)

    # finalizing the imputation strategy
    if any(missing_data_flag):
        print('\nOne or more of the input datasets have missing data!')
        if user_impute_strategy == 'raise':
            raise MissingDataException('Fill them and rerun, or choose one of the available '
                                       'imputation strategies: {}'
                                       ''.format(cfg.avail_imputation_strategies))
        else:
            impute_strategy = user_impute_strategy
            print('The imputation strategy chosen is: {}'.format(impute_strategy))
    else:
        # disabling the imputation altogether if there is no missing data
        impute_strategy = None
        if user_impute_strategy in ('raise', None):
            print('Ignoring imputation strategy chosen, as no missing data were found!')

    combined_name = uniq_combined_name(method_names)

    # checking if there are any duplicates
    if len(set(outpath_list)) < len(outpath_list):
        raise RuntimeError('Duplicate paths to input dataset found!\n'
                           'Try to distinguish the inputs further. Otherwise, report this bug '
                           '@ github.com/raamana/neuropredict/issues/new')

    dataset_paths_file = pjoin(out_dir, 'datasetlist.' + combined_name + '.txt')
    with open(dataset_paths_file, 'w') as dpf:
        dpf.writelines('\n'.join(outpath_list))

    print('\nData import is done.\n\n')

    return method_names, dataset_paths_file, missing_data_flag, impute_strategy
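
A minimal, hypothetical call to import_datasets is sketched below; the subject IDs, class labels, file paths and the 'median' strategy are illustrative assumptions, and get_pyradigm is the predefined reader referenced in the function body:

# hypothetical usage sketch for import_datasets(); IDs, paths and the imputation strategy are illustrative
subjects = ['sub001', 'sub002', 'sub003', 'sub004']
classes = {'sub001': 'CN', 'sub002': 'AD', 'sub003': 'CN', 'sub004': 'AD'}

method_names, dataset_paths_file, missing_flags, impute_strategy = \
    import_datasets(method_list=[get_pyradigm],
                    out_dir='/tmp/np_out',
                    subjects=subjects, classes=classes,
                    feature_path=['/tmp/thickness.MLDataset.pkl'],
                    user_impute_strategy='median')
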
Example #8
for base_feature in base_feature_list:
    print(' Processing {}'.format(base_feature))
    incomplete_processing[base_feature] = dict()
    comb_nan_values[base_feature] = dict()

    for stat_method in roi_stat_list:
        print('Gathering data for {}'.format(stat_method))
        expt_id = '{}_{}_{}_smoothing{}_size{}'.format(stat_method,
                                                       base_feature, atlas,
                                                       fwhm, node_size)

        flag_nan_exists = False
        flag_incomplete = False
        flag_unexpected = False
        dataset = MLDataset()

        incomplete_processing[base_feature][stat_method] = dict()
        comb_nan_values[base_feature][stat_method] = dict()
        for ds_name in dataset_list:
            print(' working on {}'.format(ds_name))
            proc_dir = pjoin(base_dir, ds_name, 'processed')
            out_dir = pjoin(proc_dir, 'graynet',
                            '{}_{}_fwhm{}'.format(base_feature, atlas, fwhm))

            meta_list = pjoin(proc_dir, 'target_lists',
                              'meta_{}.csv'.format(ds_name))
            sample_ids, classes = run_workflow.get_metadata(meta_list)

            incomplete_processing[base_feature][stat_method][ds_name] = list()
            comb_nan_values[base_feature][stat_method][ds_name] = list()
Example #9
# os, numpy and the version guard below are assumptions: the top of this snippet was truncated
import os
from sys import version_info

import numpy as np

if version_info >= (2, 7, 13):
    try:
        from pyradigm import MLDataset
    except ImportError:
        raise ImportError('could not import pyradigm')
else:
    raise NotImplementedError('pyradigm supports only Python 2.7.13 or 3+. '
                              'Upgrading to Python 3+ is recommended.')

out_dir  = '.'

num_classes  = np.random.randint( 2, 50)
class_sizes  = np.random.randint(10, 100, num_classes)
num_features = np.random.randint(10, 100)

class_set    = np.array([ 'C{:05d}'.format(x) for x in range(num_classes)])
feat_names   = np.array([ str(x) for x in range(num_features) ])

test_dataset = MLDataset()
for class_index, class_id in enumerate(class_set):
    for sub_ix in range(class_sizes[class_index]):
        subj_id = '{}_S{:05d}'.format(class_set[class_index],sub_ix)
        feat = np.random.random(num_features)
        test_dataset.add_sample(subj_id, feat, class_index, class_id, feat_names)

out_file = os.path.join(out_dir,'random_example_dataset.pkl')
test_dataset.save(out_file)

# same IDs, new features
same_ids_new_feat = MLDataset()
for sub_id in test_dataset.keys:
    feat = np.random.random(num_features)
    same_ids_new_feat.add_sample(sub_id, feat,
                                 test_dataset.labels[sub_id],
                                 test_dataset.classes[sub_id])
Example #10
def test_unpickling():
    out_file = os.path.join(out_dir, 'random_pickled_dataset.pkl')
    copy_dataset.save(out_file)
    reloaded_dataset = MLDataset(filepath=out_file,
                                 description='reloaded test_dataset')
    assert copy_dataset == reloaded_dataset
Example #11
import os
import sys

import numpy as np

sys.dont_write_bytecode = True

from pytest import raises, warns, set_trace

from pyradigm import MLDataset

out_dir = '.'

num_classes = np.random.randint(2, 50)
class_sizes = np.random.randint(10, 1000, num_classes)
num_features = np.random.randint(10, 500)

class_set = np.array(['C{:05d}'.format(x) for x in range(num_classes)])
feat_names = np.array([str(x) for x in range(num_features)])

test_dataset = MLDataset()
for class_index, class_id in enumerate(class_set):
    for sub_ix in range(class_sizes[class_index]):
        subj_id = '{}_S{:05d}'.format(class_set[class_index], sub_ix)
        feat = np.random.random(num_features)
        test_dataset.add_sample(subj_id, feat, class_index, class_id,
                                feat_names)

out_file = os.path.join(out_dir, 'random_example_dataset.pkl')
test_dataset.save(out_file)

class_set, label_set, class_sizes = test_dataset.summarize_classes()

reloaded_dataset = MLDataset(filepath=out_file,
                             description='reloaded test_dataset')
Example #12
def run(dataset_path_file, method_names, out_results_dir,
        train_perc = 0.8, num_repetitions = 200,
        positive_class = None):
    """

    Parameters
    ----------
    dataset_path_file : str
        Path to a file containing a list of paths, each pointing to a valid MLDataset.
    method_names : list
        A list of names denoting the different feature extraction methods.
    out_results_dir : str
        Path to output directory to save the cross validation results to.
    train_perc : float, optional
        Percentage of subjects to train the classifier on.
        The percentage is applied to the size of the smallest class to estimate
        the number of subjects from each class to be reserved for training.
        The smallest class is chosen to avoid class-imbalance in the training set.
        Default: 0.8 (80%).
    num_repetitions : int, optional
        Number of repetitions of cross-validation estimation. Default: 200.
    positive_class : str, optional
        Name of the class to be treated as positive in the calculation of AUC.

    Returns
    -------
    results_path : str
        Path to pickle file containing full set of CV results.

    """

    # structure of this function
    # load datasets
    # validate each dataset
    # ensure same number of subjects across all datasets
    #        same number of features/subject in each dataset
    #        same class set across all datasets
    # re-map the labels (from 1 to n) to ensure numeric labels do not differ
    # sort them if need be (not needed if MLDatasets)
    # for rep 1 to N, for feat 1 to M,
    #   run train/test/evaluate.
    #   keep tab on misclassifications
    # save results (comprehensive and reloadable manner)

    assert os.path.exists(dataset_path_file), "File containing dataset paths does not exist."
    with open(dataset_path_file, 'r') as dpf:
        dataset_paths = dpf.read().splitlines()

    try:
        out_results_dir = os.path.abspath(out_results_dir)
        if not os.path.exists(out_results_dir):
            os.mkdir(out_results_dir)
    except:
        raise IOError('Error in checking or creating the output directory. Ensure write permissions!')

    num_repetitions = int(num_repetitions)
    assert num_repetitions < np.Inf, "An infinite number of repetitions is not recommended!"
    assert num_repetitions > 1, "More than one repetition is necessary!"
    # TODO warn when num_repetitions is not sufficient: need a heuristic to assess it

    # loading datasets
    datasets = list()
    for fp in dataset_paths:
        assert os.path.exists(fp), "Dataset @ {} does not exist.".format(fp)

        try:
            # there is an internal validation of dataset
            ds = MLDataset(fp)
        except:
            print("Dataset @ {} is not a valid MLDataset!".format(fp))
            raise

        # add the valid dataset to list
        datasets.append(ds)

    # ensure same number of subjects across all datasets
    num_datasets = int(len(datasets))
    # looking into the first dataset
    common_ds = datasets[0]
    class_set, label_set, class_sizes = common_ds.summarize_classes()
    num_samples = common_ds.num_samples
    num_classes = len(class_set)

    if num_datasets > 1:
        for idx in range(1, num_datasets):
            this_ds = datasets[idx]
            assert num_samples==this_ds.num_samples, "Number of samples in different datasets differ!"
            assert set(class_set)==set(this_ds.classes.values()), \
                "Classes differ among datasets! \n One dataset: {} \n Another: {}".format(
                    set(class_set), set(this_ds.classes.values()))

    # re-map the labels (from 1 to n) to ensure numeric labels do not differ
    remapped_class_labels = dict()
    for idx, cls in enumerate(class_set):
        remapped_class_labels[cls] = idx

    # finding the numeric label for positive class
    # this label will also be the index into the arrays over classes, due to the construction above
    if num_classes == 2:
        if positive_class is None:
            positive_class = class_set[-1]
        # List.index(item) returns the first index of a match
        pos_class_index = class_set.index(positive_class)  # remapped_class_labels[positive_class]

    labels_with_correspondence = dict()
    for subid in common_ds.sample_ids:
        labels_with_correspondence[subid] = remapped_class_labels[common_ds.classes[subid]]

    for idx in range(num_datasets):
        datasets[idx].labels = labels_with_correspondence

    assert (train_perc >= 0.01 and train_perc <= 0.99), \
        "Training percentage {} out of bounds - must be between 0.01 and 0.99".format(train_perc)

    num_features = np.zeros(num_datasets).astype(np.int64)
    for idx in range(num_datasets):
        num_features[idx] = datasets[idx].num_features

    # determine the common size for training
    print("Different classes in the training set are stratified to match the smallest class!")
    train_size_per_class = np.int64(np.floor(train_perc*class_sizes).astype(np.float64))
    # per-class
    train_size_common = np.int64(np.minimum(min(train_size_per_class), train_size_per_class))
    # single number
    reduced_sizes = np.unique(train_size_common)
    assert len(reduced_sizes)==1, "Error in stratification of training set based on the smallest class!"
    train_size_common = reduced_sizes[0]

    total_test_samples = np.int64(np.sum(class_sizes) - num_classes*train_size_common)

    pred_prob_per_class    = np.full([num_repetitions, num_datasets, total_test_samples, num_classes], np.nan)
    pred_labels_per_rep_fs = np.full([num_repetitions, num_datasets, total_test_samples], np.nan)
    test_labels_per_rep    = np.full([num_repetitions, total_test_samples], np.nan)

    best_min_leaf_size  = np.full([num_repetitions, num_datasets], np.nan)
    best_num_predictors = np.full([num_repetitions, num_datasets], np.nan)

    # initialize misclassification counters
    num_times_tested = list()
    num_times_misclfd= list()
    for dd in range(num_datasets):
        num_times_tested.append(Counter(common_ds.sample_ids))
        num_times_misclfd.append(Counter(common_ds.sample_ids))
        for subid in common_ds.sample_ids:
            num_times_tested[dd][subid] = 0
            num_times_misclfd[dd][subid]= 0

    # multi-class metrics
    confusion_matrix  = np.full([num_classes, num_classes, num_repetitions, num_datasets], np.nan)
    accuracy_balanced = np.full([num_repetitions, num_datasets], np.nan)
    auc_weighted = np.full([num_repetitions, num_datasets], np.nan)

    # # specificity & sensitivity are ill-defined in the general case, as they require knowing which class is positive;
    # # hence we refer to them from now on simply as correct classification rates (ccr).
    # # moreover, these can easily be computed from the confusion matrix anyway.
    # ccr_perclass = np.full([num_repetitions, num_datasets, num_classes], np.nan)
    # binary metrics
    # TODO later when are the uses of precision and recall appropriate?
    # precision    = np.full([num_repetitions, num_datasets], np.nan)
    # recall       = np.full([num_repetitions, num_datasets], np.nan)

    feature_names = [None]*num_datasets
    feature_importances_rf = [None]*num_datasets
    for idx in range(num_datasets):
        feature_importances_rf[idx] = np.full([num_repetitions,num_features[idx]], np.nan)
        feature_names[idx] = datasets[idx].feature_names

    # repeated hold-out CV begins here
    # TODO LATER implement a multi-process version, as different repetitions are embarrassingly parallel
    # use the following one-statement processing that can be forked to parallel threads
    # pred_prob_per_class[rep, dd, :, :], pred_labels_per_rep_fs[rep, dd, :], \
    # confmat, misclsfd_ids_this_run, feature_importances_rf[dd][rep, :], \
    # best_min_leaf_size[rep, dd], best_num_predictors[rep, dd] \
    #     = holdout_evaluation(datasets, train_size_common, total_test_samples)

    max_width_method_names = max(map(len, method_names))

    for rep in range(num_repetitions):
        print("\n CV repetition {:3d} ".format(rep))

        # TODO to achieve feature- or method-level parallelization,
        #   train/test splits need to be saved at the entry level for each subgroup and used here
        train_set, test_set = common_ds.train_test_split_ids(count_per_class=train_size_common)
        test_labels_per_rep[rep, :] = [ common_ds.labels[sid] for sid in test_set if sid in common_ds.labels]

        # evaluating each feature/dataset
        # try setting test_labels_per_rep outside the dd loop, as it's the same across all dd
        for dd in range(num_datasets):
            # print("\t feature {:3d} {:>{}}: ".format(dd, method_names[dd], max_width_method_names), end='')
            print("\t feature {index:3d} {name:>{namewidth}} : ".format(index=dd,
                                                                       name=method_names[dd],
                                                                       namewidth=max_width_method_names),
                  end='')

            train_fs = datasets[dd].get_subset(train_set)
            test_fs  = datasets[dd].get_subset(test_set)

            pred_prob_per_class[rep, dd, :, :], \
                pred_labels_per_rep_fs[rep, dd, :], true_test_labels, \
                confmat, misclsfd_ids_this_run, feature_importances_rf[dd][rep,:], \
                best_min_leaf_size[rep, dd], best_num_predictors[rep, dd] = \
                eval_optimized_clsfr_on_testset(train_fs, test_fs, label_order_in_CM=label_set)

            accuracy_balanced[rep,dd] = balanced_accuracy(confmat)
            confusion_matrix[:,:,rep,dd] = confmat
            print('balanced accuracy: {:.4f} '.format(accuracy_balanced[rep, dd]), end='')

            if num_classes == 2:
                # TODO FIX: AUC calculation is flipped
                # TODO store fpr and tpr per rep, and give the user the option to visualize the average if they wish
                auc_weighted[rep,dd] = roc_auc_score(true_test_labels,
                                                       pred_prob_per_class[rep, dd, :, pos_class_index],
                                                       average='weighted')
                print('\t weighted AUC: {:.4f}'.format(auc_weighted[rep,dd]), end='')

            num_times_misclfd[dd].update(misclsfd_ids_this_run)
            num_times_tested[dd].update(test_fs.sample_ids)

            print('')

    # save results
    var_list_to_save = [dataset_paths, method_names, train_perc, num_repetitions, num_classes,
                        pred_prob_per_class, pred_labels_per_rep_fs, test_labels_per_rep,
                        best_min_leaf_size, best_num_predictors,
                        feature_importances_rf, feature_names,
                        num_times_misclfd, num_times_tested,
                        confusion_matrix, class_set,
                        accuracy_balanced, auc_weighted, positive_class ]

    var_names_to_save = ['dataset_paths', 'method_names', 'train_perc', 'num_repetitions', 'num_classes',
                        'pred_prob_per_class', 'pred_labels_per_rep_fs', 'test_labels_per_rep',
                        'best_min_leaf_size', 'best_num_predictors',
                        'feature_importances_rf', 'feature_names',
                        'num_times_misclfd', 'num_times_tested',
                        'confusion_matrix', 'class_set',
                        'accuracy_balanced', 'auc_weighted', 'positive_class' ]

    locals_var_dict = locals()
    dict_to_save = {var : locals_var_dict[var] for var in cfg.rhst_data_variables_to_persist}

    out_results_path = save_results(out_results_dir, dict_to_save)

    return out_results_path
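
A hypothetical invocation of run() is sketched below, assuming a text file that lists one saved MLDataset path per line; all paths and method names are illustrative:

# hypothetical usage sketch for run(); paths and method names are illustrative only
results_path = run(dataset_path_file='/tmp/np_out/datasetlist.thickness_curvature.txt',
                   method_names=['thickness', 'curvature'],
                   out_results_dir='/tmp/np_out/results',
                   train_perc=0.8,
                   num_repetitions=50)
print('CV results saved to: {}'.format(results_path))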