def getfeatures(subjects,
                classes,
                featdir,
                outdir,
                outname,
                getmethod=None,
                feature_type='dir_of_dirs'):
    """Populates the pyradigm data structure with features from a given method.

    getmethod: takes in a path and a subject ID, and returns a vectorized feature set
        (e.g. a set of subcortical volumes), along with an optional array of names for
        each feature. A minimal sketch of such a callable follows this function.
    classes: dict of class labels keyed in by subject id

    """

    assert callable(getmethod), "Supplied getmethod is not callable! " \
                                "It must take in a path and return a vectorized feature set and feature names."

    # generate a unique numeric label for each class found in the metadata
    class_set = set(classes.values())
    class_labels = dict()
    for idx, cls in enumerate(class_set):
        class_labels[cls] = idx

    ids_excluded = list()

    if feature_type == 'data_matrix':
        data_matrix = get_data_matrix(featdir)

    ds = MLDataset()
    for subjid in subjects:
        try:
            if feature_type == 'data_matrix':
                data = data_matrix[subjects.index(subjid), :]
                feat_names = None
            else:
                data, feat_names = getmethod(featdir, subjid)

            ds.add_sample(subjid, data, class_labels[classes[subjid]],
                          classes[subjid], feat_names)
        except Exception:
            ids_excluded.append(subjid)
            traceback.print_exc()
            warnings.warn(
                "Features for {} via {} method could not be read or added. "
                "Excluding it.".format(subjid, getmethod.__name__))

    # warn if feature extraction failed for even a single subject
    alert_failed_feature_extraction(len(ids_excluded), ds.num_samples,
                                    len(subjects))

    # save the dataset to disk so that multiple datasets can be passed on
    savepath = os.path.join(outdir, outname)
    ds.save(savepath)

    return savepath
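# A minimal sketch of a callable usable as `getmethod` above, assuming features are
# stored as one plain-text file per subject; the directory layout, file name and
# function name below are hypothetical, not part of the original code.
import os
import numpy as np

def read_subcortical_volumes(featdir, subjid):
    """Returns (feature_vector, feature_names) for a single subject."""
    feat_file = os.path.join(featdir, subjid, 'subcortical_volumes.txt')  # assumed layout
    data = np.loadtxt(feat_file).ravel()
    feat_names = np.array(['volume_{}'.format(ix) for ix in range(data.size)])
    return data, feat_names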
Example #2
def process_arff(feature_path, subjects, classes, out_dir):
    """Processes the given dataset to return a clean name and path."""

    loaded_dataset = MLDataset(arff_path=feature_path)
    if len(loaded_dataset.description) > 1:
        method_name = loaded_dataset.description
    else:
        method_name = basename(feature_path)

    out_name = make_dataset_filename(method_name)
    out_path_cur_dataset = pjoin(out_dir, out_name)
    loaded_dataset.save(out_path_cur_dataset)

    if not saved_dataset_matches(loaded_dataset, subjects, classes):
        raise ValueError(
            'supplied ARFF dataset does not match the samples in the metadata.')

    return method_name, out_path_cur_dataset
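# Hedged usage sketch for process_arff; the ARFF path and output folder are
# placeholders, and `subjects`/`classes` are assumed to come from the metadata:
# method_name, out_path = process_arff('/path/to/features.arff',
#                                      subjects, classes, '/path/to/out_dir')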
Example #3
def import_datasets(method_list,
                    out_dir,
                    subjects,
                    classes,
                    feature_path,
                    feature_type='dir_of_dirs'):
    """
    Imports all the specified feature sets and organizes them into datasets.

    Parameters
    ----------
    method_list : list of callables
        Set of predefined methods returning a vector of features for a given sample id and location
    out_dir : str
        Path to the output folder

    subjects : list of str
        List of sample ids
    classes : dict
        Dict identifying the class for each sample id in the dataset.
    feature_path : list of str
        List of paths to the root directory containing the features (pre- or user-defined).
        Must be of the same length as method_list.
    feature_type : str
        String identifying the organization of the feature set.
        Choices = ('dir_of_dirs', 'data_matrix')

    Returns
    -------
    method_names : list of str
        List of method names used for annotation.
    dataset_paths_file : str
        Path to the file containing paths to imported feature sets.

    """
    def clean_str(string):
        # replace common separator characters with spaces and collapse repeated whitespace
        for sep in '_-:':
            string = string.replace(sep, ' ')
        return ' '.join(string.split())

    method_names = list()
    outpath_list = list()
    for mm, cur_method in enumerate(method_list):
        if cur_method in [get_dir_of_dirs]:
            method_name = basename(feature_path[mm])

        elif cur_method in [get_data_matrix]:
            method_name = os.path.splitext(basename(feature_path[mm]))[0]

        elif cur_method in [get_pyradigm]:

            if feature_type in ['pyradigm']:
                loaded_dataset = MLDataset(filepath=feature_path[mm])
            else:
                raise ValueError('Invalid state of the program!')

            if len(loaded_dataset.description) > 1:
                method_name = loaded_dataset.description
            else:
                method_name = basename(feature_path[mm])

            method_names.append(clean_str(method_name))
            if saved_dataset_matches(loaded_dataset, subjects, classes):
                outpath_list.append(feature_path[mm])
                continue
            else:
                raise ValueError(
                    'supplied pyradigm dataset does not match the samples in the metadata.'
                )

        elif cur_method in [get_arff]:

            loaded_dataset = MLDataset(arff_path=feature_path[mm])
            if len(loaded_dataset.description) > 1:
                method_name = loaded_dataset.description
            else:
                method_name = basename(feature_path[mm])

            method_names.append(clean_str(method_name))
            out_name = make_dataset_filename(method_name)
            outpath_dataset = pjoin(out_dir, out_name)
            loaded_dataset.save(outpath_dataset)
            outpath_list.append(outpath_dataset)
            continue
        else:
            # adding an index for an even more unique identification
            # method_name = '{}_{}'.format(cur_method.__name__,mm)
            method_name = cur_method.__name__

        method_names.append(clean_str(method_name))
        out_name = make_dataset_filename(method_name)

        outpath_dataset = pjoin(out_dir, out_name)
        if not saved_dataset_matches(outpath_dataset, subjects, classes):
            # noinspection PyTypeChecker
            outpath_dataset = get_features(subjects, classes, feature_path[mm],
                                           out_dir, out_name, cur_method,
                                           feature_type)

        outpath_list.append(outpath_dataset)

    combined_name = uniq_combined_name(method_names)

    dataset_paths_file = pjoin(out_dir,
                               'datasetlist.' + combined_name + '.txt')
    with open(dataset_paths_file, 'w') as dpf:
        dpf.write('\n'.join(outpath_list))

    return method_names, dataset_paths_file
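# Hedged usage sketch for import_datasets, importing a single pre-saved pyradigm
# dataset; all paths below are illustrative placeholders:
# method_names, dataset_paths_file = import_datasets(
#         method_list=[get_pyradigm],
#         out_dir='/path/to/out_dir',
#         subjects=subjects,
#         classes=classes,
#         feature_path=['/path/to/existing.MLDataset.pkl'],
#         feature_type='pyradigm')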
Example #4
def get_features(subjects,
                 classes,
                 featdir,
                 outdir,
                 outname,
                 get_method=None,
                 feature_type='dir_of_dirs'):
    """
    Populates the pyradigm data structure with features from a given method.

    Parameters
    ----------
    subjects : list or ndarray
        List of subject IDs
    classes : dict
        dict of class labels keyed in by subject id
    featdir : str
        Path to input directory to read the features from
    outdir : str
        Path to output directory to save the gathered features to.
    outname : str
        Name of the feature set
    get_method : callable
        Callable that takes in a path and a subject ID, and returns a vectorized feature set
        (e.g. a set of subcortical volumes), along with an optional array of names for each feature.
    feature_type : str
        Identifier of data organization for features.

    Returns
    -------
    saved_path : str
        Path where the features have been saved to as an MLDataset

    """

    if not callable(get_method):
        raise ValueError(
            "Supplied get_method is not callable! "
            "It must take in a path and return a vectorized feature set and labels."
        )

    # generate a unique numeric label for each class found in the metadata
    class_set = set(classes.values())
    class_labels = dict()
    for idx, cls in enumerate(class_set):
        class_labels[cls] = idx

    ids_excluded = list()

    if feature_type == 'data_matrix':
        data_matrix = get_data_matrix(featdir)

    ds = MLDataset()
    for subjid in subjects:
        try:
            if feature_type == 'data_matrix':
                data = data_matrix[subjects.index(subjid), :]
                feat_names = None
            else:
                data, feat_names = get_method(featdir, subjid)

            ds.add_sample(subjid, data, class_labels[classes[subjid]],
                          classes[subjid], feat_names)
        except Exception:
            ids_excluded.append(subjid)
            traceback.print_exc()
            warnings.warn(
                "Features for {} via {} method could not be read or added. "
                "Excluding it.".format(subjid, get_method.__name__))

    # warn if feature extraction failed for even a single subject
    alert_failed_feature_extraction(len(ids_excluded), ds.num_samples,
                                    len(subjects))

    # save the dataset to disk so that multiple datasets can be passed on
    saved_path = realpath(pjoin(outdir, outname))
    try:
        ds.save(saved_path)
    except IOError as ioe:
        print('Unable to save {} features to disk in folder:\n{}'.format(
            outname, outdir))
        raise ioe

    return saved_path
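# Hedged usage sketch for get_features, reusing the illustrative
# read_subcortical_volumes callable sketched earlier; all paths are placeholders:
# saved_path = get_features(subjects, classes,
#                           featdir='/path/to/feature_root',
#                           outdir='/path/to/out_dir',
#                           outname='subcortical.MLDataset.pkl',
#                           get_method=read_subcortical_volumes,
#                           feature_type='dir_of_dirs')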
Example #5
                    flag_incomplete = True
                    incomplete_processing[base_feature][weight_method][
                        ds_name].append(sample)
                    # print('processing incomplete for {} {} {}'.format(ds_name, weight_method, sample))

        if flag_nan_exists or flag_incomplete or flag_unexpected:
            pass
            # print('{:20} {:25} - processing unusable; totally skipping it.'.format(base_feature, weight_method))
        else:
            print('{:20} {:5} {:25} - fully usable.'.format(
                base_feature, ds_name, weight_method))
            dataset.description = '{}_{}'.format(base_feature, weight_method)
            out_path = pjoin(
                out_dir, '{}_{}.MLDataset.pkl'.format(base_feature,
                                                      weight_method))
            dataset.save(out_path)

    # saving
    with open(pjoin(out_dir, 'incomplete_unusable_processing.pkl'),
              'wb') as ipf:
        pickle.dump([incomplete_processing, comb_nan_values], ipf)

# reading
with open(pjoin(out_dir, 'incomplete_unusable_processing.pkl'), 'rb') as ipf:
    incomplete_processing, comb_nan_values = pickle.load(ipf)

# results
for base_feature in base_feature_list:
    for ds_name in dataset_list:
        for weight_method in histogram_dist:
            print('{:20} {:5} {:25} {:5} {:5}'.format(
Example #6
    for base_feature in features_freesurfer:

        id_list, classes = get_metadata(meta_file)
        class_set = list(set(classes.values()))
        class_set.sort()
        labels = {sub: class_set.index(cls) for sub, cls in classes.items()}

        out_path = pjoin(
            vis_out_dir,
            'raw_features_{}_{}.MLDataset.pkl'.format(base_feature,
                                                      '_'.join(class_set)))

        try:
            ds = MLDataset(filepath=out_path)
        except Exception:
            traceback.print_exc()
            id_data = import_features(freesurfer_dir,
                                      id_list,
                                      base_feature,
                                      atlas=atlas,
                                      fwhm=fwhm)
            ds = MLDataset(data=id_data, labels=labels, classes=classes)
            ds.save(out_path)

        data, lbl, ids = ds.data_and_labels()
        print('{} {}\n min : {:.4f}\n max : {:.4f}'.format(
            dataset_name, base_feature, np.min(data), np.max(data)))
        for perc in [1, 5, 95, 99]:
            print('{:3d}% : {:10.4f}'.format(perc, np.percentile(data, perc)))
Example #7
class_sizes = np.random.randint(10, 1000, num_classes)
num_features = np.random.randint(10, 500)

class_set = np.array(['C{:05d}'.format(x) for x in range(num_classes)])
feat_names = np.array([str(x) for x in range(num_features)])

test_dataset = MLDataset()
for class_index, class_id in enumerate(class_set):
    for sub_ix in range(class_sizes[class_index]):
        subj_id = '{}_S{:05d}'.format(class_set[class_index], sub_ix)
        feat = np.random.random(num_features)
        test_dataset.add_sample(subj_id, feat, class_index, class_id,
                                feat_names)

out_file = os.path.join(out_dir, 'random_example_dataset.pkl')
test_dataset.save(out_file)

test_dataset.description = 'test dataset'
print(test_dataset)
print('default format:\n {}'.format(test_dataset))
print('full repr     :\n {:full}'.format(test_dataset))
print('string/short  :\n {:s}'.format(test_dataset))

class_set, label_set, class_sizes = test_dataset.summarize_classes()

reloaded_dataset = MLDataset(filepath=out_file,
                             description='reloaded test_dataset')

copy_dataset = MLDataset(in_dataset=test_dataset)

rand_index = np.random.randint(0, len(class_set), 1)[0]
Example #8
num_classes  = np.random.randint( 2, 50)
class_sizes  = np.random.randint(10, 100, num_classes)
num_features = np.random.randint(10, 100)

class_set    = np.array([ 'C{:05d}'.format(x) for x in range(num_classes)])
feat_names   = np.array([ str(x) for x in range(num_features) ])

test_dataset = MLDataset()
for class_index, class_id in enumerate(class_set):
    for sub_ix in range(class_sizes[class_index]):
        subj_id = '{}_S{:05d}'.format(class_set[class_index], sub_ix)
        feat = np.random.random(num_features)
        test_dataset.add_sample(subj_id, feat, class_index, class_id, feat_names)

out_file = os.path.join(out_dir, 'random_example_dataset.pkl')
test_dataset.save(out_file)

# same IDs, new features
same_ids_new_feat = MLDataset()
for sub_id in test_dataset.keys:
    feat = np.random.random(num_features)
    same_ids_new_feat.add_sample(sub_id, feat,
                                 test_dataset.labels[sub_id],
                                 test_dataset.classes[sub_id])

same_ids_new_feat.feature_names = np.array(['new_f{}'.format(x)
                                            for x in range(num_features)])

test_dataset.description = 'test dataset'
print(test_dataset)
print('default format:\n {}'.format(test_dataset))