def new_dataset_with_same_ids_classes(in_ds):

    feat_dim = np.random.randint(1, max_feat_dim)
    out_ds = MLDataset()
    for id_ in in_ds.keys:
        out_ds.add_sample(id_, np.random.rand(feat_dim),
                          class_id=in_ds.classes[id_], label=in_ds.labels[id_])
    return out_ds
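# Context sketch (assumed, not part of the original snippet): this helper
# relies on a module-level `max_feat_dim` and the usual imports; something
# like the following is presumed to exist in the enclosing test module.
import numpy as np
from pyradigm import MLDataset

max_feat_dim = 100  # assumed value; the original constant is defined elsewhere

# usage: given an existing MLDataset `in_ds`, build a copy with the same IDs
# and classes but random features of a random dimensionality:
# rand_ds = new_dataset_with_same_ids_classes(in_ds)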
def getfeatures(subjects,
                classes,
                featdir,
                outdir,
                outname,
                getmethod=None,
                feature_type='dir_of_dris'):
    """Populates the pyradigm data structure with features from a given method.

    getmethod: takes in a path and returns a vectorized feature set (e.g. set of subcortical volumes),
        with an optional array of names for each feature.
    classes: dict of class labels keyed in by subject id

    """

    assert callable(getmethod), "Supplied getmethod is not callable! " \
                                "It must take in a path and return a vectorized feature set and labels."

    # assign a unique numeric label to each class
    # (note: iterating a set() does not preserve the order of appearance in the metadata file)
    class_set = set(classes.values())
    class_labels = dict()
    for idx, cls in enumerate(class_set):
        class_labels[cls] = idx

    ids_excluded = list()

    if feature_type == 'data_matrix':
        data_matrix = get_data_matrix(featdir)

    ds = MLDataset()
    for subjid in subjects:
        try:
            if feature_type == 'data_matrix':
                data = data_matrix[subjects.index(subjid), :]
                feat_names = None
            else:
                data, feat_names = getmethod(featdir, subjid)

            ds.add_sample(subjid, data, class_labels[classes[subjid]],
                          classes[subjid], feat_names)
        except Exception:
            ids_excluded.append(subjid)
            traceback.print_exc()
            warnings.warn(
                "Features for {} via {} method could not be read or added. "
                "Excluding it.".format(subjid, getmethod.__name__))

    # warn if feature extraction failed for even one subject
    alert_failed_feature_extraction(len(ids_excluded), ds.num_samples,
                                    len(subjects))

    # save the dataset to disk so it can be passed around by path (e.g. when handling multiple datasets)
    savepath = os.path.join(outdir, outname)
    ds.save(savepath)

    return savepath
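# Illustration (hypothetical): a minimal `getmethod` that satisfies the
# contract described in the docstring -- it takes the feature directory and a
# subject ID, and returns a vectorized feature set along with optional feature
# names. The per-subject CSV layout below is an assumption for this sketch,
# not part of the original API.
def read_volumes_csv(featdir, subjid):
    """Reads a 1D feature vector for one subject from featdir/<subjid>/volumes.csv."""
    feat_path = os.path.join(featdir, subjid, 'volumes.csv')
    data = np.loadtxt(feat_path, delimiter=',')
    feat_names = ['feat{}'.format(ix) for ix in range(data.size)]
    return data, feat_names

# saved = getfeatures(subjects, classes, featdir, outdir, 'volumes',
#                     getmethod=read_volumes_csv)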
Example #4
def make_random_MLdataset(max_num_classes=20,
                          max_class_size=50,
                          max_dim=100,
                          stratified=True):
    "Generates a random MLDataset for use in testing."

    smallest = 10
    max_class_size = max(smallest, max_class_size)
    largest = max(50, max_class_size)
    largest = max(smallest + 3, largest)

    if max_num_classes != 2:
        num_classes = np.random.randint(2, max_num_classes)
    else:
        num_classes = 2

    if not stratified:
        # np.random.random_integers is deprecated; randint excludes the upper
        # bound, hence largest + 1
        class_sizes = np.random.randint(smallest, largest + 1,
                                        size=num_classes)
    else:
        class_sizes = np.repeat(np.random.randint(smallest, largest),
                                num_classes)

    num_features = np.random.randint(min(3, max_dim), max(3, max_dim), 1)[0]
    feat_names = [str(x) for x in range(num_features)]

    class_ids = list()
    labels = list()
    for cl in range(num_classes):
        class_ids.append('class-{}'.format(cl))
        labels.append(int(cl))

    ds = MLDataset()
    for cc, class_ in enumerate(class_ids):
        subids = [
            'sub{:03}-class{:03}'.format(ix, cc)
            for ix in range(class_sizes[cc])
        ]
        for sid in subids:
            # feat_generator is assumed to be defined in the enclosing test
            # module: any callable returning a feature vector of the given size
            ds.add_sample(sid, feat_generator(num_features), int(cc), class_,
                          feat_names)

    return ds
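# Usage sketch (commented out because `feat_generator` above must be supplied
# by the enclosing test module): generate a small stratified dataset and
# inspect it with attributes used elsewhere in these examples.
# rand_ds = make_random_MLdataset(max_num_classes=4, max_class_size=20, max_dim=10)
# print(rand_ds.num_samples)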
Example #5
def make_fully_separable_classes(max_class_size=10, max_dim=22):

    from sklearn.datasets import make_blobs

    random_center = np.random.rand(max_dim)
    cluster_std = 1.5
    centers = [random_center, random_center + cluster_std * 6]
    blobs_X, blobs_y = make_blobs(n_samples=max_class_size, n_features=max_dim,
                                  centers=centers, cluster_std=cluster_std)

    unique_labels = np.unique(blobs_y)
    class_ids = {lbl: str(lbl) for lbl in unique_labels}

    new_ds = MLDataset()
    for index, row in enumerate(blobs_X):
        new_ds.add_sample('sub{}'.format(index), row, label=blobs_y[index],
                          class_id=class_ids[blobs_y[index]])

    return new_ds
Example #6
def make_fully_separable_classes(max_class_size=50, max_dim=100):

    from sklearn.datasets import make_blobs

    random_center = np.random.rand(max_dim)
    cluster_std = 1.5
    centers = [random_center, random_center + cluster_std * 6]
    blobs_X, blobs_y = make_blobs(n_samples=max_class_size,
                                  n_features=max_dim,
                                  centers=centers,
                                  cluster_std=cluster_std)

    unique_labels = np.unique(blobs_y)
    class_ids = {lbl: str(lbl) for lbl in unique_labels}

    new_ds = MLDataset()
    for index, row in enumerate(blobs_X):
        new_ds.add_sample('sub{}'.format(index),
                          row,
                          label=blobs_y[index],
                          class_id=class_ids[blobs_y[index]])

    return new_ds
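# Usage sketch: the two blob centers are placed 6 cluster standard deviations
# apart, so the two classes are linearly separable by construction.
sep_ds = make_fully_separable_classes(max_class_size=50, max_dim=100)
print(sep_ds.num_samples)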
Example #7
def get_features(subjects,
                 classes,
                 featdir,
                 outdir,
                 outname,
                 get_method=None,
                 feature_type='dir_of_dris'):
    """
    Populates the pyradigm data structure with features from a given method.

    Parameters
    ----------
    subjects : list or ndarray
        List of subject IDs
    classes : dict
        dict of class labels keyed in by subject id
    featdir : str
        Path to input directory to read the features from
    outdir : str
        Path to output directory to save the gathered features to.
    outname : str
        Name of the feature set
    get_method : callable
        Callable that takes in a path and returns a vectorized feature set (e.g. set of subcortical volumes),
        with an optional array of names for each feature.
    feature_type : str
        Identifier of data organization for features.

    Returns
    -------
    saved_path : str
        Path where the features have been saved to as an MLDataset

    """

    if not callable(get_method):
        raise ValueError(
            "Supplied get_method is not callable! "
            "It must take in a path and return a vectorized feature set and labels."
        )

    # assign a unique numeric label to each class
    # (note: iterating a set() does not preserve the order of appearance in the metadata file)
    class_set = set(classes.values())
    class_labels = dict()
    for idx, cls in enumerate(class_set):
        class_labels[cls] = idx

    ids_excluded = list()

    if feature_type == 'data_matrix':
        data_matrix = get_data_matrix(featdir)

    ds = MLDataset()
    for subjid in subjects:
        try:
            if feature_type == 'data_matrix':
                data = data_matrix[subjects.index(subjid), :]
                feat_names = None
            else:
                data, feat_names = get_method(featdir, subjid)

            ds.add_sample(subjid, data, class_labels[classes[subjid]],
                          classes[subjid], feat_names)
        except Exception:
            ids_excluded.append(subjid)
            traceback.print_exc()
            warnings.warn(
                "Features for {} via {} method could not be read or added. "
                "Excluding it.".format(subjid, get_method.__name__))

    # warn if feature extraction failed for even one subject
    alert_failed_feature_extraction(len(ids_excluded), ds.num_samples,
                                    len(subjects))

    # save the dataset to disk so it can be passed around by path (e.g. when handling multiple datasets)
    saved_path = realpath(pjoin(outdir, outname))
    try:
        ds.save(saved_path)
    except IOError as ioe:
        print('Unable to save {} features to disk in folder:\n{}'.format(
            outname, outdir))
        raise ioe

    return saved_path
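# Follow-up sketch: the returned path can be handed to other processes and the
# dataset reloaded via the filepath constructor, the same usage shown in the
# examples further below. The helper name here is made up for this sketch.
def load_gathered_features(saved_path):
    """Reloads an MLDataset previously saved by get_features()."""
    return MLDataset(filepath=saved_path)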
Example #8
# Excerpt from a larger, nested collection loop: sample_ids, out_dir, expt_id,
# file_ext, atlas_rois, num_links_expected, numeric_labels, classes, dataset,
# and the incomplete_processing/comb_nan_values dicts are all defined in the
# enclosing (omitted) scope.
            for sample in sample_ids:
                feat_path = pjoin(out_dir, sample,
                                  '{}_{}'.format(expt_id, file_ext))
                if pexists(feat_path):
                    graph = nx.read_graphml(feat_path)
                    data = get_weights_order(graph, atlas_rois)
                    idx_nan = np.logical_not(np.isfinite(data))
                    local_flag_nan_exists = np.count_nonzero(idx_nan) > 0
                    if local_flag_nan_exists:
                        flag_nan_exists = True
                        comb_nan_values[base_feature][weight_method][
                            ds_name].append(sample)
                        # print('NaNs found for {} {} {}'.format(ds_name, weight_method, sample))
                    elif len(data) >= num_links_expected:
                        dataset.add_sample(sample,
                                           data,
                                           numeric_labels[classes[sample]],
                                           class_id=classes[sample])
                    else:
                        flag_unexpected = True
                        incomplete_processing[base_feature][weight_method][
                            ds_name].append(sample)
                else:
                    flag_incomplete = True
                    incomplete_processing[base_feature][weight_method][
                        ds_name].append(sample)
                    # print('processing incomplete for {} {} {}'.format(ds_name, weight_method, sample))

        if flag_nan_exists or flag_incomplete or flag_unexpected:
            pass
            # print('{:20} {:25} - processing unusable; totally skipping it.'.format(base_feature, weight_method))
        else:
            pass  # body truncated in the original excerpt
Example #9
out_dir = '.'

num_classes = np.random.randint(2, 50)
class_sizes = np.random.randint(10, 1000, num_classes)
num_features = np.random.randint(10, 500)

class_set = np.array(['C{:05d}'.format(x) for x in range(num_classes)])
feat_names = np.array([str(x) for x in range(num_features)])

test_dataset = MLDataset()
for class_index, class_id in enumerate(class_set):
    for sub_ix in range(class_sizes[class_index]):
        subj_id = '{}_S{:05d}'.format(class_set[class_index], sub_ix)
        feat = np.random.random(num_features)
        test_dataset.add_sample(subj_id, feat, class_index, class_id,
                                feat_names)

out_file = os.path.join(out_dir, 'random_example_dataset.pkl')
test_dataset.save(out_file)

test_dataset.description = 'test dataset'
print(test_dataset)
print('default format:\n {}'.format(test_dataset))
print('full repr     :\n {:full}'.format(test_dataset))
print('string/short  :\n {:s}'.format(test_dataset))

class_set, label_set, class_sizes = test_dataset.summarize_classes()

reloaded_dataset = MLDataset(filepath=out_file,
                             description='reloaded test_dataset')
Example #10
out_dir = '.'

num_classes = np.random.randint(2, 50)
class_sizes = np.random.randint(10, 100, num_classes)
num_features = np.random.randint(10, 100)
num_samples = sum(class_sizes)

class_set = np.array(['C{:05d}'.format(x) for x in range(num_classes)])
feat_names = np.array([str(x) for x in range(num_features)])

test_dataset = MLDataset()
for class_index, class_id in enumerate(class_set):
    for sub_ix in range(class_sizes[class_index]):
        subj_id = '{}_S{:05d}'.format(class_set[class_index], sub_ix)
        feat = np.random.random(num_features)
        test_dataset.add_sample(subj_id, feat, class_index, class_id,
                                feat_names)

out_file = os.path.join(out_dir, 'random_example_dataset.pkl')
test_dataset.save(out_file)

# same IDs, new features
same_ids_new_feat = MLDataset()
for sub_id in test_dataset.keys:
    feat = np.random.random(num_features)
    same_ids_new_feat.add_sample(sub_id, feat, test_dataset.labels[sub_id],
                                 test_dataset.classes[sub_id])

same_ids_new_feat.feature_names = np.array(
    ['new_f{}'.format(x) for x in range(num_features)])

test_dataset.description = 'test dataset'
Example #11
out_dir  = '.'

num_classes  = np.random.randint( 2, 50)
class_sizes  = np.random.randint(10, 100, num_classes)
num_features = np.random.randint(10, 100)

class_set    = np.array([ 'C{:05d}'.format(x) for x in range(num_classes)])
feat_names   = np.array([ str(x) for x in range(num_features) ])

test_dataset = MLDataset()
for class_index, class_id in enumerate(class_set):
    for sub_ix in range(class_sizes[class_index]):
        subj_id = '{}_S{:05d}'.format(class_set[class_index],sub_ix)
        feat = np.random.random(num_features)
        test_dataset.add_sample(subj_id, feat, class_index, class_id, feat_names)

out_file = os.path.join(out_dir,'random_example_dataset.pkl')
test_dataset.save(out_file)

# same IDs, new features
same_ids_new_feat = MLDataset()
for sub_id in test_dataset.keys:
    feat = np.random.random(num_features)
    same_ids_new_feat.add_sample(sub_id, feat,
                                 test_dataset.labels[sub_id],
                                 test_dataset.classes[sub_id])

same_ids_new_feat.feature_names = np.array(
    [ 'new_f{}'.format(x) for x in range(num_features) ])