# Common imports assumed by the snippets below (module-level in their original files);
# helpers such as get_data_matrix, alert_failed_feature_extraction, get_weights_order
# and feat_generator are project-internal and assumed to be defined/importable elsewhere.
import os
import traceback
import warnings
from os.path import join as pjoin, exists as pexists, realpath

import networkx as nx
import numpy as np

from pyradigm import MLDataset


def new_dataset_with_same_ids_classes(in_ds):
    """Returns a new MLDataset with the same sample IDs, classes and labels as in_ds,
    but with freshly generated random features."""

    # max_feat_dim is assumed to be a module-level constant in the original test file
    feat_dim = np.random.randint(1, max_feat_dim)
    out_ds = MLDataset()
    for id_ in in_ds.keys:
        out_ds.add_sample(id_, np.random.rand(feat_dim),
                          class_id=in_ds.classes[id_],
                          label=in_ds.labels[id_])

    return out_ds
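# Minimal usage sketch (illustrative only; the sample IDs, class names and the value
# of max_feat_dim below are placeholders, not part of the original code):
max_feat_dim = 10
ds = MLDataset()
ds.add_sample('sub01', np.random.rand(5), 0, class_id='control')
ds.add_sample('sub02', np.random.rand(5), 1, class_id='patient')

new_ds = new_dataset_with_same_ids_classes(ds)
assert set(new_ds.keys) == set(ds.keys)
assert all(new_ds.classes[sid] == ds.classes[sid] for sid in ds.keys)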
def getfeatures(subjects, classes, featdir, outdir, outname,
                getmethod=None, feature_type='dir_of_dris'):
    """Populates the pyradigm data structure with features from a given method.

    getmethod : callable that takes in a path and returns a vectorized feature set
        (e.g. a set of subcortical volumes), with an optional array of names for each feature.
    classes : dict of class labels keyed in by subject id
    """

    assert callable(getmethod), "Supplied getmethod is not callable! " \
                                "It must take in a path and return a vectorized feature set and labels."

    # generating a unique numeric label for each class
    # (sorted in the order of their appearance in the metadata file)
    class_set = set(classes.values())
    class_labels = dict()
    for idx, cls in enumerate(class_set):
        class_labels[cls] = idx

    ids_excluded = list()

    if feature_type == 'data_matrix':
        data_matrix = get_data_matrix(featdir)

    ds = MLDataset()
    for subjid in subjects:
        try:
            if feature_type == 'data_matrix':
                data = data_matrix[subjects.index(subjid), :]
                feat_names = None
            else:
                data, feat_names = getmethod(featdir, subjid)

            ds.add_sample(subjid, data, class_labels[classes[subjid]],
                          classes[subjid], feat_names)
        except Exception:
            ids_excluded.append(subjid)
            traceback.print_exc()
            warnings.warn("Features for {} via {} method could not be read or added. "
                          "Excluding it.".format(subjid, getmethod.__name__))

    # warn if feature extraction failed for even a single subject
    alert_failed_feature_extraction(len(ids_excluded), ds.num_samples, len(subjects))

    # save the dataset to disk to enable passing on multiple dataset(s)
    savepath = os.path.join(outdir, outname)
    ds.save(savepath)

    return savepath
def make_random_MLdataset(max_num_classes=20, max_class_size=50,
                          max_dim=100, stratified=True):
    "Generates a random MLDataset for use in testing."

    smallest = 10
    max_class_size = max(smallest, max_class_size)
    largest = max(50, max_class_size)
    largest = max(smallest + 3, largest)

    if max_num_classes != 2:
        # draw a scalar directly instead of a size-1 array
        num_classes = np.random.randint(2, max_num_classes)
    else:
        num_classes = 2

    if not stratified:
        # np.random.random_integers is deprecated; randint with an inclusive upper bound
        class_sizes = np.random.randint(smallest, largest + 1, size=num_classes)
    else:
        class_sizes = np.repeat(np.random.randint(smallest, largest), num_classes)

    num_features = np.random.randint(min(3, max_dim), max(3, max_dim))
    feat_names = [str(x) for x in range(num_features)]

    class_ids = list()
    labels = list()
    for cl in range(num_classes):
        class_ids.append('class-{}'.format(cl))
        labels.append(int(cl))

    ds = MLDataset()
    for cc, class_ in enumerate(class_ids):
        subids = ['sub{:03}-class{:03}'.format(ix, cc)
                  for ix in range(class_sizes[cc])]
        for sid in subids:
            # feat_generator is assumed to be a module-level helper returning a feature vector
            ds.add_sample(sid, feat_generator(num_features), int(cc), class_, feat_names)

    return ds
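# Usage sketch for make_random_MLdataset (illustrative only; feat_generator is a
# stand-in defined here just so the call above is runnable):
feat_generator = np.random.rand

rand_ds = make_random_MLdataset(max_num_classes=4, max_class_size=20, max_dim=10)
print(rand_ds)
class_set, label_set, class_sizes = rand_ds.summarize_classes()
print('total {} samples across {} classes'.format(rand_ds.num_samples, len(class_set)))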
def make_fully_separable_classes(max_class_size=10, max_dim=22):

    from sklearn.datasets import make_blobs

    random_center = np.random.rand(max_dim)
    cluster_std = 1.5
    centers = [random_center, random_center + cluster_std * 6]
    blobs_X, blobs_y = make_blobs(n_samples=max_class_size, n_features=max_dim,
                                  centers=centers, cluster_std=cluster_std)

    unique_labels = np.unique(blobs_y)
    class_ids = {lbl: str(lbl) for lbl in unique_labels}

    new_ds = MLDataset()
    for index, row in enumerate(blobs_X):
        new_ds.add_sample('sub{}'.format(index), row,
                          label=blobs_y[index],
                          class_id=class_ids[blobs_y[index]])

    return new_ds
def make_fully_separable_classes(max_class_size=50, max_dim=100):

    from sklearn.datasets import make_blobs

    random_center = np.random.rand(max_dim)
    cluster_std = 1.5
    centers = [random_center, random_center + cluster_std * 6]
    blobs_X, blobs_y = make_blobs(n_samples=max_class_size, n_features=max_dim,
                                  centers=centers, cluster_std=cluster_std)

    unique_labels = np.unique(blobs_y)
    class_ids = {lbl: str(lbl) for lbl in unique_labels}

    new_ds = MLDataset()
    for index, row in enumerate(blobs_X):
        new_ds.add_sample('sub{}'.format(index), row,
                          label=blobs_y[index],
                          class_id=class_ids[blobs_y[index]])

    return new_ds
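# Quick usage sketch (illustrative only): the two blobs above are centred 6 cluster
# standard deviations apart, so a linear classifier should separate them (near-)perfectly.
# Accessing per-sample features via the `data` property is an assumption about the
# MLDataset API, not something shown elsewhere in this file.
from sklearn.linear_model import LogisticRegression

sep_ds = make_fully_separable_classes(max_class_size=50, max_dim=10)
X = np.array([sep_ds.data[sid] for sid in sep_ds.keys])
y = np.array([sep_ds.labels[sid] for sid in sep_ds.keys])

clf = LogisticRegression().fit(X, y)
print('training accuracy on separable blobs: {:.2f}'.format(clf.score(X, y)))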
def get_features(subjects, classes, featdir, outdir, outname,
                 get_method=None, feature_type='dir_of_dris'):
    """
    Populates the pyradigm data structure with features from a given method.

    Parameters
    ----------
    subjects : list or ndarray
        List of subject IDs
    classes : dict
        Dict of class labels keyed in by subject id
    featdir : str
        Path to the input directory to read the features from
    outdir : str
        Path to the output directory to save the gathered features to.
    outname : str
        Name of the feature set
    get_method : callable
        Callable that takes in a path and returns a vectorized feature set
        (e.g. a set of subcortical volumes), with an optional array of names for each feature.
    feature_type : str
        Identifier of data organization for features.

    Returns
    -------
    saved_path : str
        Path where the features have been saved to as an MLDataset

    """

    if not callable(get_method):
        raise ValueError("Supplied get_method is not callable! "
                         "It must take in a path and return a vectorized feature set and labels.")

    # generating a unique numeric label for each class
    # (sorted in the order of their appearance in the metadata file)
    class_set = set(classes.values())
    class_labels = dict()
    for idx, cls in enumerate(class_set):
        class_labels[cls] = idx

    ids_excluded = list()

    if feature_type == 'data_matrix':
        data_matrix = get_data_matrix(featdir)

    ds = MLDataset()
    for subjid in subjects:
        try:
            if feature_type == 'data_matrix':
                data = data_matrix[subjects.index(subjid), :]
                feat_names = None
            else:
                data, feat_names = get_method(featdir, subjid)

            ds.add_sample(subjid, data, class_labels[classes[subjid]],
                          classes[subjid], feat_names)
        except Exception:
            ids_excluded.append(subjid)
            traceback.print_exc()
            warnings.warn("Features for {} via {} method could not be read or added. "
                          "Excluding it.".format(subjid, get_method.__name__))

    # warn if feature extraction failed for even a single subject
    alert_failed_feature_extraction(len(ids_excluded), ds.num_samples, len(subjects))

    # save the dataset to disk to enable passing on multiple dataset(s)
    saved_path = realpath(pjoin(outdir, outname))
    try:
        ds.save(saved_path)
    except IOError as ioe:
        print('Unable to save {} features to disk in folder:\n{}'.format(outname, outdir))
        raise ioe

    return saved_path
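# Usage sketch for get_features (illustrative only; read_volumes, the subject IDs and
# the paths below are hypothetical placeholders, not part of the original code):
def read_volumes(featdir, subjid):
    """Hypothetical get_method: reads one subject's feature vector from a text file."""
    data = np.loadtxt(pjoin(featdir, subjid, 'volumes.txt'))
    return data, None  # no feature names available

subjects = ['sub01', 'sub02']
classes = {'sub01': 'control', 'sub02': 'patient'}
saved_path = get_features(subjects, classes,
                          featdir='/path/to/features',
                          outdir='/path/to/output',
                          outname='volumes.MLDataset.pkl',
                          get_method=read_volumes)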
for sample in sample_ids:
    feat_path = pjoin(out_dir, sample, '{}_{}'.format(expt_id, file_ext))
    if pexists(feat_path):
        graph = nx.read_graphml(feat_path)
        data = get_weights_order(graph, atlas_rois)
        idx_nan = np.logical_not(np.isfinite(data))
        local_flag_nan_exists = np.count_nonzero(idx_nan) > 0
        if local_flag_nan_exists:
            flag_nan_exists = True
            comb_nan_values[base_feature][weight_method][ds_name].append(sample)
            # print('NaNs found for {} {} {}'.format(ds_name, weight_method, sample))
        elif len(data) >= num_links_expected:
            dataset.add_sample(sample, data,
                               numeric_labels[classes[sample]],
                               class_id=classes[sample])
        else:
            flag_unexpected = True
            incomplete_processing[base_feature][weight_method][ds_name].append(sample)
    else:
        flag_incomplete = True
        incomplete_processing[base_feature][weight_method][ds_name].append(sample)
        # print('processing incomplete for {} {} {}'.format(ds_name, weight_method, sample))

if flag_nan_exists or flag_incomplete or flag_unexpected:
    pass
    # print('{:20} {:25} - processing unusable; totally skipping it.'.format(base_feature, weight_method))
else:
out_dir = '.'

num_classes = np.random.randint(2, 50)
class_sizes = np.random.randint(10, 1000, num_classes)
num_features = np.random.randint(10, 500)

class_set = np.array(['C{:05d}'.format(x) for x in range(num_classes)])
feat_names = np.array([str(x) for x in range(num_features)])

test_dataset = MLDataset()
for class_index, class_id in enumerate(class_set):
    for sub_ix in range(class_sizes[class_index]):
        subj_id = '{}_S{:05d}'.format(class_set[class_index], sub_ix)
        feat = np.random.random(num_features)
        test_dataset.add_sample(subj_id, feat, class_index, class_id, feat_names)

out_file = os.path.join(out_dir, 'random_example_dataset.pkl')
test_dataset.save(out_file)

test_dataset.description = 'test dataset'

print(test_dataset)
print('default format:\n {}'.format(test_dataset))
print('full repr     :\n {:full}'.format(test_dataset))
print('string/short  :\n {:s}'.format(test_dataset))

class_set, label_set, class_sizes = test_dataset.summarize_classes()

reloaded_dataset = MLDataset(filepath=out_file,
                             description='reloaded test_dataset')
out_dir = '.'

num_classes = np.random.randint(2, 50)
class_sizes = np.random.randint(10, 100, num_classes)
num_features = np.random.randint(10, 100)
num_samples = sum(class_sizes)

class_set = np.array(['C{:05d}'.format(x) for x in range(num_classes)])
feat_names = np.array([str(x) for x in range(num_features)])

test_dataset = MLDataset()
for class_index, class_id in enumerate(class_set):
    for sub_ix in range(class_sizes[class_index]):
        subj_id = '{}_S{:05d}'.format(class_set[class_index], sub_ix)
        feat = np.random.random(num_features)
        test_dataset.add_sample(subj_id, feat, class_index, class_id, feat_names)

out_file = os.path.join(out_dir, 'random_example_dataset.pkl')
test_dataset.save(out_file)

# same IDs, new features
same_ids_new_feat = MLDataset()
for sub_id in test_dataset.keys:
    feat = np.random.random(num_features)
    same_ids_new_feat.add_sample(sub_id, feat,
                                 test_dataset.labels[sub_id],
                                 test_dataset.classes[sub_id])

same_ids_new_feat.feature_names = np.array(['new_f{}'.format(x)
                                            for x in range(num_features)])

test_dataset.description = 'test dataset'
out_dir = '.'

num_classes = np.random.randint(2, 50)
class_sizes = np.random.randint(10, 100, num_classes)
num_features = np.random.randint(10, 100)

class_set = np.array(['C{:05d}'.format(x) for x in range(num_classes)])
feat_names = np.array([str(x) for x in range(num_features)])

test_dataset = MLDataset()
for class_index, class_id in enumerate(class_set):
    for sub_ix in range(class_sizes[class_index]):
        subj_id = '{}_S{:05d}'.format(class_set[class_index], sub_ix)
        feat = np.random.random(num_features)
        test_dataset.add_sample(subj_id, feat, class_index, class_id, feat_names)

out_file = os.path.join(out_dir, 'random_example_dataset.pkl')
test_dataset.save(out_file)

# same IDs, new features
same_ids_new_feat = MLDataset()
for sub_id in test_dataset.keys:
    feat = np.random.random(num_features)
    same_ids_new_feat.add_sample(sub_id, feat,
                                 test_dataset.labels[sub_id],
                                 test_dataset.classes[sub_id])

same_ids_new_feat.feature_names = np.array(['new_f{}'.format(x)
                                            for x in range(num_features)])