def test_sanity_checks():
    """Ensure that sanity checks are performed, and as expected."""

    ### -------------- as you add them to dataset --------------
    with raises(EmptyFeatureSetException):
        ds.add_samplet('empty_features', [], 'target')

    ### -------------- as you save them to disk --------------
    ds.add_samplet('all_zeros', np.zeros((ds.num_features, 1)), 'target')
    with raises(ConstantValuesException):
        ds.save(out_file)
    ds.del_samplet('all_zeros')

    # checking for a random constant value!
    const_value = np.random.randint(10, 100)
    const_feat_set = np.full((ds.num_features, 1), const_value)
    ds.add_samplet('all_constant', const_feat_set, 'target')
    with raises(ConstantValuesException):
        ds.save(out_file)

    # now checking for constants across samplets
    # this is easily achieved by adding different samplets with the same features
    # such a bug is possible when the user queries the wrong files
    # for a given samplet ID
    const_ds = ClfDataset()
    rand_feat_same_across_samplets = np.random.randn(10)
    for index in range(np.random.randint(10, 100)):
        const_ds.add_samplet(str(index), rand_feat_same_across_samplets, index)

    with raises(ConstantValuesException):
        const_ds.save(out_file)
def make_random_Dataset(max_num_classes=20,
                        max_class_size=50,
                        max_dim=100,
                        stratified=True):
    """Generates a random Dataset for use in testing."""

    smallest = 10
    max_class_size = max(smallest, max_class_size)
    largest = max(50, max_class_size)
    largest = max(smallest + 3, largest)

    if max_num_classes != 2:
        num_classes = np.random.randint(2, max_num_classes, 1)
    else:
        num_classes = 2

    if isinstance(num_classes, np.ndarray):
        num_classes = num_classes[0]

    if not stratified:
        # np.random.random_integers is deprecated/removed in recent numpy;
        # randint with an exclusive upper bound is the equivalent call
        class_sizes = np.random.randint(smallest, largest + 1, size=num_classes)
    else:
        class_sizes = np.repeat(np.random.randint(smallest, largest), num_classes)

    num_features = np.random.randint(min(3, max_dim), max(3, max_dim), 1)[0]
    feat_names = [str(x) for x in range(num_features)]

    class_ids = list()
    labels = list()
    for cl in range(num_classes):
        class_ids.append('class-{}'.format(cl))
        labels.append(int(cl))

    ds = ClfDataset()
    for cc, class_ in enumerate(class_ids):
        subids = ['sub{:03}-class{:03}'.format(ix, cc)
                  for ix in range(class_sizes[cc])]
        for sid in subids:
            ds.add_samplet(samplet_id=sid,
                           features=feat_generator(num_features),
                           target=class_,
                           feature_names=feat_names)

    return ds
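
# Illustrative sketch, not part of the original module: one way the generator
# above might be exercised. The helper name is hypothetical; only attributes
# already used elsewhere in this code (num_samplets, num_features) are assumed
# to exist on the returned dataset.
def _example_random_dataset_usage():
    rand_ds = make_random_Dataset(max_num_classes=4, max_class_size=20, max_dim=15)
    print('random dataset: {} samplets, {} features'
          ''.format(rand_ds.num_samplets, rand_ds.num_features))
    return rand_ds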
def make_fully_separable_classes(max_class_size=50, max_dim=100):
    """Makes a two-class dataset whose classes are easily separable (well-separated blobs)."""

    from sklearn.datasets import make_blobs

    random_center = np.random.rand(max_dim)
    cluster_std = 1.5
    centers = [random_center, random_center + cluster_std * 6]
    blobs_X, blobs_y = make_blobs(n_samples=max_class_size,
                                  n_features=max_dim,
                                  centers=centers,
                                  cluster_std=cluster_std)

    unique_labels = np.unique(blobs_y)
    class_ids = {lbl: str(lbl) for lbl in unique_labels}

    new_ds = ClfDataset()
    for index, row in enumerate(blobs_X):
        new_ds.add_samplet(samplet_id='sub{}'.format(index),
                           features=row,
                           # label=blobs_y[index],
                           target=class_ids[blobs_y[index]])

    return new_ds
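
# Sketch under stated assumptions (not in the original file): a quick check that
# blobs whose centers sit ~6 cluster_std apart, as constructed above, are in fact
# linearly separable; the helper name and the LinearSVC choice are illustrative.
def _check_blob_separability(max_dim=25):
    from sklearn.datasets import make_blobs
    from sklearn.svm import LinearSVC

    center = np.random.rand(max_dim)
    std = 1.5
    X, y = make_blobs(n_samples=60, n_features=max_dim,
                      centers=[center, center + std * 6], cluster_std=std)
    # a linear classifier should fit such well-separated blobs (nearly) perfectly
    return LinearSVC().fit(X, y).score(X, y)  # expected to be ~1.0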
def load_arff_dataset(ds_path):
    """Convenience utility to quickly load ARFF files into pyradigm format"""

    try:
        ds = ClassificationDataset.from_arff(ds_path)
    except Exception:
        try:
            ds = RegressionDataset.from_arff(ds_path)
        except Exception:
            try:
                ds = MLDataset(arff_path=ds_path)
            except Exception:
                raise TypeError('Error in loading the ARFF dataset @ path below!'
                                ' Ignoring {}'.format(ds_path))

    return ds
def load_dataset(ds_path):
    """Convenience utility to quickly load any type of pyradigm dataset"""

    try:
        ds = ClassificationDataset(dataset_path=ds_path)
    except Exception:
        try:
            ds = RegressionDataset(dataset_path=ds_path)
        except Exception:
            try:
                warn('MLDataset is deprecated. Switch to the latest pyradigm data '
                     'structures such as ClassificationDataset or '
                     'RegressionDataset as soon as possible.')
                ds = MLDataset(filepath=ds_path)
            except Exception:
                raise TypeError('Dataset class @ path below not recognized!'
                                ' Must be a valid instance of one of '
                                'ClassificationDataset or '
                                'RegressionDataset or MLDataset.\n'
                                ' Ignoring {}'.format(ds_path))

    return ds
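
# Sketch, not part of the original module: typical use of load_dataset(). The
# helper name and the path below are hypothetical; the num_samplets/num_features
# attributes follow the newer pyradigm API used elsewhere in this code.
def _example_load_any_dataset(path='/tmp/some_pyradigm_dataset.pkl'):
    ds = load_dataset(path)
    print('loaded {} samplets with {} features'
          ''.format(ds.num_samplets, ds.num_features))
    return ds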
def get_features(samplet_id_list, classes, featdir, outdir, outname,
                 get_method=None, feature_type='dir_of_dirs'):
    """
    Populates the pyradigm data structure with features from a given method.

    Parameters
    ----------
    samplet_id_list : list or ndarray
        List of subject IDs

    classes : dict
        dict of class labels keyed in by subject id

    featdir : str
        Path to input directory to read the features from

    outdir : str
        Path to output directory to save the gathered features to.

    outname : str
        Name of the feature set

    get_method : callable
        Callable that takes in a path and returns a vectorized feature set,
        e.g. a set of subcortical volumes, with an optional array of names
        for each feature.

    feature_type : str
        Identifier of data organization for features.

    Returns
    -------
    saved_path : str
        Path where the features have been saved to as a pyradigm dataset

    """

    if not callable(get_method):
        raise ValueError("Supplied get_method is not callable! "
                         "It must take in a path and "
                         "return a vectorized feature set and labels.")

    # generating a unique numeric label for each class
    # (sorted in order of their appearance in metadata file)
    class_set = set(classes.values())
    class_labels = dict()
    for idx, cls in enumerate(class_set):
        class_labels[cls] = idx

    ids_excluded = list()

    if feature_type == 'data_matrix':
        data_matrix = get_data_matrix(featdir)

    ds = ClassificationDataset()
    for samplet_id in samplet_id_list:
        try:
            if feature_type == 'data_matrix':
                data = data_matrix[samplet_id_list.index(samplet_id), :]
                feat_names = None
            else:
                data, feat_names = get_method(featdir, samplet_id)

            ds.add_samplet(samplet_id=samplet_id,
                           features=data,
                           target=classes[samplet_id],
                           feature_names=feat_names)
        except Exception:
            ids_excluded.append(samplet_id)
            traceback.print_exc()
            warnings.warn("Features for {} via {} method could not be read or "
                          "added. Excluding it.".format(samplet_id,
                                                        get_method.__name__))

    # warn if feature extraction failed for even one subject
    alert_failed_feature_extraction(len(ids_excluded),
                                    ds.num_samplets,
                                    len(samplet_id_list))

    # save the dataset to disk to enable passing on multiple dataset(s)
    saved_path = realpath(pjoin(outdir, outname))
    try:
        ds.save(saved_path)
    except IOError as ioe:
        print('Unable to save {} features to disk in folder:\n{}'
              ''.format(outname, outdir))
        raise ioe

    return saved_path
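
# Sketch of the get_method contract assumed by get_features() above (hypothetical
# example, not part of the original code): the callable receives the feature
# directory and a samplet/subject ID, and returns a 1-D feature vector plus an
# optional list of feature names. The file layout below is made up for illustration.
def example_get_method(featdir, samplet_id):
    """Reads one whitespace-separated vector per subject from featdir/<id>/features.txt."""
    feat_path = pjoin(featdir, samplet_id, 'features.txt')
    data = np.loadtxt(feat_path).ravel()
    feat_names = ['feat{}'.format(ix) for ix in range(data.size)]
    return data, feat_names

# e.g. (hypothetical arguments):
#   get_features(id_list, classes, featdir, outdir, 'volumes',
#                get_method=example_get_method)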
estimator = 'randomforestclassifier'  # 'svm' #
dr_method = 'isomap'  # 'selectkbest_f_classif' # 'variancethreshold' #
dr_size = 'tenth'
gs_level = 'none'  # 'light'

random.seed(42)  # to save time for local tests

covar_list = ('age', 'gender', 'dummy')
covar_types = ('age', 'gender', 'float')
covar_arg = ' '.join(['age', 'gender'])
deconf_method = 'residualize'

out_path1 = os.path.join(out_dir, 'random_clf_ds1.pkl')
out_path2 = os.path.join(out_dir, 'random_clf_ds2.pkl')

if pexists(out_path1) and pexists(out_path2):
    ds_one = ClassificationDataset(dataset_path=out_path1)
    ds_two = ClassificationDataset(dataset_path=out_path2)
else:
    ds_one = make_random_ClfDataset(max_num_classes=max_num_classes,
                                    stratified=True,
                                    max_class_size=max_class_size,
                                    max_dim=max_dim,
                                    min_num_classes=min_num_classes,
                                    attr_names=covar_list,
                                    attr_types=covar_types)
    ds_one.save(out_path1)

    ds_two = dataset_with_new_features_same_everything_else(ds_one, max_dim)
    ds_two.save(out_path2)

A = 0