def _fp(self, X):
    """Cluster the features of X and average the members of each cluster.

    Parameters
    ----------
    X : 2D array-like (n_sample, n_feature)
        The data to decompose

    Return
    ------
    Xc - 2D array-like (n_sample, n_clusters)
        One column per unique cluster label, holding the row-wise mean
        of the columns of X that received that label.
    """

    n_samples = X.shape[0]

    # The estimator is fit on the transpose, so it is the columns
    # (features) of X that get labelled.
    labels = self.estimator.fit_predict(X.transpose())
    ordered_labels = sort_nanfirst(unique_nan(labels))
    n_clusters = len(ordered_labels)

    # Start from zeros, then fill one column per cluster label.
    Xc = np.zeros((n_samples, n_clusters))
    for col, label in enumerate(ordered_labels):
        members = X[:, label == labels]
        Xc[:, col] = members.mean(1)

    assert checkX(Xc)
    assert Xc.shape[0] == X.shape[0], "After transform wrong row number"
    assert Xc.shape[1] == n_clusters, "Afer transform wrong col number"

    return Xc
def _ft(self, X):
    """Decompose X with the wrapped estimator's ``fit_transform``.

    Parameters
    ----------
    X : 2D array-like (n_sample, n_feature)
        The data to decompose

    Return
    ------
    Xc - 2D array-like (n_sample, n_components)
        Whatever ``self.estimator.fit_transform(X)`` returned, after
        sanity checks on its shape.
    """
    import numbers

    Xc = self.estimator.fit_transform(X)

    assert checkX(Xc)
    assert Xc.shape[0] == X.shape[0], "After transform wrong row number"

    # The n_components attr is optional, and when it exists it is not
    # always a column count (scikit-learn estimators also accept None,
    # a float fraction or a string). Only an integer value is treated
    # as an upper bound on the number of columns.
    n_components = getattr(self.estimator, "n_components", None)
    is_count = (isinstance(n_components, numbers.Integral)
                and not isinstance(n_components, bool))
    if is_count:
        assert Xc.shape[1] <= n_components, "Too many components"

    return Xc
def eva(X, y, trial_index, window, tr):
    """Average trials for each feature in X

    Parameters
    ----------
    X : 2D array-like (n_sample, n_feature)
        The data to decompose
    y : 1D array, None by default
        Sample labels for the data.  In y, np.nan and 'nan' values
        are ignored.
    trial_index : 1D array (n_sample, )
        Each unique entry should match a trial.
    window : int
        Trial length
    tr : unused
        Accepted but never read by this function.

    Return
    ------
    Xeva : a 2D array (window, n_feature*unique_y)
        The average trials, one column per (feature, label) pair
    eva_names : 1D array
        The names of the columns of Xeva (taken from y)
    """

    evas = []
    eva_names = []
    scaler = MinMaxScaler(feature_range=(0, 1))
    for j in range(X.shape[1]):
        xj = X[:, j][:, np.newaxis]  ## Need 2D

        # Each feature into trials, rescale too.
        Xtrial, feature_names = by_trial(xj, trial_index, window, y)

        # The builtin float replaces the np.float alias, which newer
        # NumPy releases no longer provide.
        Xtrial = scaler.fit_transform(Xtrial.astype(float))

        unique_fn = sorted(np.unique(feature_names))
        unique_fn = unique_sorted_with_nan(unique_fn)

        # Split the trials again, this time by unique label.
        Xlabels, _ = by_labels(X=Xtrial.transpose(), y=feature_names)

        # Average the trials of each label, then record the names.
        evas.extend([Xl.transpose().mean(axis=1) for Xl in Xlabels])
        eva_names.extend(unique_fn)

    # Reshape : (window, len(unique_y)*n_features)
    Xeva = np.vstack(evas).transpose()
    eva_names = np.asarray(eva_names)

    assert checkX(Xeva)
    assert Xeva.shape[0] == window, "After EVA rows not equal to window"
    # unique_fn holds the labels found for the last feature processed.
    assert Xeva.shape[1] == len(unique_fn) * X.shape[1], (
        "After EVA wrong number of features")
    assert eva_names.shape[0] == Xeva.shape[1], (
        "eva_names and Xeva don't match")

    return Xeva, eva_names
def load_nii(nifiti, clean=True, sparse=False, smooth=False, **kwargs):
    """Convert the nifti-1 file into a 2D array (n_sample x n_features).

    Parameters
    ----------
    nifiti - str
        The name of the data to load
    clean - boolean (True)
        Remove invariant features?  If used n_features will not match
        n_voxels in the original nifti-1 file.  This operation is not
        reversible.  If you clean there is probably little point in
        converting to a sparse representation.
    sparse - boolean (False)
        Use the (CSC) sparse format (True)?
    smooth - boolean (False)
        High/low pass filter the data?
    [, ...] - Optional parameters for smooth
        (defaults: tr=1.5, ub=0.06, lb=0.001)

    Return
    ------
    X - 2D array (n_sample x n_features)
        The BOLD data
    """

    # Data is 4d (x,y,z,t) we want 2d, where each column is
    # a voxel and each row is the temporal (t) data
    # i.e. the intermediate shape is (x*y*z, t), transposed below.
    nii = nb.nifti1.load(nifiti)

    numt = nii.shape[3]
    numxyz = nii.shape[0] * nii.shape[1] * nii.shape[2]
    dims = (numxyz, numt)

    # Reshape to 2d (n_feature, n_sample), then flip to
    # (n_sample, n_feature).
    X = nii.get_data().astype('int16').reshape(dims).transpose()
    if clean:
        X = remove_invariant_features(X, sparse=False)

    if smooth:
        # Each smoothing parameter falls back to its own default.
        # "lb" must set the lower bound; it used to overwrite ub.
        tr = kwargs.get("tr", 1.5)
        ub = kwargs.get("ub", 0.06)
        lb = kwargs.get("lb", 0.001)
        X = smoothfn(X, tr=tr, ub=ub, lb=lb)

    assert checkX(X)

    if sparse:
        X = csc_matrix(X)

    return X
def correlateX(X, y, corr="spearman"):
    """Correlate every column of X with y (some set of dummy coded labels).

    Parameters
    ----------
    X - a 2d column oriented array of features
    y - a 1d array of labels
    corr - name of correlation function: 'pearson' or 'spearman'

    Returns
    -------
    corrs - a 1d array of correlations, one per column of X
    ps - a 1d array of p-values, one per column of X

    Raises
    ------
    ValueError - if corr is neither 'pearson' nor 'spearman'

    Note
    ----
    Correlations are calculated using either Pearson's r (which assumes
    Gaussian errors) or Spearman's rho (a rank-based non-parametric
    method).
    """

    # Force both inputs to arrays, just in case.
    X = np.array(X)
    y = np.array(y)
    checkX(X)

    if corr not in ("pearson", "spearman"):
        raise ValueError("stat was not valid.")
    corrf = pearsonr if corr == "pearson" else spearmanr

    coefficients = []
    pvalues = []
    for col in range(X.shape[1]):
        coefficient, pvalue = corrf(X[:, col], y)
        coefficients.append(coefficient)
        pvalues.append(pvalue)

    return np.array(coefficients), np.array(pvalues)
def correlateX(X, y, corr="spearman"):
    """Correlate every column of X with y (some set of dummy coded labels).

    Parameters
    ----------
    X - a 2d column oriented array of features
    y - a 1d array of labels
    corr - name of correlation function: 'pearson' or 'spearman'

    Returns
    -------
    corrs - a 1d array of correlations, one per column of X
    ps - a 1d array of p-values, one per column of X

    Raises
    ------
    ValueError - if corr is neither 'pearson' nor 'spearman'

    Note
    ----
    Correlations are calculated using either Pearson's r (which assumes
    Gaussian errors) or Spearman's rho (a rank-based non-parametric
    method).
    """

    # Force both inputs to arrays, just in case.
    X = np.array(X)
    y = np.array(y)
    checkX(X)

    if corr not in ("pearson", "spearman"):
        raise ValueError("stat was not valid.")
    corrf = pearsonr if corr == "pearson" else spearmanr

    coefficients = []
    pvalues = []
    for col in range(X.shape[1]):
        coefficient, pvalue = corrf(X[:, col], y)
        coefficients.append(coefficient)
        pvalues.append(pvalue)

    return np.array(coefficients), np.array(pvalues)
def by_trial(X, trial_index, window, y):
    """Reshapes X so each trial is a feature.

    Parameters
    ----------
    X : 2D array (n_sample, n_feature)
        The data to split into trials
    trial_index : 1D array (n_sample, )
        Each unique, non-nan entry marks one trial
    window : int
        Number of leading samples kept from every trial
    y : 1D array or None
        Sample labels; the label of a trial is the label of its first
        sample.  When y is None every trial is labelled 0.

    Return
    ------
    Xtrial : 2D array (window, n_kept_trials * n_feature)
        The kept trials stacked side by side
    feature_names : 1D array
        The trial label of every column in Xtrial

    Note
    ----
    In y, np.nan and 'nan' values are ignored.
    """

    ncol = X.shape[1]

    # ----
    # Remove short trials from X.
    locations = locate_short_trials(trial_index, window)
    if len(locations) > 0:
        # Accumulate the mask without modifying `locations` itself.
        short_mask = np.zeros(trial_index.shape, dtype=bool)
        for loc in locations:
            short_mask |= (loc == trial_index)

        keep_mask = np.logical_not(short_mask)
        X = X[keep_mask, ]
        trial_index = trial_index[keep_mask]

    # ----
    # Find all the trials, skipping nan entries of trial_index.
    trial_masks = []
    for trial in np.unique(trial_index):
        if np.isnan(trial):
            continue
        trial_masks.append(trial == trial_index)

    # And split up X
    Xlist = []
    feature_names = []
    for mask in trial_masks:
        y0 = 0
        if y is not None:
            y0 = y[mask][0]

        # Comparing the text form catches both np.nan and the string
        # 'nan'.  The builtin str replaces the np.str alias, which
        # newer NumPy releases no longer provide.
        if str(y0) != 'nan':
            Xlist.append(X[mask, ][0:window, ])
            feature_names.append(np.repeat(y0, ncol))

    feature_names = np.hstack(feature_names)

    # Create Xtrial by horizontal stacking
    Xtrial = np.hstack(Xlist)

    # Sanity
    assert checkX(Xtrial)
    assert Xtrial.shape[1] == feature_names.shape[0], (
        "After reshape Xtrial and feature_names don't match")
    assert Xtrial.shape[0] == window, (
        "Number of samples in Xtrial doesn't match window")

    return Xtrial, feature_names
def by_trial(X, trial_index, window, y):
    """Reshapes X so each trial is a feature.

    Parameters
    ----------
    X : 2D array (n_sample, n_feature)
        The data to split into trials
    trial_index : 1D array (n_sample, )
        Each unique, non-nan entry marks one trial
    window : int
        Number of leading samples kept from every trial
    y : 1D array or None
        Sample labels; the label of a trial is the label of its first
        sample.  When y is None every trial is labelled 0.

    Return
    ------
    Xtrial : 2D array (window, n_kept_trials * n_feature)
        The kept trials stacked side by side
    feature_names : 1D array
        The trial label of every column in Xtrial

    Note
    ----
    In y, np.nan and 'nan' values are ignored.
    """

    ncol = X.shape[1]

    # ----
    # Remove short trials from X.
    locations = locate_short_trials(trial_index, window)
    if len(locations) > 0:
        # Accumulate the mask without modifying `locations` itself.
        short_mask = np.zeros(trial_index.shape, dtype=bool)
        for loc in locations:
            short_mask |= (loc == trial_index)

        keep_mask = np.logical_not(short_mask)
        X = X[keep_mask, ]
        trial_index = trial_index[keep_mask]

    # ----
    # Find all the trials, skipping nan entries of trial_index.
    trial_masks = []
    for trial in np.unique(trial_index):
        if np.isnan(trial):
            continue
        trial_masks.append(trial == trial_index)

    # And split up X
    Xlist = []
    feature_names = []
    for mask in trial_masks:
        y0 = 0
        if y is not None:
            y0 = y[mask][0]

        # Comparing the text form catches both np.nan and the string
        # 'nan'.  The builtin str replaces the np.str alias, which
        # newer NumPy releases no longer provide.
        if str(y0) != 'nan':
            Xlist.append(X[mask, ][0:window, ])
            feature_names.append(np.repeat(y0, ncol))

    feature_names = np.hstack(feature_names)

    # Create Xtrial by horizontal stacking
    Xtrial = np.hstack(Xlist)

    # Sanity
    assert checkX(Xtrial)
    assert Xtrial.shape[1] == feature_names.shape[0], (
        "After reshape Xtrial and feature_names don't match")
    assert Xtrial.shape[0] == window, (
        "Number of samples in Xtrial doesn't match window")

    return Xtrial, feature_names
def restack(X, feature_names):
    """Reshape X into a stack of matrices based on feature names,
    one new 'layer' for each unique (sorted) entry in feature_names.

    Parameters
    ----------
    X : 2D array-like (n_sample, n_feature)
        The data to restack
    feature_names : 1D array-like (n_feature, )
        The name of every column in X

    Return
    ------
    Xstack : 2D array
        The layers stacked vertically, in the order of
        np.unique(feature_names)
    fn_stack : 1D array
        The feature name of every row in Xstack

    Raises
    ------
    ValueError - if X and feature_names disagree on n_feature

    Note
    ----
    In order to ensure the layers are stackable, if the number of
    cols is off zeros are added as pad wherever needed.
    """

    X = np.array(X)
    feature_names = np.array(feature_names)

    unique_names = np.unique(feature_names)
    nrow = X.shape[0]

    if X.shape[1] != feature_names.shape[0]:
        raise ValueError(
            "Number of features in X doesn't match feature_names.")

    # Init the reshaped X (Xstack) and the feature
    # mask, then loop over the rest
    mask = unique_names[0] == feature_names
    assert mask.shape[0] == feature_names.shape[0], (
        "The mask was the wrong shape")
    # NOTE(review): '> 1' rejects a name owning a single column even
    # though the message says "empty" - confirm which is intended.
    assert np.sum(mask) > 1, ("The mask was empty")

    Xstack = X[:, mask]
    for name in unique_names[1:]:
        mask = name == feature_names
        assert np.sum(mask) > 1, ("The mask was empty")

        Xname = X[:, mask]

        # Pad whichever side is narrower so both have the same width.
        diff = Xstack.shape[1] - Xname.shape[1]
        if diff < 0:
            Xstack = _addcol(Xstack, np.abs(diff))
        elif diff > 0:
            # The padded layer must replace Xname; it used to be
            # stored under an unused name, so vstack saw the
            # unpadded layer.
            Xname = _addcol(Xname, np.abs(diff))

        Xstack = np.vstack([Xstack, Xname])

    # Every layer contributes nrow rows carrying that layer's name.
    fn_stack = []
    for name in unique_names:
        fn_stack.extend([name, ] * nrow)
    fn_stack = np.array(fn_stack)

    assert checkX(Xstack)
    assert fn_stack.shape[0] == Xstack.shape[0], (
        "After stacking X and feature_names did not match.")

    return Xstack, fn_stack
def decompose_tcdf(tcdf):
    """Decompose tcdf into its parts - X, cond, dataname, index.

    Parameters
    ----------
    tcdf : DataFrame-like
        Must have "index", "cond" and "dataname" columns; every other
        column is treated as data.

    Return
    ------
    X : 2D array
        The columns of tcdf left after the three named columns are
        dropped
    cond, dataname, index : 1D arrays
        The contents of the matching columns
    """

    index = np.array(tcdf["index"].tolist())
    cond = np.array(tcdf["cond"].tolist())
    dataname = np.array(tcdf["dataname"].tolist())

    tcdf = tcdf.drop(labels=["index", "cond", "dataname"], axis=1)

    # .values replaces as_matrix(), which newer pandas releases no
    # longer provide.
    X = np.array(tcdf.values)

    assert checkX(X)

    return X, cond, dataname, index
def restack(X, feature_names):
    """Reshape X into a stack of matrices based on feature names,
    one new 'layer' for each unique (sorted) entry in feature_names.

    Parameters
    ----------
    X : 2D array-like (n_sample, n_feature)
        The data to restack
    feature_names : 1D array-like (n_feature, )
        The name of every column in X

    Return
    ------
    Xstack : 2D array
        The layers stacked vertically, in the order of
        np.unique(feature_names)
    fn_stack : 1D array
        The feature name of every row in Xstack

    Raises
    ------
    ValueError - if X and feature_names disagree on n_feature

    Note
    ----
    In order to ensure the layers are stackable, if the number of
    cols is off zeros are added as pad wherever needed.
    """

    X = np.array(X)
    feature_names = np.array(feature_names)

    unique_names = np.unique(feature_names)
    nrow = X.shape[0]

    if X.shape[1] != feature_names.shape[0]:
        raise ValueError(
            "Number of features in X doesn't match feature_names.")

    # Init the reshaped X (Xstack) and the feature
    # mask, then loop over the rest
    mask = unique_names[0] == feature_names
    assert mask.shape[0] == feature_names.shape[0], (
        "The mask was the wrong shape")
    # NOTE(review): '> 1' rejects a name owning a single column even
    # though the message says "empty" - confirm which is intended.
    assert np.sum(mask) > 1, ("The mask was empty")

    Xstack = X[:, mask]
    for name in unique_names[1:]:
        mask = name == feature_names
        assert np.sum(mask) > 1, ("The mask was empty")

        Xname = X[:, mask]

        # Pad whichever side is narrower so both have the same width.
        diff = Xstack.shape[1] - Xname.shape[1]
        if diff < 0:
            Xstack = _addcol(Xstack, np.abs(diff))
        elif diff > 0:
            # The padded layer must replace Xname; it used to be
            # stored under an unused name, so vstack saw the
            # unpadded layer.
            Xname = _addcol(Xname, np.abs(diff))

        Xstack = np.vstack([Xstack, Xname])

    # Every layer contributes nrow rows carrying that layer's name.
    fn_stack = []
    for name in unique_names:
        fn_stack.extend([name, ] * nrow)
    fn_stack = np.array(fn_stack)

    assert checkX(Xstack)
    assert fn_stack.shape[0] == Xstack.shape[0], (
        "After stacking X and feature_names did not match.")

    return Xstack, fn_stack
def eva(X, y, trial_index, window, tr):
    """Average trials for each feature in X

    Parameters
    ----------
    X : 2D array-like (n_sample, n_feature)
        The data to decompose
    y : 1D array, None by default
        Sample labels for the data.  In y, np.nan and 'nan' values
        are ignored.
    trial_index : 1D array (n_sample, )
        Each unique entry should match a trial.
    window : int
        Trial length
    tr : unused
        Accepted but never read by this function.

    Return
    ------
    Xeva : a 2D array (window, n_feature*unique_y)
        The average trials, one column per (feature, label) pair
    eva_names : 1D array
        The names of the columns of Xeva (taken from y)
    """

    evas = []
    eva_names = []
    scaler = MinMaxScaler(feature_range=(0, 1))
    for j in range(X.shape[1]):
        xj = X[:, j][:, np.newaxis]  ## Need 2D

        # Each feature into trials, rescale too.
        Xtrial, feature_names = by_trial(xj, trial_index, window, y)

        # The builtin float replaces the np.float alias, which newer
        # NumPy releases no longer provide.
        Xtrial = scaler.fit_transform(Xtrial.astype(float))

        unique_fn = sorted(np.unique(feature_names))
        unique_fn = unique_sorted_with_nan(unique_fn)

        # Split the trials again, this time by unique label.
        Xlabels, _ = by_labels(X=Xtrial.transpose(), y=feature_names)

        # Average the trials of each label, then record the names.
        evas.extend([Xl.transpose().mean(axis=1) for Xl in Xlabels])
        eva_names.extend(unique_fn)

    # Reshape : (window, len(unique_y)*n_features)
    Xeva = np.vstack(evas).transpose()
    eva_names = np.asarray(eva_names)

    assert checkX(Xeva)
    assert Xeva.shape[0] == window, "After EVA rows not equal to window"
    # unique_fn holds the labels found for the last feature processed.
    assert Xeva.shape[1] == len(unique_fn) * X.shape[1], (
        "After EVA wrong number of features")
    assert eva_names.shape[0] == Xeva.shape[1], (
        "eva_names and Xeva don't match")

    return Xeva, eva_names
def fir(X, y, trial_index, window, tr):
    """ Average trials for each feature in X, using Burock's
    (2000) method.

    Parameters
    ----------
    X : 2D array-like (n_sample, n_feature)
        The data to decompose
    y : 1D array, None by default
        Sample labels for the data.  In y, np.nan and 'nan' values
        are treated as baseline labels.
    trial_index : 1D array (n_sample, )
        Each unique entry should match a trial.  Not read by this
        function.
    window : int
        Trial length
    tr : unused
        Accepted but never read by this function.

    Return
    ------
    Xfir : a 2D array (window, n_feature*(unique_y - 1))
        The average trials; the first (baseline) label is dropped
    fir_names : 1D array
        The names of the columns of Xfir (taken from y)
    """

    # Norm then pad.  The builtin float replaces the np.float alias,
    # which newer NumPy releases no longer provide.
    scaler = MinMaxScaler(feature_range=(0, 1))
    X = scaler.fit_transform(X.astype(float))
    X = np.vstack([X, np.ones((window, X.shape[1]), dtype=float)])

    # Save the org y names
    ynames = sorted(np.unique(y))
    ynames = unique_sorted_with_nan(ynames)

    # y becomes integers
    y = create_y(y)

    # Make the design matrix.
    dm = _create_dm(y, window)
    dm = np.matrix(dm)

    # The least-squares operator depends only on dm, so it is built
    # once here instead of once per feature.
    estimator = np.linalg.pinv(dm.T * dm) * dm.T

    # FIR!
    fir_names = []
    firs = []
    for j in range(X.shape[1]):
        x = np.matrix(X[:, j])
        coefs = np.array(estimator * x.T)[0:-1]  ## Drop dummy
        coefs = coefs.reshape(len(ynames) - 1, window)

        firs.append(coefs)
        fir_names.extend(ynames[1:])  ## Drop nan/baseline

    Xfir = np.vstack(firs).transpose()
    fir_names = np.asarray(fir_names)

    assert checkX(Xfir)
    assert Xfir.shape[0] == window, "After FIR rows not equal to window"
    assert Xfir.shape[1] == (len(ynames[1:]) * X.shape[1]), (
        "After FIR wrong number of features")
    assert fir_names.shape[0] == Xfir.shape[1], (
        "fir_names and Xfir don't match")

    return Xfir, fir_names
range(*[int(i) for i in args.train_data.split(':')]), range(*[int(i) for i in args.train_window.split(':')]), args.train_labels, args.train_trial_tr ) Xtest, ytest = _data( args.test, range(*[int(i) for i in args.test_data.split(':')]), range(*[int(i) for i in args.test_window.split(':')]), args.test_labels, args.test_trial_tr ) X = np.vstack([Xtrain, Xtest]) y = np.concatenate([ytrain, ytest]) cvcode = np.asarray([0]*ytrain.shape[0] + [1]*ytest.shape[0]) assert checkX(X) assert X.shape[0] == y.shape[0], "X and y length mismatch" assert X.shape[0] == cvcode.shape[0], "X and cvcode length mismatch" # CV cv = StratifiedKFold(cvcode, n_folds=2, indices=True) # Classifier if args.clf == "RandomForestClassifier": clf = RandomForestClassifier( n_estimators=500, max_features=None ) elif args.clf == "GradientBoostingClassifier": clf = GradientBoostingClassifier( n_estimators=100, learning_rate=1.0, max_depth=1, random_state=prng
def fir(X, y, trial_index, window, tr):
    """ Average trials for each feature in X, using Burock's
    (2000) method.

    Parameters
    ----------
    X : 2D array-like (n_sample, n_feature)
        The data to decompose
    y : 1D array, None by default
        Sample labels for the data.  In y, np.nan and 'nan' values
        are treated as baseline labels.
    trial_index : 1D array (n_sample, )
        Each unique entry should match a trial.  Not read by this
        function.
    window : int
        Trial length
    tr : unused
        Accepted but never read by this function.

    Return
    ------
    Xfir : a 2D array (window, n_feature*(unique_y - 1))
        The average trials; the first (baseline) label is dropped
    fir_names : 1D array
        The names of the columns of Xfir (taken from y)
    """

    # Norm then pad.  The builtin float replaces the np.float alias,
    # which newer NumPy releases no longer provide.
    scaler = MinMaxScaler(feature_range=(0, 1))
    X = scaler.fit_transform(X.astype(float))
    X = np.vstack([X, np.ones((window, X.shape[1]), dtype=float)])

    # Save the org y names
    ynames = sorted(np.unique(y))
    ynames = unique_sorted_with_nan(ynames)

    # y becomes integers
    y = create_y(y)

    # Make the design matrix.
    dm = _create_dm(y, window)
    dm = np.matrix(dm)

    # The least-squares operator depends only on dm, so it is built
    # once here instead of once per feature.
    estimator = np.linalg.pinv(dm.T * dm) * dm.T

    # FIR!
    fir_names = []
    firs = []
    for j in range(X.shape[1]):
        x = np.matrix(X[:, j])
        coefs = np.array(estimator * x.T)[0:-1]  ## Drop dummy
        coefs = coefs.reshape(len(ynames) - 1, window)

        firs.append(coefs)
        fir_names.extend(ynames[1:])  ## Drop nan/baseline

    Xfir = np.vstack(firs).transpose()
    fir_names = np.asarray(fir_names)

    assert checkX(Xfir)
    assert Xfir.shape[0] == window, "After FIR rows not equal to window"
    assert Xfir.shape[1] == (len(ynames[1:]) * X.shape[1]), (
        "After FIR wrong number of features")
    assert fir_names.shape[0] == Xfir.shape[1], (
        "fir_names and Xfir don't match")

    return Xfir, fir_names