# Pre-allocate per-split outputs: one row per train/test split.
featureWeights_AI = np.empty([100, data_AI.shape[1] - 3])
# adding MSE quantification
mse_AI = np.empty([100, 1])

# Run the real predictions 100 times, so each subject can be randomly
# allocated to the testing third multiple times.
for split in range(100):
    # Train/test split; the split number doubles as the random seed, which
    # makes every split reproducible.
    (
        xtrain_AI,
        xtest_AI,
        ytrain_AI,
        ytest_AI,
        indices_train_AI,
        indices_test_AI,
    ) = train_test_split(
        Featvecs_AI,
        varofintAI,
        indices,
        test_size=0.33,
        random_state=split,
    )

    # Non-brain variables of the training sample, stacked as rows
    # (age, motion, variable of interest) ...
    df = np.array(
        [
            age[indices_train_AI],
            mot[indices_train_AI],
            varofintAI[indices_train_AI],
        ]
    )
    # ... then transposed so that subjects are rows.
    dft = df.T

    # Regress the covariates out of the variable of interest in the training
    # sample: spline term on column 0, linear term on column 1 (the linear
    # GAM still has a spline term).
    GAMFit = LinearGAM(s(0, n_splines=5) + l(1)).fit(dft, dft[:, 2])
    # The training target becomes the residuals of that fit.
    residsvec = GAMFit.deviance_residuals(dft[:, [0, 1, 2]], dft[:, 2])
    ytrain_AI = residsvec

    # Equivalent matrix for the testing sample; the covariate effects come
    # from the model fitted on the training sample only.
    df2 = np.array(
        [
            age[indices_test_AI],
            mot[indices_test_AI],
            varofintAI[indices_test_AI],
        ]
    )
    df2t = df2.T
    # Apply the training model to unseen data to get the test residuals,
    # which replace the raw test target.
    testResidsvec = GAMFit.deviance_residuals(df2t[:, [0, 1, 2]], df2t[:, 2])
    ytest_AI = testResidsvec

    # Ridge regression; the penalty is chosen by RidgeCV over `alphas`.
    lm_AI = sklearn.linear_model.RidgeCV(
        alphas=alphas, store_cv_values=True
    ).fit(xtrain_AI, ytrain_AI)
    # Best-performing alpha on the training set for this split.
    alpha_AI = lm_AI.alpha_
    # Save the regularization weighting for this split.
    all_preds_alphas[split, 0] = alpha_AI
def _take_top_indices(stat, count):
    """Greedily pick ``count`` arg-max positions from ``stat``.

    ``stat`` is modified in place: every picked entry is overwritten with -1
    so that the following ``argmax`` call moves on to the next entry (this
    assumes the statistics themselves are never below -1 -- TODO confirm
    against ``select_bcor_stat``).
    """
    picked = []
    for _ in range(count):
        best = np.argmax(stat)
        picked.append(best)
        stat[best] = -1
    return picked


def _gam_residuals(features, target):
    """Fit a default ``LinearGAM`` of ``target`` on ``features`` and return
    its deviance residuals evaluated on the same data."""
    gam = LinearGAM().fit(features, target)
    return gam.deviance_residuals(features, target)


def bcorsis(y, x, x_num, d="small", weight="constant", method="standard", dst_y=False, params=(5, 5), num_thread=0):
    """
    Generic non-parametric sure independence screening (SIS) procedure based on Ball Correlation.
    Ball correlation is a generic multivariate measure of dependence in Banach space.

    Parameters
    ----------
    y : numeric vector or matrix
        Response. Treated as a distance matrix when it has more than one
        dimension and ``dst_y`` is True.
    x : numeric matrix with n rows and p columns.
        Each row is an observation vector and each column corresponds to an
        explanatory variable, generally p >> n.
    x_num : array_like
        The array or list containing the dimension of each variable.
        The length of x_num is the number of variables.
    d : integer or {"small", "large"}, optional
        The hard cutoff rule suggests selecting d variables. Setting d = "large" or
        d = "small" means n - 1 or floor(n/log(n)) variables are selected.
        If d is an integer, d variables are selected. Default: d = "small".
    weight : {'constant', 'probability', 'chisquare'}, optional
        A character string used to choose the weight form of the Ball Covariance
        statistic. Default: weight = "constant".
    method : {'standard', 'interaction', 'lm', 'gam'}, optional
        Specific method for the BCor-SIS procedure (case-insensitive).
        "standard" performs the standard SIS procedure, while "lm" and "gam"
        carry out an iterative SIS procedure with ordinary linear regression and
        generalized additive models, respectively. "interaction" keeps the
        variables that are selected both from x and from the element-wise
        square of x. Default: method = "standard".
    dst_y : bool, optional
        If True (and y has more than one dimension), y is considered a
        distance matrix. Default: dst_y = False.
    params : array_like
        Two integers ``(d1, d2)``, only used when method = "lm" or "gam":
        d1 variables are selected in the initial screening step and d2 more
        in every following iteration.
    num_thread : integer
        Number of threads. If num_thread = 0, then all available cores
        will be used. Default: num_thread = 0.

    Returns
    -------
    list
        Positions (into ``x_num``) of the selected variables. For the
        iterative methods the list can be longer than ``d`` by up to
        ``d2 - 1`` entries, because whole batches of ``d2`` are appended;
        it can be shorter when the candidate variables run out.

    Notes
    -----
    bcorsis simultaneously computing Ball Correlation statistics with
    "constant", "probability", and "chisquare" weights.
    Users can get other Ball Correlation statistics with different weight.
    The "gam" method is slower than other methods when the number of variables is large.

    References
    ----------
    .. [1] Wenliang Pan, Xueqin Wang, Weinan Xiao & Hongtu Zhu (2018) A Generic Sure Independence Screening
       Procedure, Journal of the American Statistical Association
    .. [2] Jin Zhu, Wenliang Pan, Wei Zheng, and Xueqin Wang (2018). Ball: An R package for detecting
       distribution difference and association in metric spaces.

    Examples
    --------
    Illustrative only (``n``, ``p`` and ``x_num`` must be defined before the
    first snippet; the listed results are not verified here).

    >>> np.random.seed(10)
    >>> x = np.random.normal(0, 1, n*p).reshape((n, p))
    >>> error = np.random.normal(0, 1, n)
    >>> y = 3 * x[:, 1] * x[:, 5] * x[:, 10] + error
    >>> result = bcorsis(y, x, x_num, method = "interaction", d=10)
    [1, 10, 567, 5, 1661]

    >>> np.random.seed(1000)
    >>> n = 150
    >>> p = 3000
    >>> mean = np.zeros(p)
    >>> cov = np.array([0.5]*np.square(p)).reshape((p, p))
    >>> cov[np.diag_indices(3000)] = 1
    >>> x = np.random.multivariate_normal(mean, cov, n)
    >>> error = np.random.normal(0, 1, n)
    >>> y = 4*np.square(x[:, 2])+6*np.square(x[:, 1])+8*x[:, 3]-10*x[:,4]+error
    >>> x_num = np.ones(3000)
    >>> target = [4, 1, 924, 2, 692, 3, 400, 2241, 2839, 2194, 170]
    >>> result = bcorsis(y, x, x_num, method="lm", params = [5, 3], d = 11)
    [4, 1, 924, 2, 692, 3, 400, 2241, 2839, 2194, 170]
    """
    examine_None(x)
    examine_None(y)
    # x is handled as one flat vector laid out variable by variable
    # (all n values of the first column, then the second, ...).
    x = np.array(x).T.flatten()
    examine_x_num_arguments(x, x_num)
    examine_method_arguments(method)
    f_num = len(x_num)
    if isinstance(x_num, np.ndarray):
        x_num = x_num.astype(int).tolist()
    total_dim = int(np.sum(x_num))
    n = int(len(x) / total_dim)
    d = select_d_arguments(n, d)

    # Convert first so that list-like responses have a shape to inspect.
    y = np.asarray(y)
    # Equality (not truthiness) is kept on purpose so that non-boolean flags
    # such as the string "False" keep behaving as they did before.
    if y.ndim > 1 and dst_y == True:  # noqa: E712
        examine_distance_matrix(y)
        size = len(y)
        # Strict upper triangle of the distance matrix, row by row.
        y = [y[i][j] for i in range(size) for j in range(i + 1, size)]
    else:
        y = y.T.flatten()

    method = method.lower()
    if method == "standard":
        bcor_stat = select_bcor_stat(y, x, x_num, f_num, n, dst_y, num_thread, weight)
        return _take_top_indices(bcor_stat, d)

    if method == "interaction":
        bcor_stat = select_bcor_stat(y, x, x_num, f_num, n, dst_y, num_thread, weight)
        bcor_square_stat = select_bcor_stat(y, np.square(x), x_num, f_num, n, dst_y, num_thread, weight)
        ind1 = _take_top_indices(bcor_stat, d)
        ind2 = _take_top_indices(bcor_square_stat, d)
        # Keep, in the ranking order of x, the variables also ranked for x**2.
        shared = set(ind2)
        return [index for index in ind1 if index in shared]

    # Iterative SIS ("lm" or "gam"): screen d1 variables first, then repeatedly
    # remove the effect of the selected variables and screen d2 more.
    examine_params_arguments(params)
    d1 = params[0]
    d2 = params[1]
    bcor_stat = select_bcor_stat(y, x, x_num, f_num, n, dst_y, num_thread, weight)
    ind_have = _take_top_indices(bcor_stat, d1)
    ind_last = ind_have

    # Back to 2-D (observations in rows) for the regressions below. y stays
    # 2-D across iterations and is flattened only when handed to
    # select_bcor_stat, so multi-column responses survive every pass.
    x = x.reshape((total_dim, n)).T
    y = y.reshape((int(len(y) / n), n)).T
    already_selected = set(ind_have)
    ind_rest = [i for i in range(f_num) if i not in already_selected]

    use_lm = method == "lm"
    if use_lm:
        regr = linear_model.LinearRegression()

    # Stop as soon as enough variables are selected or none are left to pick.
    while len(ind_have) < d and ind_rest:
        ind1 = get_x_index(ind_have, x_num)
        ind2 = get_x_index(ind_last, x_num)
        ind3 = get_x_index(ind_rest, x_num)

        if use_lm:
            # Residuals of the remaining columns given all selected columns.
            regr.fit(x[:, ind1], x[:, ind3])
            x_resid = x[:, ind3] - regr.predict(x[:, ind1])
            # Residuals of the response given the most recently selected ones.
            regr.fit(x[:, ind2], y)
            y = y - regr.predict(x[:, ind2])
            x_new = np.array(x_resid).T.flatten()
        else:
            # One GAM per remaining column; the residual vectors are joined
            # end to end, matching the flat variable-by-variable layout.
            x_new = np.hstack([_gam_residuals(x[:, ind1], x[:, col]) for col in ind3])
            # One GAM per response column, each fitted on the current column.
            y = np.column_stack(
                [_gam_residuals(x[:, ind2], y[:, k]) for k in range(y.shape[1])]
            )

        temp_x_num = [x_num[i] for i in ind_rest]
        bcor_stat = select_bcor_stat(y.T.flatten(), x_new, temp_x_num, len(ind_rest), n, dst_y, num_thread,
                                     weight)
        # Never pick more variables than remain, which would repeat an index.
        step = min(d2, len(ind_rest))
        ind_last = [ind_rest[pos] for pos in _take_top_indices(bcor_stat, step)]
        ind_have = ind_have + ind_last
        just_selected = set(ind_last)
        ind_rest = [i for i in ind_rest if i not in just_selected]

    return ind_have