Example #1
0
def bstrp_pvalue(coord_mat, x_design, y_design, cdesign, gstat, num_bstrp,
                 thres, area):
    """
    Smoothing individual function without preselected bandwidth.

    Args:
        coord_mat (matrix): common coordinate matrix (l*d)
        x_design (matrix): design matrix (n*p)
        y_design (matrix): shape data (response matrix, n*l*m, m=d in MFSDA)
        cdesign (matrix): linear constraint matrix (1*(p-1))
        gstat (scalar): global test statistic
        num_bstrp (scalar): number of bootstrap
        thres (scalar): thresholding for clustering
        area (scalar): area of the largest connected region
    """

    # under the null hypothesis
    x_design0 = x_design[:, np.nonzero(cdesign == 0)[0]]

    efit_beta0, efity_design0, h_opt0 = lpks(coord_mat, x_design0, y_design)
    resy_design0 = y_design - efity_design0
    efit_eta0, res_eta0, esig_eta0 = sif(coord_mat, resy_design0, h_opt0)

    # Bootstrap procedures
    gstatvec = np.zeros((1, num_bstrp))
    simlpval_area = np.zeros((1, num_bstrp))

    for gii in range(num_bstrp):
        simy_design = grs(efity_design0, efit_eta0, res_eta0)
        simefit_beta = lpks_pre_bw(coord_mat, x_design, simy_design, h_opt0)[0]
        sim_gstat, sim_lstat = wald_ht(x_design, simefit_beta, esig_eta0,
                                       cdesign)
        gstatvec[0, gii] = sim_gstat

        sim_lpval = 1 - stats.chi2.cdf(sim_lstat, simy_design.shape[2])
        sim_ind_thres = sim_lpval <= 10**(-thres)
        simlpval_area[0, gii] = np.sum(sim_ind_thres)

    k1 = np.mean(gstatvec)
    k2 = np.var(gstatvec)
    k3 = np.mean((gstatvec - k1)**3)
    a = k3 / (4 * k2)
    b = k1 - 2 * k2**2 / k3
    d = 8 * k2**3 / (k3**2)
    gpval = 1 - stats.chi2.cdf((gstat - b) / a, d)

    clu_pval = np.sum(simlpval_area >= area) / num_bstrp

    return gpval, clu_pval
Example #2
0
def run_stats(y_design, coord_mat, design_data, var_type):

    n, l, m = y_design.shape

    print("+++++++Construct the design matrix: normalization+++++++")
    x_design = read_x(design_data, var_type)
    p = x_design.shape[1]
    print("The dimension of design matrix is ", str(x_design.shape))
    """+++++++++++++++++++++++++++++++++++"""
    """Step 2. Statistical analysis: including (1) smoothing and (2) hypothesis testing"""

    print("+++++++Local linear kernel smoothing+++++++")
    start = timeit.default_timer()
    efit_beta, efity_design, h_opt = lpks(coord_mat, x_design, y_design)
    stop = timeit.default_timer()
    delta_time = str(stop - start)
    # print(h_opt)
    print("Elapsed time is " + delta_time)

    print(
        "+++++++Kernel smoothing (order = 1) for smooth functions (eta)+++++++"
    )
    start = timeit.default_timer()
    resy_design = y_design - efity_design
    print(np.amax(resy_design))
    print(np.amin(resy_design))
    efit_eta, res_eta, esig_eta = sif(coord_mat, resy_design, h_opt)
    print(np.amax(res_eta))
    print(np.amin(res_eta))
    stop = timeit.default_timer()
    delta_time = str(stop - start)
    print("Elapsed time is " + delta_time)

    print("+++++++Hypothesis testing+++++++")
    # hypothesis: beta_pj(d)=0 v.s. beta_pj(d)~=0 for all j and d
    start = timeit.default_timer()
    lpvals = np.zeros((l, p - 1))
    lpvals_fdr = np.zeros((l, p - 1))
    gpvals = np.zeros((1, p - 1))
    clu_pvals = np.zeros((1, p - 1))
    areas = np.zeros((1, p - 1))
    num_bstrp = 500  # number of bootstrap samples
    thres = 2

    for pp in range(p - 1):
        print("Testing whether the covariate " + str(pp + 1) +
              " is zero or not...")
        """ local and global statistics calculation """
        cdesign = np.zeros((1, p))
        cdesign[0, pp + 1] = 1
        gstat, lstat = wald_ht(x_design, efit_beta, esig_eta, cdesign)
        lpvals[:, pp] = 1 - np.squeeze(stats.chi2.cdf(lstat, m))
        lpvals_fdr[:, pp] = fdrcorrection0(lpvals[:, pp])[1]
        ind_thres = -np.log10(lpvals[:, pp]) >= thres
        area = np.sum(ind_thres)
        """ Generate random samples and calculate the corresponding statistics and pvalues """
        gpval, clu_pval = bstrp_pvalue(coord_mat, x_design, y_design, cdesign,
                                       gstat, num_bstrp, thres, area)

        gpvals[0, pp] = gpval
        areas[0, pp] = area
        clu_pvals[0, pp] = clu_pval
        print("the global p-value for covariate " + str(pp + 1) + " is " +
              str(gpvals[0, pp]) + "...")
        print("the p-value of most significant subregion for covariate " +
              str(pp + 1) + " is " + str(clu_pvals[0, pp]) + "...")

    stop = timeit.default_timer()
    delta_time = str(stop - start)
    print("Elapsed time is " + delta_time)

    return gpvals, lpvals_fdr, clu_pvals, efit_beta, efity_design, efit_eta
Example #3
0
def run_script(input_dir, output_dir):
    """
    Run the commandline script for MFSDA.

    Args:
        input_dir (str): full path to the data folder
        output_dir (str): full path to the output folder
    """

    """+++++++++++++++++++++++++++++++++++"""
    """Step 1. load dataset """
    print("loading data ......")
    print("+++++++Read the surface shape data+++++++")
    shape_file_name = input_dir + "aligned_shapes.mat"
    mat = loadmat(shape_file_name)
    y_design = mat['aligned_shape']
    n, l, m = y_design.shape
    print("The dimension of shape matrix is " + str(y_design.shape))
    print("+++++++Read the sphere coordinate data+++++++")
    template_file_name = input_dir + "template.mat"
    mat = loadmat(template_file_name)
    coord_mat = mat['template']
    # d = coord_mat.shape[1]
    print("+++++++Read the design matrix+++++++")
    design_data_file_name = input_dir + "design_data.txt"
    design_data = np.loadtxt(design_data_file_name)
    # read the covariate type
    var_type_file_name = input_dir + "var_type.txt"
    var_type = np.loadtxt(var_type_file_name)
    print("+++++++Construct the design matrix: normalization+++++++")
    x_design = read_x(design_data, var_type)
    p = x_design.shape[1]
    print("The dimension of design matrix is ", str(x_design.shape))

    """+++++++++++++++++++++++++++++++++++"""
    """Step 2. Statistical analysis: including (1) smoothing and (2) hypothesis testing"""

    print("+++++++Local linear kernel smoothing+++++++")
    start = timeit.default_timer()
    efit_beta, efity_design, h_opt = lpks(coord_mat, x_design, y_design)
    stop = timeit.default_timer()
    delta_time = str(stop - start)
    # print(h_opt)
    print("Elapsed time is " + delta_time)

    print("+++++++Kernel smoothing (order = 1) for smooth functions (eta)+++++++")
    start = timeit.default_timer()
    resy_design = y_design - efity_design
    print(np.amax(resy_design))
    print(np.amin(resy_design))
    efit_eta, res_eta, esig_eta = sif(coord_mat, resy_design, h_opt)
    print(np.amax(res_eta))
    print(np.amin(res_eta))
    stop = timeit.default_timer()
    delta_time = str(stop - start)
    print("Elapsed time is " + delta_time)

    print("+++++++Hypothesis testing+++++++")
    # hypothesis: beta_pj(d)=0 v.s. beta_pj(d)~=0 for all j and d
    start = timeit.default_timer()
    lpvals = np.zeros((l, p-1))
    lpvals_fdr = np.zeros((l, p-1))
    gpvals = np.zeros((1, p-1))
    clu_pvals = np.zeros((1, p-1))
    areas = np.zeros((1, p-1))
    num_bstrp = 500  # number of bootstrap samples
    thres = 2

    for pp in range(p-1):
        print("Testing whether the covariate " + str(pp+1) + " is zero or not...")
        """ local and global statistics calculation """
        cdesign = np.zeros((1, p))
        cdesign[0, pp+1] = 1
        gstat, lstat = wald_ht(x_design, efit_beta, esig_eta, cdesign)
        lpvals[:, pp] = 1 - np.squeeze(stats.chi2.cdf(lstat, m))
        lpvals_fdr[:, pp] = fdrcorrection0(lpvals[:, pp])[1]
        ind_thres = -np.log10(lpvals[:, pp]) >= thres
        area = np.sum(ind_thres)

        """ Generate random samples and calculate the corresponding statistics and pvalues """
        gpval, clu_pval = bstrp_pvalue(coord_mat, x_design, y_design, cdesign, gstat, num_bstrp, thres, area)

        gpvals[0, pp] = gpval
        areas[0, pp] = area
        clu_pvals[0, pp] = clu_pval
        print("the global p-value for covariate " + str(pp+1) + " is " + str(gpvals[0, pp]) + "...")
        print("the p-value of most significant subregion for covariate " +
              str(pp+1) + " is " + str(clu_pvals[0, pp]) + "...")

    stop = timeit.default_timer()
    delta_time = str(stop - start)
    print("Elapsed time is " + delta_time)

    """+++++++++++++++++++++++++++++++++++"""
    """Step3. Save all the results"""
    gpvals_file_name = output_dir + "global_pvalue.txt"
    np.savetxt(gpvals_file_name, gpvals)
    lpvals_fdr_file_name = output_dir + "local_pvalue_fdr.txt"
    np.savetxt(lpvals_fdr_file_name, lpvals_fdr)
    clu_pvals_file_name = output_dir + "cluster_pvalue.txt"
    np.savetxt(clu_pvals_file_name, clu_pvals)