Beispiel #1
0
def _remma_epiDD_eff(y, xmat, zmat, gmat_lst, var_com, bed_file, snp_lst_0=None, var_app=1.0, p_cut=1.0e-5, out_file='epiDD_eff'):
    """
    Estimate dominance by dominance epistasis effects by random SNP-BLUP model.
    :param y: phenotypic vector
    :param xmat: Designed matrix for fixed effect
    :param zmat: csr sparse matrix. Designed matrix for random effect.
    :param gmat_lst: A list for relationship matrix
    :param var_com: Estimated variances
    :param bed_file: the prefix for plink binary file
    :param snp_lst_0: the first SNP list for the SNP pairs. the min value is 0 and the max value is num_snp-2. The
    default value is None, which means list [0, num_snp-1)
    :param var_app: the approximate variances for estimated SNP effects.
    :param p_cut: put cut value. default value is 1.0e-5.
    :param out_file: output file. default value is 'remma_epiDD_eff'.
    :return: 0
    """
    logging.info("Calculate the phenotypic covariance matrix and inversion")
    y = np.array(y).reshape(-1, 1)
    n = y.shape[0]
    xmat = np.array(xmat).reshape(n, -1)
    vmat = np.diag([var_com[-1]] * n)
    for val in range(len(gmat_lst)):
        vmat += zmat.dot((zmat.dot(gmat_lst[val])).T) * var_com[val]
    del gmat_lst
    gc.collect()
    vmat_inv = np.linalg.inv(vmat)
    logging.info("Calculate P matrix")
    vxmat = np.dot(vmat_inv, xmat)
    xvxmat = np.dot(xmat.T, vxmat)
    xvxmat = np.linalg.inv(xvxmat)
    pmat = reduce(np.dot, [vxmat, xvxmat, vxmat.T])
    pmat = vmat_inv - pmat
    pymat = zmat.T.dot(np.dot(pmat, y))
    del vmat, vmat_inv, pmat
    gc.collect()
    num_snp = pd.read_csv(bed_file+'.bim', header=None).shape[0]
    num_id = pd.read_csv(bed_file+'.fam', header=None).shape[0]
    if snp_lst_0 is None:
        snp_lst_0 = range(num_snp - 1)
    else:
        if max(snp_lst_0) >= num_snp - 1 or min(snp_lst_0) < 0:
            logging.error('snp_lst_0 is out of range!')
            sys.exit()
    logging.info("Convert python variates to C type")
    pbed_file = ffi.new("char[]", bed_file.encode('ascii'))
    pnum_id = ffi.cast("long long", num_id)
    pnum_snp = ffi.cast("long long", num_snp)
    snp_lst_0 = np.array(list(snp_lst_0), dtype=np.longlong)
    psnp_lst_0 = ffi.cast("long long *", snp_lst_0.ctypes.data)
    # psnp_lst_0 = ffi.cast("long long *", ffi.from_buffer(snp_lst_0))
    plen_snp_lst_0 = ffi.cast("long long", len(snp_lst_0))
    ppymat = ffi.cast("double *", pymat.ctypes.data)
    chi_cut = chi2.isf(p_cut, 1)
    eff_cut = np.sqrt(chi_cut * var_app)
    peff_cut = ffi.cast("double", eff_cut)
    temp_file = out_file + '.temp'
    pout_file = ffi.new("char[]", temp_file.encode('ascii'))
    logging.info('Test')
    clock_t0 = time.perf_counter()
    cpu_t0 = time.process_time()
    lib.remma_epiDD_eff_cpu(pbed_file, pnum_id, pnum_snp, psnp_lst_0, plen_snp_lst_0, ppymat, peff_cut, pout_file)
    clock_t1 = time.perf_counter()
    cpu_t1 = time.process_time()
    logging.info("Running time: Clock time, {:.5f} sec; CPU time, {:.5f} sec.".format(clock_t1 - clock_t0, cpu_t1 - cpu_t0))
    logging.info('Add the approximate P values')
    with open(temp_file) as fin, open(out_file, 'w') as fout:
        head_line = fin.readline()
        head_line = head_line.strip()
        head_line += ' chi_app p_app\n'
        fout.write(head_line)
        for line in fin:
            arr = line.split()
            chi_app = float(arr[-1]) * float(arr[-1]) / var_app
            p_app = chi2.sf(chi_app, 1)
            fout.write(' '.join(arr + [str(chi_app), str(p_app)]) + '\n')
    os.remove(temp_file)
    return 0
Beispiel #2
0
def remma_epiAA_eff_cpu_c(y,
                          xmat,
                          zmat,
                          gmat_lst,
                          var_com,
                          bed_file,
                          snp_lst_0=None,
                          eff_cut=-999.0,
                          out_file='remma_epiAA_eff_cpu_c'):
    """
    Additive by additive epistasis test.

    The function builds the phenotypic covariance matrix
    V = I * var_com[-1] + sum_i Z G_i Z' * var_com[i], derives the projection
    matrix P = V^-1 - V^-1 X (X' V^-1 X)^-1 X' V^-1, and hands Z' P y to the
    compiled routine ``lib.remma_epiAA_eff_cpu`` together with the plink file
    prefix, the SNP list, the effect threshold and the output file name.

    :param y: phenotype
    :param xmat: design matrix for fixed effects
    :param zmat: design matrix for random effects, csr sparse matrix
    :param gmat_lst: list of genomic relationship matrices
    :param var_com: variance components. var_com[i] is paired with gmat_lst[i];
    the last element is used for the diagonal (residual) term.
    :param bed_file: plink file (prefix; '.bim' and '.fam' are appended)
    :param snp_lst_0: list of the first SNP of each interaction pair. the max value is num_snp-2 and the
    min value is 0. None means range(num_snp - 1).
    :param eff_cut: threshold used to decide which interaction pairs are kept
    :param out_file: output file
    :return: 0
    :raises SystemExit: with exit status 1 when snp_lst_0 contains an index outside [0, num_snp-2].
    :raises ValueError: when snp_lst_0 is given but empty.
    """
    # Validate the cheap inputs first so that a bad SNP list is reported
    # before the expensive matrix inversion below.
    num_snp = pd.read_csv(bed_file + '.bim', header=None).shape[0]
    num_id = pd.read_csv(bed_file + '.fam', header=None).shape[0]
    if snp_lst_0 is None:
        snp_lst_0 = range(num_snp - 1)
    else:
        # Materialise once: a one-shot iterable would otherwise be consumed
        # by the range check and arrive empty at the array conversion.
        snp_lst_0 = list(snp_lst_0)
        if max(snp_lst_0) >= num_snp - 1 or min(snp_lst_0) < 0:
            logging.error('snp_lst_0 is out of range!')
            # Non-zero status: a bare sys.exit() would report success.
            sys.exit(1)
    # Log message: "compute the V matrix and its inverse".
    logging.info("计算V矩阵及其逆矩阵")
    y = np.array(y).reshape(-1, 1)
    n = y.shape[0]
    xmat = np.array(xmat).reshape(n, -1)
    vmat = np.diag([var_com[-1]] * n)
    for val in range(len(gmat_lst)):
        vmat += zmat.dot((zmat.dot(gmat_lst[val])).T) * var_com[val]
    del gmat_lst
    gc.collect()
    vmat_inv = np.linalg.inv(vmat)
    # Log message: "compute the P matrix".
    logging.info("计算P矩阵")
    vxmat = np.dot(vmat_inv, xmat)
    xvxmat = np.dot(xmat.T, vxmat)
    xvxmat = np.linalg.inv(xvxmat)
    pmat = reduce(np.dot, [vxmat, xvxmat, vxmat.T])
    pmat = vmat_inv - pmat
    pymat = zmat.T.dot(np.dot(pmat, y))
    # The compiled routine only receives a raw pointer, so make sure the
    # buffer is contiguous float64 (no copy is made when it already is).
    pymat = np.ascontiguousarray(pymat, dtype=np.float64)
    del vmat, vmat_inv, pmat
    gc.collect()
    # Log message: "convert python variables to C".
    logging.info("python变量转化到C")
    pbed_file = ffi.new("char[]", bed_file.encode('ascii'))
    pnum_id = ffi.cast("long long", num_id)
    pnum_snp = ffi.cast("long long", num_snp)
    # snp_lst_0 and pymat must stay referenced until the C call returns,
    # because the pointers below do not own the underlying memory.
    snp_lst_0 = np.array(list(snp_lst_0), dtype=np.longlong)
    psnp_lst_0 = ffi.cast("long long *", snp_lst_0.ctypes.data)
    plen_snp_lst_0 = ffi.cast("long long", len(snp_lst_0))
    ppymat = ffi.cast("double *", pymat.ctypes.data)
    peff_cut = ffi.cast("double", eff_cut)
    pout_file = ffi.new("char[]", out_file.encode('ascii'))
    # Log message: "test".
    logging.info('检验')
    clock_t0 = time.perf_counter()
    cpu_t0 = time.process_time()
    lib.remma_epiAA_eff_cpu(pbed_file, pnum_id, pnum_snp, psnp_lst_0,
                            plen_snp_lst_0, ppymat, peff_cut, pout_file)
    clock_t1 = time.perf_counter()
    cpu_t1 = time.process_time()
    logging.info(
        "Running time: Clock time, {:.5f} sec; CPU time, {:.5f} sec.".format(
            clock_t1 - clock_t0, cpu_t1 - cpu_t0))
    return 0