Exemple #1
0
def remma_epiAD_pair(pheno_file,
                     bed_file,
                     gmat_lst,
                     var_com,
                     snp_pair_file,
                     max_test_pair=50000,
                     p_cut=1.0e-4,
                     out_file='epiAD_pair'):
    """
    Run the epiAD (additive by dominance) epistasis test with the random SNP-BLUP model on the SNP pairs
    listed in a file.

    The design matrices are built from the phenotype file and the plink files, and all remaining work is
    delegated to ``_remma_epiAD_pair``.

    :param pheno_file: phenotype file. The first two columns are family id and individual id, as in the plink
    *.fam file. The third column is always ones (population mean). The last column holds the phenotypic values.
    Other covariates may be placed between the population-mean column and the phenotype column.
    :param bed_file: prefix of the plink binary files
    :param gmat_lst: list of relationship matrices
    :param var_com: estimated variances
    :param snp_pair_file: file holding the indexes of the SNP pairs. Only the first two columns are read and the
    pairs are tested row by row. Indexes range from 0 to num_snp - 1.
    :param max_test_pair: maximum number of SNP pairs kept in memory. Default is 50000.
    :param p_cut: p-value cut-off forwarded to the test routine. Default is 0.0001.
    :param out_file: output file. Default is 'epiAD_pair'.
    :return: whatever ``_remma_epiAD_pair`` returns (documented upstream as 0)
    """
    pheno_vec, fixed_design, random_design = design_matrix_wemai_multi_gmat(pheno_file, bed_file)
    return _remma_epiAD_pair(pheno_vec,
                             fixed_design,
                             random_design,
                             gmat_lst,
                             var_com,
                             bed_file,
                             snp_pair_file,
                             max_test_pair=max_test_pair,
                             p_cut=p_cut,
                             out_file=out_file)
Exemple #2
0
def remma_epiAD_parallel(pheno_file,
                         bed_file,
                         gmat_lst,
                         var_com,
                         parallel,
                         p_cut=1.0e-5,
                         out_file='epiAD_parallel'):
    """
    Parallel variant of the additive by dominance epistasis test with the random SNP-BLUP model.

    The design matrices are built from the phenotype file and the plink files, and all remaining work is
    delegated to ``_remma_epiAD_parallel``.

    :param pheno_file: phenotype file. The first two columns are family id and individual id, as in the plink
    *.fam file. The third column is always ones (population mean). The last column holds the phenotypic values.
    Other covariates may be placed between the population-mean column and the phenotype column.
    :param bed_file: prefix of the plink binary files
    :param gmat_lst: list of genomic relationship matrices
    :param var_com: estimated variances
    :param parallel: list of two integers: the number of parts the tests are split into, and the part to run in
    this call. For example parallel = [3, 1], [3, 2] and [3, 3] split all tests into three parts that can be run
    in parallel.
    :param p_cut: p-value cut-off forwarded to the test routine. Default is 1.0e-5.
    :param out_file: output file. Default is 'epiAD_parallel'.
    :return: whatever ``_remma_epiAD_parallel`` returns (documented upstream as 0)
    """
    pheno_vec, fixed_design, random_design = design_matrix_wemai_multi_gmat(pheno_file, bed_file)
    return _remma_epiAD_parallel(pheno_vec,
                                 fixed_design,
                                 random_design,
                                 gmat_lst,
                                 var_com,
                                 bed_file,
                                 parallel,
                                 p_cut=p_cut,
                                 out_file=out_file)
Exemple #3
0
def remma_epiAD(pheno_file,
                bed_file,
                gmat_lst,
                var_com,
                snp_lst_0=None,
                p_cut=1.0e-5,
                out_file='epiAD'):
    """
    Additive by dominance epistasis test with the random SNP-BLUP model.

    The design matrices are built from the phenotype file and the plink files, and all remaining work is
    delegated to ``_remma_epiAD``.

    :param pheno_file: phenotype file. The first two columns are family id and individual id, as in the plink
    *.fam file. The third column is always ones (population mean). The last column holds the phenotypic values.
    Other covariates may be placed between the population-mean column and the phenotype column.
    :param bed_file: prefix of the plink binary files
    :param gmat_lst: list of genomic relationship matrices
    :param var_com: estimated variances
    :param snp_lst_0: first SNP list of the SNP pairs; values range from 0 to num_snp - 1. The default None
    stands for the list [0, num_snp - 1].
    :param p_cut: p-value cut-off forwarded to the test routine. Default is 1.0e-5.
    :param out_file: output file. Default is 'epiAD'.
    :return: whatever ``_remma_epiAD`` returns (documented upstream as 0)
    """
    pheno_vec, fixed_design, random_design = design_matrix_wemai_multi_gmat(pheno_file, bed_file)
    return _remma_epiAD(pheno_vec,
                        fixed_design,
                        random_design,
                        gmat_lst,
                        var_com,
                        bed_file,
                        snp_lst_0=snp_lst_0,
                        p_cut=p_cut,
                        out_file=out_file)
Exemple #4
0
def lm_pred(pheno_file, bed_file, agmat, out_file='lm_pred'):
    """
    Compute effects from fixed-effect-adjusted phenotypes and write them to ``out_file + '.rand_eff'``.

    With V equal to the identity, the projection is P = I - X (X'X)^-1 X', and the effects are
    ``agmat . Z' . P . y``.

    :param pheno_file: phenotype file. The first two columns are family id and individual id, as in the plink
    *.fam file. The third column is always ones (population mean). The last column holds the phenotypic values.
    Other covariates may be placed between the population-mean column and the phenotype column.
    :param bed_file: prefix of the plink binary files
    :param agmat: additive genomic relationship matrix
    :param out_file: prefix of the output file; '.rand_eff' is appended
    :return: the estimated effects (the same array that is written to disk)
    """
    y, xmat, zmat = design_matrix_wemai_multi_gmat(pheno_file, bed_file)
    # P.y = y - X (X'X)^-1 X'y. Computing it directly avoids building the n x n identity and
    # projection matrices, and solving the normal equations avoids an explicit inverse.
    xtx = np.dot(xmat.T, xmat)
    xty = np.dot(xmat.T, y)
    py = y - np.dot(xmat, np.linalg.solve(xtx, xty))
    zpymat = zmat.T.dot(py)
    eff = np.dot(agmat, zpymat)
    np.savetxt(out_file + '.rand_eff', eff)
    return eff
Exemple #5
0
def wemai_multi_gmat(pheno_file, bed_file, gmat_lst, init=None, maxiter=200, cc_par=1.0e-8, cc_gra=1.0e-6, out_file='wemai_multi_gmat.var'):
    """
    Estimate the variances of a univariate linear mixed model that may include several genomic relationship
    matrices, using the weighted EM and AI algorithms.

    The design matrices are built from the phenotype file and the plink files, the estimation is delegated to
    ``_wemai_multi_gmat``, and the result is saved with ``np.savetxt``.

    :param pheno_file: phenotype file. The first two columns are family id and individual id, as in the plink
    *.fam file. The third column is always ones (population mean). The last column holds the phenotypic values.
    Other covariates may be placed between the population-mean column and the phenotype column.
    :param bed_file: prefix of the plink binary files
    :param gmat_lst: list of genomic relationship matrices
    :param init: initial values for the variances. Default is None.
    :param maxiter: maximum number of iterations. Default is 200.
    :param cc_par: convergence criterion for the update vector
    :param cc_gra: convergence criterion for the gradient vector
    :param out_file: file in which the estimated variances are saved
    :return: the estimated variances
    """
    pheno_vec, fixed_design, random_design = design_matrix_wemai_multi_gmat(pheno_file, bed_file)
    estimated_var = _wemai_multi_gmat(
        pheno_vec,
        fixed_design,
        random_design,
        gmat_lst,
        init=init,
        maxiter=maxiter,
        cc_par=cc_par,
        cc_gra=cc_gra,
    )
    np.savetxt(out_file, estimated_var)
    return estimated_var
Exemple #6
0
def remma_dom(pheno_file, bed_file, gmat_lst, var_com, out_file='remma_dom'):
    """
    Dominance test with the random SNP-BLUP model.

    The design matrices are built from the phenotype file and the plink files, and all remaining work is
    delegated to ``_remma_dom``.

    :param pheno_file: phenotype file. The first two columns are family id and individual id, as in the plink
    *.fam file. The third column is always ones (population mean). The last column holds the phenotypic values.
    Other covariates may be placed between the population-mean column and the phenotype column.
    :param bed_file: prefix of the plink binary files
    :param gmat_lst: list of genomic relationship matrices
    :param var_com: estimated variances
    :param out_file: output file. Default is 'remma_dom'.
    :return: whatever ``_remma_dom`` returns (documented upstream as a pandas data frame of results)
    """
    pheno_vec, fixed_design, random_design = design_matrix_wemai_multi_gmat(pheno_file, bed_file)
    return _remma_dom(pheno_vec,
                      fixed_design,
                      random_design,
                      gmat_lst,
                      var_com,
                      bed_file,
                      out_file=out_file)
Exemple #7
0
def lm_snp_eff(pheno_file, bed_file, out_file='lm_snp_eff'):
    """
    Estimate the effect of every SNP with a separate ordinary least-squares fit.

    For each SNP, its genotype column is appended to the fixed-effect design matrix and the coefficient of
    that last column is kept. The effects are added as an 'eff' column to the contents of the *.bim file
    and written to ``out_file`` (space separated, no header, no index).

    :param pheno_file: phenotype file. The first two columns are family id and individual id, as in the plink
    *.fam file. The third column is always ones (population mean). The last column holds the phenotypic values.
    Other covariates may be placed between the population-mean column and the phenotype column.
    :param bed_file: prefix of the plink binary files
    :param out_file: the output file
    :return: data frame holding the *.bim columns plus the estimated SNP effects in column 'eff'
    """
    # The random-effect design matrix is not needed for the per-SNP fixed-effect fits.
    y, xmat, _ = design_matrix_wemai_multi_gmat(pheno_file, bed_file)
    snp_mat = read_plink(bed_file)
    if np.any(np.isnan(snp_mat)):
        logging.warning('Missing genotypes are imputed with random genotypes.')
        snp_mat = impute_geno(snp_mat)
    snp_eff = []
    for i in tqdm(range(snp_mat.shape[1])):
        # Slice with i:(i + 1) to keep the SNP as a 2-D column for concatenation.
        xmati = np.concatenate([xmat, snp_mat[:, i:(i + 1)]], axis=1)
        xmati_t = xmati.T
        # Solve the normal equations instead of forming the explicit inverse.
        eff = linalg.solve(np.dot(xmati_t, xmati), np.dot(xmati_t, y))
        # The SNP is the last column of the design matrix, so its effect is the last coefficient.
        snp_eff.append(eff[-1, -1])
    # Raw string: '\s' is not a valid escape sequence in a normal string literal.
    df = pd.read_csv(bed_file + '.bim', sep=r'\s+', header=None)
    df['eff'] = snp_eff
    df.to_csv(out_file, sep=' ', header=False, index=False)
    return df