Example #1
0
def ginbreedcoef(bed_file):
    """
    Compute three genomic inbreeding coefficients per individual and write them to a file.

    Genotypes are read with ``read_plink``; if any entry is NaN the matrix is passed through
    ``impute_geno`` first. With ``x`` a genotype value, ``p`` the column sum divided by
    ``2 * number of rows`` and ``m`` the number of columns, each row (individual) gets:

    * ``homo_F``: 1 minus the fraction of genotypes within 0.01 of the value 1;
    * ``grm_F1``: ``sum((x - 2p)^2) / sum(2p(1 - p)) - 1``;
    * ``grm_F2``: ``sum((x - 2p)^2 / (2p(1 - p))) / m - 1``.

    :param bed_file: the prefix for the plink binary file. ``bed_file + '.fam'`` is read and its
    second column is used as the individual id.
    :return: None. The table (columns id, homo_F, grm_F1, grm_F2) is written space-separated to
    ``bed_file + '.ginbreedcoef'``.
    """
    snp_mat = read_plink(bed_file)
    if np.any(np.isnan(snp_mat)):
        print('Missing genotypes are imputed with random genotypes.')
        snp_mat = impute_geno(snp_mat)
    num_id, num_snp = snp_mat.shape[0], snp_mat.shape[1]
    print("There are {:d} individuals and {:d} SNPs.".format(num_id, num_snp))

    # Fraction of genotypes that are NOT (approximately) equal to 1.
    homo_f = 1 - np.sum(np.abs(snp_mat - 1.0) < 0.01, axis=1) / num_snp

    freq = np.sum(snp_mat, axis=0) / (2 * num_id)
    freq.shape = (1, num_snp)  # row vector so it broadcasts over individuals
    scale_vec = 2 * freq * (1 - freq)
    scale = np.sum(scale_vec)
    snp_mat = snp_mat - 2 * freq
    # The squared centred genotypes are shared by both GRM-based coefficients.
    snp_sq = np.multiply(snp_mat, snp_mat)
    grm_f1 = np.sum(snp_sq, axis=1) / scale - 1.0
    # NOTE(review): a column with p equal to 0 or 1 makes scale_vec zero there, so this division
    # produces inf/nan - confirm such SNPs are filtered before this function is called.
    grm_f2 = np.sum(snp_sq / scale_vec, axis=1) / num_snp - 1.0

    # Raw string: '\s' is not a valid escape sequence in a normal string literal.
    fam_info = pd.read_csv(bed_file + '.fam', sep=r'\s+', header=None)
    ind_id = np.array(fam_info.iloc[:, 1])
    data_df = pd.DataFrame(
        {'id': ind_id, 'homo_F': homo_f, 'grm_F1': grm_f1, 'grm_F2': grm_f2},
        columns=['id', 'homo_F', 'grm_F1', 'grm_F2'])
    out_file = bed_file + '.ginbreedcoef'
    data_df.to_csv(out_file, sep=' ', header=True, index=False)
Example #2
0
def uvlmm_gwas_epiAA(y, xmat, gmat_lst, var_com, bed_file):
    """
    Additive-by-additive epistasis scan: for every SNP pair the two centred SNPs and their
    product are added to the fixed effects and the product term is tested. Several relationship
    matrices may be supplied (the additive relationship matrix must be among them).
    :param y: phenotypic values; reshaped to a column vector
    :param xmat: design matrix for the fixed effects; reshaped to one row per phenotype
    :param gmat_lst: list of relationship matrices, each weighted by the matching entry of var_com
    :param var_com: variances; var_com[k] belongs to gmat_lst[k] and var_com[-1] goes on the diagonal
    :param bed_file: the prefix for the plink binary file
    :return: pandas data frame with columns snpi, snpj, snp_eff, p_val (one row per SNP pair i < j)
    """
    # Build the V matrix and invert it
    y = np.array(y).reshape(-1, 1)
    num_rec = y.shape[0]
    xmat = np.array(xmat).reshape(num_rec, -1)
    vmat = np.diag([var_com[-1]] * num_rec)
    for k, gmat in enumerate(gmat_lst):
        vmat += gmat * var_com[k]
    vinv = np.linalg.inv(vmat)
    # Read the SNPs and centre each column by twice its frequency
    geno = read_plink(bed_file)
    if np.any(np.isnan(geno)):
        print('Missing genotypes are imputed with random genotypes.')
        geno = impute_geno(geno)
    num_snp = geno.shape[1]
    allele_freq = np.sum(geno, axis=0) / (2 * geno.shape[0])
    allele_freq.shape = (1, num_snp)
    geno = geno - 2 * allele_freq
    # Test every pair (i, j) with i < j
    first_idx = []
    second_idx = []
    estimates = []
    p_values = []
    for i in range(num_snp - 1):
        col_i = geno[:, i:(i + 1)]
        for j in range(i + 1, num_snp):
            col_j = geno[:, j:(j + 1)]
            # The interaction column is placed last so [-1, -1] picks its estimate and variance.
            design = np.concatenate([xmat, col_i, col_j, col_i * col_j], axis=1)
            vinv_design = np.dot(vinv, design)
            coef_cov = np.linalg.inv(np.dot(design.T, vinv_design))
            estimate = np.dot(coef_cov, np.dot(vinv_design.T, y))[-1, -1]
            statistic = estimate * estimate / coef_cov[-1, -1]
            first_idx.append(i)
            second_idx.append(j)
            estimates.append(estimate)
            p_values.append(chi2.sf(statistic, 1))
    return pd.DataFrame({
        'snpi': first_idx,
        'snpj': second_idx,
        'snp_eff': estimates,
        'p_val': p_values
    })
Example #3
0
def uvlmm_gwas_add(y, xmat, gmat_lst, var_com, bed_file):
    """
    Single-marker association test that treats each SNP effect as a fixed effect. Several
    relationship matrices may be supplied (the additive relationship matrix must be among them).
    :param y: phenotypic values; reshaped to a column vector
    :param xmat: design matrix for the fixed effects; reshaped to one row per phenotype
    :param gmat_lst: list of relationship matrices, each weighted by the matching entry of var_com
    :param var_com: variances; var_com[k] belongs to gmat_lst[k] and var_com[-1] goes on the diagonal
    :param bed_file: the prefix for the plink binary file; ``bed_file + '.bim'`` supplies the SNP information
    :return: pandas data frame with columns chro, snp_ID, pos, allele1, allele2, eff_val, scale_val,
    chi_val, p_val
    """
    # Build the V matrix and invert it
    y = np.array(y).reshape(-1, 1)
    n = y.shape[0]
    xmat = np.array(xmat).reshape(n, -1)
    vmat = np.diag([var_com[-1]] * n)
    for val in range(len(gmat_lst)):
        vmat += gmat_lst[val] * var_com[val]
    vmat = np.linalg.inv(vmat)
    # Read the SNPs and centre each column by twice its frequency
    snp_mat = read_plink(bed_file)
    if np.any(np.isnan(snp_mat)):
        print('Missing genotypes are imputed with random genotypes.')
        snp_mat = impute_geno(snp_mat)
    freq = np.sum(snp_mat, axis=0) / (2 * snp_mat.shape[0])
    scale = np.sum(2*freq*(1-freq))
    freq.shape = (1, snp_mat.shape[1])
    snp_mat = snp_mat - 2 * freq
    # Test
    eff_vec = []
    chi_vec = []
    scale_vec = []
    p_vec = []
    # Fixed: the progress wrapper was misspelled `tdqm`, which raised NameError.
    for i in tqdm(range(snp_mat.shape[1])):
        # The SNP column is appended last so [-1, -1] picks its estimate and variance.
        xmat_snp = np.concatenate([xmat, snp_mat[:, i:(i+1)]], axis=1)
        vxmat = np.dot(vmat, xmat_snp)
        xvxmat = np.dot(xmat_snp.T, vxmat)
        xvxmat = np.linalg.inv(xvxmat)
        snp_eff = np.dot(xvxmat, np.dot(vxmat.T, y))[-1, -1]
        snp_var = xvxmat[-1, -1]
        snp_chi = snp_eff*snp_eff/snp_var
        p_val = chi2.sf(snp_chi, 1)
        eff_vec.append(snp_eff)
        scale_vec.append(var_com[0]/(scale*snp_var))
        chi_vec.append(snp_chi)
        p_vec.append(p_val)
    snp_info_file = bed_file + '.bim'
    # Raw string: '\s' is not a valid escape sequence in a normal string literal.
    snp_info = pd.read_csv(snp_info_file, sep=r'\s+', header=None)
    # Explicit copy so the column assignments below do not write into a slice of snp_info.
    res_df = snp_info.iloc[:, [0, 1, 3, 4, 5]].copy()
    res_df.columns = ['chro', 'snp_ID', 'pos', 'allele1', 'allele2']
    res_df.loc[:, 'eff_val'] = eff_vec
    res_df.loc[:, 'scale_val'] = scale_vec
    res_df.loc[:, 'chi_val'] = chi_vec
    res_df.loc[:, 'p_val'] = p_vec
    return res_df
Example #4
0
File: lm.py Project: yanjunzan/GMAT
def lm_snp_eff(pheno_file, bed_file, out_file='lm_snp_eff'):
    """
    Estimate the effect of each SNP by ordinary least squares, fitting one SNP at a time together
    with the fixed-effect design matrix.
    :param pheno_file: phenotypic file. The first two columns are family id, individual id which are same as plink
    *.fam file. The third column is always ones for population mean. The last column is phenotypic values. The other
    covariates can be added between columns for population mean and phenotypic values.
    :param bed_file: the prefix for binary file
    :param out_file: the output file
    :return: the ``*.bim`` table as a pandas data frame with the estimated SNP effect appended as column ``eff``.
    The same table is written space-separated, without header, to out_file.
    """
    # The random-effect design matrix returned by the helper is not needed for this fixed-effect fit.
    y, xmat, _ = design_matrix_wemai_multi_gmat(pheno_file, bed_file)
    snp_mat = read_plink(bed_file)
    if np.any(np.isnan(snp_mat)):
        logging.warning('Missing genotypes are imputed with random genotypes.')
        snp_mat = impute_geno(snp_mat)
    snp_eff = []
    for i in tqdm(range(snp_mat.shape[1])):
        # The SNP column is appended last, so the last row of the solution is its effect.
        xmati = np.concatenate([xmat, snp_mat[:, i:(i + 1)]], axis=1)
        eff = np.dot(linalg.inv(np.dot(xmati.T, xmati)), np.dot(xmati.T, y))
        snp_eff.append(eff[-1, -1])
    # Raw string: '\s' is not a valid escape sequence in a normal string literal.
    df = pd.read_csv(bed_file + '.bim', sep=r'\s+', header=None)
    df['eff'] = snp_eff
    df.to_csv(out_file, sep=' ', header=False, index=False)
    return df
def unbalance_longwas_fixed_permutation(
        data_file,
        id,
        tpoint,
        trait,
        bed_file,
        kin_file,
        var_com,
        snp_lst=None,
        permutation_lst=None,
        tfix=None,
        fix=None,
        forder=3,
        aorder=3,
        porder=3,
        na_method='omit',
        prefix_outfile='unbalance_longwas_fixed_permutation'):
    """
    Permutation runs of the longitudinal GWAS for the unbalanced data treating the SNP as the time varied
    fixed effect. For every entry of permutation_lst the genotype rows are shuffled among individuals, every
    SNP in snp_lst is tested, and one result file is written.
    :param data_file: the data file. The first row is the variate names whose first initial position is alphabetical.
    For the class variates, the first letter must be capital; for the covariates (continuous variates), the first letter
    must be lowercase.
    :param id: A class variate name which indicates the individual id column in the data file. The data file must be
    sorted by this column.
    :param tpoint: A covariate names which indicates the time point column in the data file.
    :param trait: A variate name which indicates the analyzed trait column in the data file.
    :param bed_file: the prefix for the plink binary file.
    :param kin_file: the file for genomic relationship matrix. This file can be produced by
    gmat.gmatrix.agmat function using agmat(bed_file, inv=True, small_val=0.001, out_fmt='id_id_val')
    :param var_com: the estimated variance parameters by the gmat.longwas.unbalance.unbalance_varcom function.
    :param snp_lst: the snp list to test. Default is None, which means all SNPs.
    :param permutation_lst: the index list for permutation. Default is None, which means range(1000).
    :param tfix: A class variate name for the time varied fixed effect. Default value is None. Only one time varied
    fixed effect can be included in the current version.
    :param fix: Expression for the time independent fixed effect. Default value is None. An example:
    fix = "Sex + age + Season".
    :param forder: the order of Legendre polynomials for the time varied fixed effect. The default value is 3.
    :param aorder: the order of Legendre polynomials for the additive genetic effect. The default value is 3.
    :param porder: the order of Legendre polynomials for the permanent environment effect. The default value is 3.
    :param na_method: The method to deal with missing values. The default value is 'omit'. 'omit' method will delete the
    row with missing values. 'include' method will fill the missing values with the adjacent values.
    :param prefix_outfile: the prefix for the output files. Default is 'unbalance_longwas_fixed_permutation'. The
    result of permutation ``rep`` is written to ``prefix_outfile + '.' + str(rep)``.
    :return: 0. Invalid input is logged and the process is terminated through exit().
    """
    logging.info('################################')
    logging.info('###Prepare the related matrix###')
    logging.info('################################')
    # Expected number of variance parameters: lower triangle of the (aorder+1) and (porder+1)
    # covariance matrices plus one residual variance.
    if var_com.shape[0] != aorder * (aorder + 1) / 2 + aorder + 1 + porder * (
            porder + 1) / 2 + porder + 1 + 1:
        logging.info('ERROR: Variances do not match the data, please check')
        exit()
    logging.info('***Read the data file***')
    logging.info('Data file: ' + data_file)
    # Raw string: '\s' is not a valid escape sequence in a normal string literal.
    data_df = pd.read_csv(data_file, sep=r'\s+', header=0)
    logging.info('NA method: ' + na_method)
    if na_method == 'omit':
        # Reset the index: the factor-coding loop below addresses rows by the labels
        # 0..n-1, which no longer exist for every row once some rows are dropped.
        data_df = data_df.dropna().reset_index(drop=True)
    elif na_method == 'include':
        data_df = data_df.ffill()
        data_df = data_df.bfill()
    else:
        logging.info('na_method does not exist: ' + na_method)
        exit()
    col_names = data_df.columns
    logging.info('The column names of data file: ' + ' '.join(list(col_names)))
    logging.info(
        'Note: Variates beginning with a capital letter is converted into factors.'
    )
    class_vec = []
    for val in col_names:
        if not val[0].isalpha():
            logging.info(
                "The first character of columns names must be alphabet!")
            exit()
        if val[0] == val.capitalize()[0]:
            # Capital initial: class variate, kept as string until it is coded below.
            class_vec.append(val)
            data_df[val] = data_df[val].astype('str')
        else:
            try:
                data_df[val] = data_df[val].astype('float')
            except Exception as e:
                logging.info(e)
                logging.info(val + " may contain string, please check!")
                exit()
    logging.info('Individual column: ' + id)
    if id not in col_names:
        logging.info(id + ' is not in the data file, please check!')
        exit()
    if id not in class_vec:
        logging.info('The initial letter of {} should be capital'.format(id))
        exit()
    # Individuals in order of first appearance; if an id shows up in two separate runs
    # the list is longer than the set of ids, i.e. the file is not sorted by individual.
    id_order = []
    id_arr = list(data_df[id])
    id_order.append(id_arr[0])
    for i in range(1, len(id_arr)):
        if id_arr[i] != id_arr[i - 1]:
            id_order.append(id_arr[i])
    id_in_data = set(data_df[id])
    if len(id_in_data) - len(id_order) != 0:
        logging.info('The data is not sored by individual ID!')
        exit()
    logging.info('Time points column: ' + tpoint)
    if tpoint not in col_names:
        logging.info(tpoint + ' is not in the data file, please check!')
        exit()
    if tpoint in class_vec:
        logging.info(
            'The initial letter of {} should be lowercase'.format(tpoint))
        exit()
    logging.info('Trait column: ' + trait)
    if trait not in col_names:
        logging.info(trait + ' is not in the data file, please check!')
        exit()
    if trait in class_vec:
        logging.info(
            'The initial letter of {} should be lowercase'.format(trait))
        exit()
    logging.info('Code factor variables of the data file: ' +
                 ' '.join(list(class_vec)))
    # Recode every class variate to 1, 2, ... in order of first appearance;
    # code_dct[variate][original level] holds the code as a string.
    code_val = {}
    code_dct = dct_2D()
    for val in class_vec:
        code_val[val] = 0
        temp = []
        for i in range(data_df.shape[0]):
            if data_df[val][i] not in code_dct[val]:
                code_val[val] += 1
                code_dct[val][data_df[val][i]] = str(code_val[val])
            temp.append(code_dct[val][data_df[val][i]])
        data_df[val] = np.array(temp)
    for val in class_vec:
        data_df[val] = data_df[val].astype('int')
    logging.info('***Build the design matrix for fixed effect***')
    logging.info('Time dependent fixed effect: ' + str(tfix))
    leg_fix = leg(data_df[tpoint], forder)
    if tfix is None:
        xmat_t = np.concatenate(leg_fix, axis=1)
        xmat_t = csr_matrix(xmat_t)
    else:
        if tfix not in class_vec:
            logging.info(tfix + ' is not the class variate')
            exit()
        # Indicator matrix of the tfix levels, multiplied by each Legendre covariate.
        row = np.array(range(data_df.shape[0]))
        col = np.array(data_df[tfix]) - 1
        val = np.array([1.0] * data_df.shape[0])
        tfix_mat = csr_matrix((val, (row, col)))
        xmat_t = []
        for i in range(len(leg_fix)):
            xmat_t.append(tfix_mat.multiply(leg_fix[i]))
        xmat_t = hstack(xmat_t)
        del row, col, val
        gc.collect()
    logging.info('Time independent fix effect: ' + str(fix))
    xmat_nt = None
    if fix is not None:
        try:
            # Wrap class variates in C(...) and re-join the terms with '+'; joining them
            # without a separator produced an invalid formula for multi-term expressions.
            terms = []
            for term in fix.split('+'):
                name = term.strip()
                if name in class_vec:
                    terms.append('C(' + name + ')')
                else:
                    terms.append(name)
            fix_exp = ' + '.join(terms)
            xmat_nt = dmatrix(fix_exp, data_df)
            logging.info('The expression for fixed effect: ' + fix_exp)
        except Exception as e:
            # str(e): an exception object cannot be concatenated with a string.
            logging.info(str(e) + ': Check the fix effect expression.')
            exit()
        # Drop the first column of the design matrix produced by dmatrix.
        xmat_nt = csr_matrix(xmat_nt[:, 1:])
    if xmat_nt is None:
        xmat = xmat_t
    else:
        xmat = hstack([xmat_t, xmat_nt])
    xmat = xmat.toarray()
    max_id = max(data_df[id]) + 1
    tmin = min(data_df[tpoint])
    tmax = max(data_df[tpoint])
    leg_lst = [
    ]  # legendre polynomials for time dependent fixed SNP effects, save for each individuals
    for i in range(1, max_id):
        leg_lst.append(
            leg_mt(data_df[data_df[id] == i][tpoint], tmax, tmin, forder))
    tpoint_vec = sorted(set(data_df[tpoint]))
    leg_tpoint_mat = leg_mt(np.array(tpoint_vec), tmax, tmin, forder)
    leg_tpoint_accum = np.sum(leg_tpoint_mat, axis=0)
    logging.info('***Read the kinship matrix***')
    logging.info('Kinship file: ' + kin_file)
    with open(kin_file) as fin:
        row = []
        col = []
        kin = []
        id_in_kin = {}
        for line in fin:
            arr = line.split()
            id_in_kin[arr[0]] = 1
            id_in_kin[arr[1]] = 1
            # code_dct[id] holds the ids found in the data file, so a miss here means the
            # kinship file names an individual that the data file does not contain.
            if arr[0] not in code_dct[id]:
                logging.info(arr[0] + ' in the kinship file is not in the data file!')
                exit()
            if arr[1] not in code_dct[id]:
                logging.info(arr[1] + ' in the kinship file is not in the data file!')
                exit()
            row.append(int(code_dct[id][arr[0]]))
            col.append(int(code_dct[id][arr[1]]))
            kin.append(float(arr[2]))
    id_not_in_kin = list(set(code_dct[id].keys()) - set(id_in_kin.keys()))
    if len(id_not_in_kin) != 0:
        logging.info(
            'The ID: {} in the data file is not in the kinship file!'.format(
                ' '.join(id_not_in_kin)))
        exit()
    kin = csr_matrix(
        (np.array(kin), (np.array(row) - 1, np.array(col) - 1))).toarray()
    # Symmetrise the matrix read from the file; the diagonal is doubled by the addition,
    # so it is halved again.
    kin = np.add(kin, kin.T)
    kin[np.diag_indices_from(kin)] = 0.5 * np.diag(kin)
    del row, col
    gc.collect()
    logging.info('***Build the dedign matrix for random effect***')
    logging.info('Legendre order for additive effects: ' + str(aorder))
    leg_add = leg(data_df[tpoint], aorder)
    row = np.array(range(data_df.shape[0]))
    col = np.array(data_df[id]) - 1
    val = np.array([1.0] * data_df.shape[0])
    add_mat = csr_matrix((val, (row, col)),
                         shape=(data_df.shape[0], kin.shape[0]))
    zmat_add = []
    for i in range(len(leg_add)):
        zmat_add.append(add_mat.multiply(leg_add[i]))
    logging.info('Legendre order for permanent environmental effect: ' +
                 str(porder))
    leg_per = leg(data_df[tpoint], porder)
    per_mat = csr_matrix((val, (row, col)))
    zmat_per = []
    for i in range(len(leg_per)):
        zmat_per.append((per_mat.multiply(leg_per[i])))
    del row, col, val
    gc.collect()
    zmat = [zmat_add, zmat_per]
    y = data_df[trait].values.reshape(data_df.shape[0], 1)
    logging.info('***Prepare the merged Z matrix***')
    eff_ind = [[0, xmat.shape[1]]]  # the index for all effects [start end]
    zmat_con_lst = []  # combined random matrix
    for i in range(len(zmat)):
        temp = [eff_ind[i][-1]]
        zmat_con_lst.append(hstack(zmat[i]))
        for j in range(len(zmat[i])):
            temp.append(temp[-1] + zmat[i][j].shape[1])
        eff_ind.append(temp)
    logging.info('***Calculate the phenotypic (co)variance***')
    # Rows with vari == 1 are the additive (co)variances, vari == 2 the permanent
    # environment (co)variances; the last value of var_val is the residual variance.
    add_cov = var_com.loc[var_com.loc[:, 'vari'] == 1, :]
    row = np.array(add_cov['varij']) - 1
    col = np.array(add_cov['varik']) - 1
    val = add_cov['var_val']
    add_cov = csr_matrix((val, (row, col))).toarray()
    add_cov = add_cov + np.tril(add_cov, k=-1).T
    per_cov = var_com.loc[var_com.loc[:, 'vari'] == 2, :]
    row = np.array(per_cov['varij']) - 1
    col = np.array(per_cov['varik']) - 1
    val = per_cov['var_val']
    per_cov = csr_matrix((val, (row, col))).toarray()
    per_cov = per_cov + np.tril(per_cov, k=-1).T
    res_var = np.array(var_com['var_val'])[-1]
    vmat = zmat_con_lst[0].dot((zmat_con_lst[0].dot(np.kron(add_cov, kin))).T)
    # Integer division: the identity size must be an int.
    one_id = sparse.eye(zmat_con_lst[1].shape[1] // per_cov.shape[0])
    vmat = vmat + zmat_con_lst[1].dot(
        (zmat_con_lst[1].dot(sparse.kron(per_cov, one_id))).T)
    vmat_diag = np.diag(vmat) + res_var
    np.fill_diagonal(vmat, vmat_diag)
    vmat = linalg.inv(vmat)
    logging.info('***Read the snp data***')
    snp_mat = read_plink(bed_file)
    if np.any(np.isnan(snp_mat)):
        logging.info('Missing genotypes are imputed with random genotypes.')
        snp_mat = impute_geno(snp_mat)
    num_id = snp_mat.shape[0]
    num_snp = snp_mat.shape[1]
    logging.info("There are {:d} individuals and {:d} SNPs.".format(
        num_id, num_snp))
    fam_df = pd.read_csv(bed_file + '.fam', sep=r'\s+', header=None)
    id_geno = list(np.array(fam_df.iloc[:, 1], dtype=str))
    # Row of every data-file individual in the genotype matrix, in data-file order.
    id_order_index = []
    for i in id_order:
        id_order_index.append(id_geno.index(i))
    if snp_lst is None:
        snp_lst = range(num_snp)
    snp_lst = list(snp_lst)
    if min(snp_lst) < 0 or max(snp_lst) >= num_snp:
        logging.info('The value in the snp list should be >= {} and < {}'.format(
            0, num_snp))
        exit()
    snp_mat = snp_mat[id_order_index, :]
    snp_mat = snp_mat[:, snp_lst]
    logging.info(
        '#####################################################################'
    )
    logging.info(
        '###Start the fixed regression longitudinal GWAS for unbalance data###'
    )
    logging.info(
        '#####################################################################'
    )
    if permutation_lst is None:
        permutation_lst = range(1000)
    # Permute the rows that are actually kept: snp_mat was subset to the individuals of the
    # data file, which may be fewer than the individuals in the plink file.
    id_perm = list(range(snp_mat.shape[0]))
    # The SNP information does not change between permutations, so it is read once.
    snp_info_file = bed_file + '.bim'
    snp_info = pd.read_csv(snp_info_file, sep=r'\s+', header=None)
    for rep in permutation_lst:
        logging.info("***Permutation: {} ***".format(rep))
        # Each replicate shuffles the already shuffled matrix of the previous replicate.
        random.shuffle(id_perm)
        snp_mat = snp_mat[id_perm, :]
        chi_df = leg_lst[0].shape[1]
        eff_vec = []
        chi_vec = []
        p_vec = []
        p_min_vec = []
        p_accum_vec = []
        for i in tqdm(range(snp_mat.shape[1])):
            # Multiply each individual's Legendre covariates by its genotype and stack
            # them into the SNP columns of the fixed-effect design matrix.
            snp_fix = list(
                map(lambda x, y: x * y, leg_lst, list(snp_mat[:, i])))
            snp_fix = np.concatenate(snp_fix, axis=0)
            snp_fix = np.concatenate((xmat, snp_fix), axis=1)
            xv = np.dot(snp_fix.T, vmat)
            xvx = np.dot(xv, snp_fix)
            xvx = np.linalg.inv(xvx)
            xvy = np.dot(xv, y)
            b = np.dot(xvx, xvy)
            # The SNP columns are the last chi_df columns of the design matrix.
            eff = b[-chi_df:, -1]
            eff_var = xvx[-chi_df:, -chi_df:]
            chi_val = np.sum(np.dot(np.dot(eff.T, np.linalg.inv(eff_var)),
                                    eff))
            p_val = chi2.sf(chi_val, chi_df)
            eff_vec.append(b[-chi_df:, -1])
            chi_vec.append(chi_val)
            p_vec.append(p_val)
            # One-degree-of-freedom test at every observed time point; keep the smallest p.
            p_tpoint_vec = []
            for k in range(leg_tpoint_mat.shape[0]):
                eff_tpoint = np.sum(np.dot(leg_tpoint_mat[k, :], eff))
                eff_var_tpoint = np.sum(
                    np.dot(leg_tpoint_mat[k, :],
                           np.dot(eff_var, leg_tpoint_mat[k, :])))
                chi_tpoint = eff_tpoint * eff_tpoint / eff_var_tpoint
                p_tpoint = chi2.sf(chi_tpoint, 1)
                p_tpoint_vec.append(p_tpoint)
            p_min_vec.append(min(p_tpoint_vec))
            # Test of the effect summed over all observed time points.
            eff_accum = np.sum(np.dot(leg_tpoint_accum, eff))
            eff_var_accum = np.sum(
                np.dot(leg_tpoint_accum, np.dot(eff_var, leg_tpoint_accum)))
            chi_accum = eff_accum * eff_accum / eff_var_accum
            p_accum = chi2.sf(chi_accum, 1)
            p_accum_vec.append(p_accum)
        logging.info('Finish association analysis')
        logging.info('Output')
        # Explicit copy so the column assignments below do not write into a slice of snp_info.
        res_df = snp_info.iloc[snp_lst, [0, 1, 3, 4, 5]].copy()
        res_df.columns = ['chro', 'snp_ID', 'pos', 'allele1', 'allele2']
        res_df.loc[:, 'order'] = snp_lst
        res_df = res_df.iloc[:, [5, 0, 1, 2, 3, 4]]
        eff_vec = np.array(eff_vec)
        for i in range(eff_vec.shape[1]):
            col_ind = 'eff' + str(i)
            res_df.loc[:, col_ind] = eff_vec[:, i]
        res_df.loc[:, 'chi_val'] = chi_vec
        res_df.loc[:, 'p_val'] = p_vec
        res_df.loc[:, 'p_min'] = p_min_vec
        res_df.loc[:, 'p_accum'] = p_accum_vec
        out_file = prefix_outfile + '.' + str(rep)
        res_df.to_csv(out_file, sep=' ', index=False)
    return 0
Example #6
0
def _remma_dom(y,
               xmat,
               zmat,
               gmat_lst,
               var_com,
               bed_file,
               out_file='remma_dom'):
    """
    Dominance test by random SNP-BLUP model.
    :param y: phenotypic vector
    :param xmat: Designed matrix for fixed effect
    :param zmat: csr sparse matrix. Designed matrix for random effect.
    :param gmat_lst: A list for relationship matrix
    :param var_com: A list of estimated variances. var_com[0]: additive variances; var_com[1]: dominance variances;
    var_com[-1] is put on the diagonal of the phenotypic covariance matrix.
    :param bed_file: the prefix for plink binary file
    :param out_file: The output file. Default is 'remma_dom'.
    :return: pandas data frame for results (columns chro, snp_ID, pos, allele1, allele2, eff_val, chi_val,
    eff_val_to_fixed, p_val). The same table is written to out_file; if writing fails the error is logged and
    the process exits.
    """
    logging.info("Calculate the phenotypic covariance matrix and inversion")
    y = np.array(y).reshape(-1, 1)
    n = y.shape[0]
    xmat = np.array(xmat).reshape(n, -1)
    vmat = np.diag([var_com[-1]] * n)
    for val in range(len(gmat_lst)):
        vmat += zmat.dot((zmat.dot(gmat_lst[val])).T) * var_com[val]
    vmat_inv = linalg.inv(vmat)
    logging.info("Calculate P matrix")
    # P = V^-1 - V^-1 X (X' V^-1 X)^-1 X' V^-1
    vxmat = np.dot(vmat_inv, xmat)
    xvxmat = np.dot(xmat.T, vxmat)
    xvxmat = linalg.inv(xvxmat)
    pmat = reduce(np.dot, [vxmat, xvxmat, vxmat.T])
    pmat = vmat_inv - pmat
    pymat = zmat.T.dot(np.dot(pmat, y))
    # P V P equals P, so P itself is used here instead of forming the triple product.
    pvpmat = zmat.T.dot((zmat.T.dot(pmat)).T)
    del vmat, vmat_inv, pmat
    gc.collect()
    logging.info("Read the SNP")
    snp_mat = read_plink(bed_file)
    num_id, num_snp = snp_mat.shape
    if np.any(np.isnan(snp_mat)):
        logging.warning('Missing genotypes are imputed with random genotypes.')
        snp_mat = impute_geno(snp_mat)
    freq = np.sum(snp_mat, axis=0) / (2 * num_id)
    freq.shape = (1, num_snp)
    scale_vec = 2 * freq * (1 - freq)
    scale = np.sum(scale_vec * (1 - scale_vec))
    logging.info('The scaled factor is: {:.3f}'.format(scale))
    # Recode genotype 2 as 0 (giving a 0/1/0 coding), then centre by 2p(1-p).
    snp_mat[snp_mat > 1.5] = 0.0
    snp_mat = snp_mat - scale_vec
    eff_vec = np.dot(snp_mat.T, pymat)[:, -1] * var_com[1] / scale
    var_vec = np.sum(snp_mat * np.dot(pvpmat, snp_mat),
                     axis=0) * var_com[1] * var_com[1] / (scale * scale)
    eff_vec_to_fixed = eff_vec * var_com[1] / (var_vec * scale)
    chi_vec = eff_vec * eff_vec / var_vec
    p_vec = chi2.sf(chi_vec, 1)
    snp_info_file = bed_file + '.bim'
    # Raw string: '\s' is not a valid escape sequence in a normal string literal.
    snp_info = pd.read_csv(snp_info_file, sep=r'\s+', header=None)
    # Explicit copy so the column assignments below do not write into a slice of snp_info.
    res_df = snp_info.iloc[:, [0, 1, 3, 4, 5]].copy()
    res_df.columns = ['chro', 'snp_ID', 'pos', 'allele1', 'allele2']
    res_df.loc[:, 'eff_val'] = eff_vec
    res_df.loc[:, 'chi_val'] = chi_vec
    res_df.loc[:, 'eff_val_to_fixed'] = eff_vec_to_fixed
    res_df.loc[:, 'p_val'] = p_vec
    try:
        res_df.to_csv(out_file, index=False, header=True, sep=' ')
    except Exception as e:
        logging.error(e)
        sys.exit()
    return res_df
Example #7
0
def _remma_epiAA(y, xmat, zmat, gmat_lst, var_com, bed_file, snp_lst_0=None, p_cut=1.0e-5, out_file='epiAA'):
    """
    Additive-by-additive epistasis scan with the random SNP-BLUP model. For every first SNP ``i``
    in snp_lst_0 all pairs ``(i, j)`` with ``j > i`` are tested and the pairs whose p value is
    below p_cut are appended to out_file.
    :param y: phenotypic vector
    :param xmat: design matrix for the fixed effects
    :param zmat: csr sparse matrix, design matrix for the random effects
    :param gmat_lst: list of relationship matrices
    :param var_com: estimated variances; var_com[k] belongs to gmat_lst[k], var_com[-1] goes on the diagonal
    :param bed_file: the prefix for the plink binary file
    :param snp_lst_0: indices of the first SNP of the pairs; values must lie in [0, num_snp-1).
    The default None means range(num_snp - 1).
    :param p_cut: p value cut-off for reporting a pair. Default is 1.0e-5.
    :param out_file: output file, created with a header line and then appended to. Default is 'epiAA'.
    :return: 0
    """
    logging.info("Calculate the phenotypic covariance matrix and inversion")
    y = np.array(y).reshape(-1, 1)
    num_rec = y.shape[0]
    xmat = np.array(xmat).reshape(num_rec, -1)
    vmat = np.diag([var_com[-1]] * num_rec)
    for k, gmat in enumerate(gmat_lst):
        vmat += zmat.dot((zmat.dot(gmat)).T) * var_com[k]
    vmat_inv = linalg.inv(vmat)

    logging.info("Calculate P matrix")
    # P = V^-1 - V^-1 X (X' V^-1 X)^-1 X' V^-1
    vinv_x = np.dot(vmat_inv, xmat)
    xvx_inv = linalg.inv(np.dot(xmat.T, vinv_x))
    pmat = vmat_inv - reduce(np.dot, [vinv_x, xvx_inv, vinv_x.T])
    pymat = zmat.T.dot(np.dot(pmat, y))
    # P V P equals P, so P itself is projected with Z.
    pvpmat = zmat.T.dot((zmat.T.dot(pmat)).T)
    del vmat, vmat_inv, pmat
    gc.collect()

    logging.info("Read the SNP")
    # Start the output file with its header; results are appended pair block by pair block.
    np.savetxt(out_file, ['snp_0 snp_1 eff chi p_val'], fmt='%s')
    snp_mat = read_plink(bed_file)
    num_id, num_snp = snp_mat.shape
    if np.any(np.isnan(snp_mat)):
        logging.warning('Missing genotypes are imputed with random genotypes.')
        snp_mat = impute_geno(snp_mat)
    allele_freq = np.sum(snp_mat, axis=0) / (2 * num_id)
    allele_freq.shape = (1, num_snp)
    snp_mat = snp_mat - 2 * allele_freq

    logging.info('Test')
    if snp_lst_0 is None:
        snp_lst_0 = range(num_snp - 1)
    elif max(snp_lst_0) >= num_snp - 1 or min(snp_lst_0) < 0:
        logging.error('snp_lst_0 is out of range!')
        sys.exit()
    wall_start = time.perf_counter()
    cpu_start = time.process_time()
    for i in tqdm(snp_lst_0):
        # Products of SNP i with every later SNP, one column per partner.
        pair_mat = snp_mat[:, i:(i+1)] * snp_mat[:, (i+1):]
        pair_eff = np.dot(pair_mat.T, pymat)
        pair_var = np.sum(pair_mat * np.dot(pvpmat, pair_mat), axis=0).reshape(-1, 1)
        pair_chi = pair_eff * pair_eff / pair_var
        pair_p = chi2.sf(pair_chi, 1)
        hits = pd.DataFrame({
            0: np.array([i]*(num_snp-i-1)),
            1: np.arange((i+1), num_snp),
            2: pair_eff[:, -1],
            3: pair_chi[:, -1],
            4: pair_p[:, -1],
        })
        hits = hits[hits[4] < p_cut]
        hits.to_csv(out_file, sep=' ', header=False, index=False, mode='a')
    wall_stop = time.perf_counter()
    cpu_stop = time.process_time()
    logging.info("Running time: Clock time, {:.5f} sec; CPU time, {:.5f} sec.".format(wall_stop - wall_start, cpu_stop - cpu_start))
    return 0
Example #8
0
def simu_epistasis(bed_file, add_file, dom_file, epiAA_file, epiAD_file, epiDD_file, ratio=None, mean=1.0, res_var=1.0, out_file='simu_epistasis'):
    """
    Simulate phenotypes with additive, dominance and pairwise epistatic (AA, AD, DD) SNP effects.

    The effects of every component are rescaled so that the sum of the per-locus variances equals
    ``ratio[k] / ratio[-1] * res_var``; the rescaled effects are written to ``<effect file>.norm``.
    The phenotype is ``mean`` plus the five genetic contributions plus a normal residual.

    :param bed_file: the prefix for the plink binary file; ``bed_file + '.fam'`` supplies the ids
    :param add_file: additive effects, whitespace separated, no header: SNP index, effect
    :param dom_file: dominance effects, whitespace separated, no header: SNP index, effect
    :param epiAA_file: additive-by-additive effects, no header: SNP index, SNP index, effect
    :param epiAD_file: additive-by-dominance effects, no header: additive SNP index, dominance SNP index, effect
    :param epiDD_file: dominance-by-dominance effects, no header: SNP index, SNP index, effect
    :param ratio: relative variances in the order additive, dominance, AA, AD, DD, residual (six values).
    Default is None, which means [2.0, 1.0, 0.5, 0.5, 0.5, 1.0].
    :param mean: the population mean added to every phenotype. Default is 1.0.
    :param res_var: the residual variance. Default is 1.0.
    :param out_file: the prefix for the output files. Residuals go to ``out_file + '.res'`` and the
    phenotype table to ``out_file + '.pheno'``. Default is 'simu_epistasis'.
    :return: pandas data frame with the first two ``.fam`` columns, a column of ones and the phenotype
    """

    def _rescale_and_sum(effect_df, eff_col, geno, target_var, effect_file):
        # Rescale column eff_col of effect_df so the summed per-locus variance of geno * effect
        # equals target_var, write the table to '<effect_file>.norm' and return each individual's
        # summed contribution computed with the rescaled effects.
        locus_var = np.var(geno * np.array(effect_df.iloc[:, eff_col]), axis=0)
        effect_df.iloc[:, eff_col] = effect_df.iloc[:, eff_col] / np.sqrt(np.sum(locus_var) / target_var)
        effect_df.to_csv(effect_file + '.norm', sep=' ', header=False, index=False)
        return np.sum(geno * np.array(effect_df.iloc[:, eff_col]), axis=1)

    logging.info("Read the SNP")
    snp_mat = read_plink(bed_file)
    num_id, num_snp = snp_mat.shape
    if np.any(np.isnan(snp_mat)):
        logging.warning('Missing genotypes are imputed with random genotypes.')
        snp_mat = impute_geno(snp_mat)
    freq = np.sum(snp_mat, axis=0) / (2 * num_id)
    freq.shape = (1, num_snp)
    snp_matA = snp_mat - 2 * freq
    snp_mat[snp_mat > 1.5] = 0.0  # replace genotype 2 by 0, giving a 0/1/0 coding
    snp_matD = snp_mat - 2 * freq * (1 - freq)
    del snp_mat
    gc.collect()
    if ratio is None:
        ratio = np.array([2.0, 1.0, 0.5, 0.5, 0.5, 1.0])
    else:
        ratio = np.array(ratio)

    # Raw strings below: '\s' is not a valid escape sequence in a normal string literal.
    logging.info("Additive")
    add_snp = pd.read_csv(add_file, header=None, sep=r'\s+')
    add_val = _rescale_and_sum(add_snp, 1, snp_matA[:, add_snp.iloc[:, 0]],
                               ratio[0] / ratio[-1] * res_var, add_file)
    logging.info("Dominance")
    dom_snp = pd.read_csv(dom_file, header=None, sep=r'\s+')
    dom_val = _rescale_and_sum(dom_snp, 1, snp_matD[:, dom_snp.iloc[:, 0]],
                               ratio[1] / ratio[-1] * res_var, dom_file)
    logging.info("Additive by additive epistasis")
    epiAA_snp = pd.read_csv(epiAA_file, header=None, sep=r'\s+')
    epiAA_val = _rescale_and_sum(
        epiAA_snp, 2, snp_matA[:, epiAA_snp.iloc[:, 0]] * snp_matA[:, epiAA_snp.iloc[:, 1]],
        ratio[2] / ratio[-1] * res_var, epiAA_file)
    logging.info("Additive by dominance epistasis")
    epiAD_snp = pd.read_csv(epiAD_file, header=None, sep=r'\s+')
    epiAD_val = _rescale_and_sum(
        epiAD_snp, 2, snp_matA[:, epiAD_snp.iloc[:, 0]] * snp_matD[:, epiAD_snp.iloc[:, 1]],
        ratio[3] / ratio[-1] * res_var, epiAD_file)
    logging.info("Dominance by dominance epistasis")
    epiDD_snp = pd.read_csv(epiDD_file, header=None, sep=r'\s+')
    # Fixed: the dominance-by-dominance variance uses its own ratio entry (index 4); index 3 is
    # the additive-by-dominance entry.
    epiDD_val = _rescale_and_sum(
        epiDD_snp, 2, snp_matD[:, epiDD_snp.iloc[:, 0]] * snp_matD[:, epiDD_snp.iloc[:, 1]],
        ratio[4] / ratio[-1] * res_var, epiDD_file)

    logging.info("Residual")
    res_vec = np.random.normal(0, np.sqrt(res_var), snp_matA.shape[0])
    np.savetxt(out_file + '.res', res_vec)
    logging.info("Phenotypic values")
    pheno_vec = mean + add_val + dom_val + epiAA_val + epiAD_val + epiDD_val + res_vec
    fam_df = pd.read_csv(bed_file + '.fam', sep=r'\s+', header=None)
    # Build the output from the two id columns and add the new columns, instead of overwriting
    # the existing .fam columns in place (writing floats into an integer column relies on
    # implicit upcasting).
    res_df = fam_df.iloc[:, 0:2].copy()
    res_df[2] = 1
    res_df[3] = pheno_vec
    res_df.to_csv(out_file + '.pheno', sep=' ', header=False, index=False)
    return res_df
Example #9
0
def balance_longwas_fixed(data_file,
                          id,
                          tpoint,
                          trait,
                          kin_file,
                          bed_file,
                          var_com,
                          snp_lst=None,
                          tfix=None,
                          fix=None,
                          forder=3,
                          rorder=3,
                          na_method='omit',
                          maxiter=10,
                          cc_par=1.0e-6,
                          cc_gra=1.0e6,
                          em_weight_step=0.001,
                          prefix_outfile='balance_longwas_fixed'):
    """
    Longitudinal GWAS for balanced data, treating each SNP as a fixed regression on Legendre polynomials.

    Invalid input is reported through ``logging`` and terminates the program via ``exit()``. The result table is
    written to ``prefix_outfile + '.res'`` (space separated) and also returned.

    :param data_file: the data file. The first row is the variate names whose first initial position is alphabetical.
    For the class variates, the first letter must be capital; for the covariates (continuous variates), the first letter
    must be lowercase.
    :param id: A class variate name which indicates the individual id column in the data file.
    :param tpoint: A list of corresponding time points for phenotypic values.
    :param trait: A list indicating the columns for recorded phenotypic values. The column index starts from 0 in the
    data file.
    :param kin_file: the file for genomic relationship matrix (three whitespace-separated fields per line:
    id, id, value). This file can be produced by gmat.gmatrix.agmat function using
    agmat(bed_file, inv=True, small_val=0.001, out_fmt='id_id_val')
    :param bed_file: the prefix of the plink binary file (the '.fam' and '.bim' files are read as well).
    :param var_com: variances parameters from the balance_varcom function. Must expose ``shape[0]`` equal to
    rorder * (rorder + 1) + 2 * (rorder + 1) + 1 and a 'var_val' entry (presumably a pandas DataFrame -- confirm
    against balance_varcom).
    :param snp_lst: A list of snp (integer column indices) to test. Default is None, which tests all SNPs.
    :param tfix: A class variate name for the time varied fixed effect. Default value is None. The value must be None
    in the current version.
    :param fix: Expression for the time independent fixed effect. Default value is None. The value must be None
    in the current version.
    :param forder: the order of Legendre polynomials for the time varied fixed effect. The default value is 3.
    :param rorder: the order of Legendre polynomials for time varied random effects (additive genetic effects and
    permanent environment effects). The default value is 3.
    :param na_method: The method to deal with missing values. The default value is 'omit'. 'omit' method will delete the
    row with missing values. 'include' method will fill the missing values with the adjacent values.
    :param maxiter: the maximum number of iteration. Default is 10.
    :param cc_par: Convergence criteria for the changed variance parameters. Default is 1.0e-6.
    :param cc_gra: Convergence criteria for the norm of gradient vector. Default is 1.0e6.
    :param em_weight_step: the step of the em weight. Default is 0.001.
    :param prefix_outfile: the prefix for the output file. Default is 'balance_longwas_fixed'.
    :return: A pandas dataframe of GWAS results with columns order, chro, snp_ID, pos, allele1, allele2,
    cc_par_val, cc_gra_val, eff0..effK, chi_val, p_val, p_min and p_accum.
    """
    logging.info('################################')
    logging.info('###Prepare the related matrix###')
    logging.info('################################')
    if var_com.shape[0] != rorder * (rorder + 1) + 2 * (rorder + 1) + 1:
        logging.info('Variances do not match the data, please check')
        exit()
    logging.info('***Read the data file***')
    logging.info('Data file: ' + data_file)
    data_df = pd.read_csv(data_file, sep=r'\s+', header=0)
    logging.info('NA method: ' + na_method)
    if na_method == 'omit':
        data_df = data_df.dropna()
    elif na_method == 'include':
        # ffill()/bfill() replace the deprecated fillna(method=...) spelling.
        data_df = data_df.ffill().bfill()
    else:
        logging.info('na_method does not exist: ' + na_method)
        exit()
    col_names = data_df.columns
    logging.info('The column names of data file: ' + ' '.join(list(col_names)))
    logging.info(
        'Note: Variates beginning with a capital letter is converted into factors.'
    )
    class_vec = []
    for val in col_names:
        if not val[0].isalpha():
            logging.info(
                "The first character of columns names must be alphabet!")
            exit()
        if val[0] == val.capitalize()[0]:
            # Capitalised name -> class variate, handled as a string factor.
            class_vec.append(val)
            data_df[val] = data_df[val].astype('str')
        else:
            try:
                data_df[val] = data_df[val].astype('float')
            except Exception as e:
                logging.info(e)
                logging.info(val + ": may contain string, please check!")
                exit()
    logging.info('Individual column: ' + id)
    if id not in col_names:
        logging.info(id + ' is not in the data file, please check!')
        exit()
    if id not in class_vec:
        logging.info('The initial letter of {} should be capital'.format(id))
        exit()
    # Keep the original (string) ids before the factor columns are recoded.
    id_in_data_lst = list(data_df[id])
    id_in_data = set(id_in_data_lst)
    logging.info('Trait column: ' + ' '.join(np.array(trait, dtype=str)))
    logging.info('Trait column name: ' + ' '.join(list(col_names[trait])))
    if len(set(col_names[trait]) & set(class_vec)) != 0:
        logging.info(
            'Phenotype should not be defined as class variable, please check!')
        exit()
    logging.info('Code factor variables of the data file: ' +
                 ' '.join(list(class_vec)))
    # Recode every factor level as 1, 2, ... in order of first appearance.
    code_dct = {}
    for val in class_vec:
        level_codes = {}
        code_dct[val] = level_codes
        col_vec = []
        # Iterate over the values themselves: after dropna() the index has
        # gaps, so label-based access with 0..n-1 would raise KeyError.
        for level in data_df[val]:
            if level not in level_codes:
                level_codes[level] = str(len(level_codes) + 1)
            col_vec.append(level_codes[level])
        data_df[val] = np.array(col_vec)
        data_df[val] = data_df[val].astype('int')
    logging.info('***Read the kinship matrix***')
    with open(kin_file) as fin:
        row = []
        col = []
        kin = []
        id_in_kin = set()
        for line in fin:
            arr = line.split()
            # Entries for individuals without records are ignored.
            if arr[0] not in code_dct[id] or arr[1] not in code_dct[id]:
                continue
            id_in_kin.add(arr[0])
            id_in_kin.add(arr[1])
            row.append(int(code_dct[id][arr[0]]))
            col.append(int(code_dct[id][arr[1]]))
            kin.append(float(arr[2]))
        kin = csr_matrix(
            (np.array(kin), (np.array(row) - 1, np.array(col) - 1))).toarray()
        # Mirror the stored entries across the diagonal; the diagonal is
        # doubled by the addition, so it is halved again afterwards.
        kin = np.add(kin, kin.T)
        np.fill_diagonal(kin, 0.5 * np.diag(kin))
        del row, col
        gc.collect()
    logging.info('***Eigen decomposition of kinship matrix***')
    id_not_in_kin = list(id_in_data - id_in_kin)
    if len(id_not_in_kin) != 0:
        logging.info(
            'The ID: {} in the data file is not in the kinship file, please remove these IDs!'
            .format(' '.join(id_not_in_kin)))
        exit()
    kin_eigen_val, kin_eigen_vec = linalg.eigh(kin)
    logging.info('***Build the design matrix for fixed effect***')
    leg_fix = leg(np.array(tpoint), forder)
    leg_fix = np.array(leg_fix).reshape(forder + 1, 1, len(tpoint))
    leg_fix = np.concatenate([leg_fix] * data_df.shape[0], axis=1)
    xmat_t = leg_fix.copy()
    if tfix is not None:
        logging.info('The parameter tfix should be None in current version.')
        exit()
    # Rotate the design matrix by the transposed kinship eigenvectors.
    xmat_t = np.matmul(np.array([kin_eigen_vec.T]), xmat_t)
    xmat_t = xmat_t.transpose(1, 2, 0)
    if fix is not None:
        logging.info('The parameter fix should be None in current version')
        exit()
    # T matrix for random effect
    leg_tp = leg(np.array(tpoint), rorder)
    leg_tp = np.concatenate(leg_tp, axis=1)
    y = np.array(data_df.iloc[:, trait])
    y = np.dot(kin_eigen_vec.T, y)
    y = y.reshape(data_df.shape[0], len(tpoint), 1)
    logging.info('***Read the snp data***')
    snp_mat = read_plink(bed_file)
    if np.any(np.isnan(snp_mat)):
        logging.info('Missing genotypes are imputed with random genotypes.')
        snp_mat = impute_geno(snp_mat)
    num_id = snp_mat.shape[0]
    num_snp = snp_mat.shape[1]
    logging.info("There are {:d} individuals and {:d} SNPs.".format(
        num_id, num_snp))
    fam_df = pd.read_csv(bed_file + '.fam', sep=r'\s+', header=None)
    id_in_geno = list(np.array(fam_df.iloc[:, 1], dtype=str))
    if len(set(id_in_data_lst) - set(id_in_geno)) != 0:
        logging.info(' '.join(list(set(id_in_data_lst) - set(id_in_geno))) +
                     ' in the data file is not in the snp file!')
        exit()
    # snp list
    if snp_lst is None:
        snp_lst = range(num_snp)
    else:
        try:
            snp_lst = np.array(snp_lst, dtype=int)
        except Exception as e:
            logging.info(e)
            logging.info('The snp list value should be int')
            exit()
    snp_lst = list(snp_lst)
    # Map each id to its first position in the genotype file once, instead of
    # a linear list.index() search per record.
    geno_pos = {}
    for pos, geno_id in enumerate(id_in_geno):
        geno_pos.setdefault(geno_id, pos)
    id_in_data_index = [geno_pos[i] for i in id_in_data_lst]
    # Reorder genotype rows to follow the data file, then keep requested SNPs.
    snp_mat = snp_mat[id_in_data_index, :]
    snp_mat = snp_mat[:, snp_lst]
    logging.info(
        '###################################################################')
    logging.info(
        '###Start the fixed regression longitudinal GWAS for balance data###')
    logging.info(
        '###################################################################')
    leg_tpoint_mat = leg_mt(np.array(tpoint), max(tpoint), min(tpoint), forder)
    leg_tpoint_accum = np.sum(leg_tpoint_mat, axis=0)
    cc_par_vec = []
    cc_gra_vec = []
    eff_vec = []
    chi_vec = []
    p_vec = []
    p_min_vec = []
    p_accum_vec = []
    for i in tqdm(range(snp_mat.shape[1])):
        # SNP-by-polynomial covariates, rotated like the other fixed effects
        # and appended to them.
        snp_fix = np.multiply(leg_fix, snp_mat[:, i].reshape(1, num_id, 1))
        snp_fix = np.matmul(np.array([kin_eigen_vec.T]), snp_fix)
        snp_fix = snp_fix.transpose(1, 2, 0)
        snp_fix = np.concatenate((xmat_t, snp_fix), axis=2)
        res1 = balance_longwas_emai(y,
                                    snp_fix,
                                    leg_tp,
                                    kin_eigen_val,
                                    init=var_com['var_val'],
                                    maxiter=maxiter,
                                    cc_par=cc_par,
                                    cc_gra=cc_gra,
                                    em_weight_step=em_weight_step)
        # res1 layout as consumed here: convergence values (0, 1), SNP
        # effects (2), chi-square (3), p value (4), effect covariance (5).
        cc_par_vec.append(res1[0])
        cc_gra_vec.append(res1[1])
        eff = res1[2]
        eff_vec.append(eff)
        chi_vec.append(res1[3])
        p_vec.append((res1[4]))
        eff_var = res1[5]
        # One-degree-of-freedom test of the SNP effect at each time point.
        p_tpoint_vec = []
        for k in range(leg_tpoint_mat.shape[0]):
            eff_tpoint = np.sum(np.dot(leg_tpoint_mat[k, :], eff))
            eff_var_tpoint = np.sum(
                np.dot(leg_tpoint_mat[k, :],
                       np.dot(eff_var, leg_tpoint_mat[k, :])))
            chi_tpoint = eff_tpoint * eff_tpoint / eff_var_tpoint
            p_tpoint = chi2.sf(chi_tpoint, 1)
            p_tpoint_vec.append(p_tpoint)
        p_min_vec.append(min(p_tpoint_vec))
        # Same test for the effect accumulated over all time points.
        eff_accum = np.sum(np.dot(leg_tpoint_accum, eff))
        eff_var_accum = np.sum(
            np.dot(leg_tpoint_accum, np.dot(eff_var, leg_tpoint_accum)))
        chi_accum = eff_accum * eff_accum / eff_var_accum
        p_accum = chi2.sf(chi_accum, 1)
        p_accum_vec.append(p_accum)
    logging.info('Finish association analysis')
    logging.info('***Output***')
    snp_info_file = bed_file + '.bim'
    snp_info = pd.read_csv(snp_info_file, sep=r'\s+', header=None)
    # Work on explicit copies so the column assignments below never target a
    # view of snp_info (avoids SettingWithCopyWarning).
    res_df = snp_info.iloc[snp_lst, [0, 1, 3, 4, 5]].copy()
    res_df.columns = ['chro', 'snp_ID', 'pos', 'allele1', 'allele2']
    res_df.loc[:, 'order'] = snp_lst
    res_df = res_df.iloc[:, [5, 0, 1, 2, 3, 4]].copy()
    res_df.loc[:, 'cc_par_val'] = cc_par_vec
    res_df.loc[:, 'cc_gra_val'] = cc_gra_vec
    eff_vec = np.array(eff_vec)
    for i in range(eff_vec.shape[1]):
        col_ind = 'eff' + str(i)
        res_df.loc[:, col_ind] = eff_vec[:, i]
    res_df.loc[:, 'chi_val'] = chi_vec
    res_df.loc[:, 'p_val'] = p_vec
    res_df.loc[:, 'p_min'] = p_min_vec
    res_df.loc[:, 'p_accum'] = p_accum_vec
    out_file = prefix_outfile + '.res'
    res_df.to_csv(out_file, sep=' ', index=False)
    return res_df
Example #10
0
def remma_epiAA_eff_cpu(y,
                        xmat,
                        zmat,
                        gmat_lst,
                        var_com,
                        bed_file,
                        snp_lst_0=None,
                        eff_cut=-999.0,
                        out_file='remma_epiAA_eff_cpu'):
    """
    Additive-by-additive epistasis: estimate the effect of SNP pairs.

    For each SNP i in snp_lst_0, effects are computed for the pairs (i, j) with j > i.
    :param y: phenotypic values
    :param xmat: design matrix for fixed effects
    :param zmat: design matrix for random effects (csr sparse matrix)
    :param gmat_lst: list of genomic relationship matrices
    :param var_com: variance components; var_com[k] scales gmat_lst[k] and var_com[-1] is placed on the diagonal
    :param bed_file: plink file
    :param snp_lst_0: list of first SNPs of the pairs; the min value is 0 and the max value is num_snp - 2.
    Default is None, which uses all of them.
    :param eff_cut: pairs whose absolute effect exceeds this threshold are kept
    :param out_file: output file
    :return: numpy array with columns snp_0, snp_1, eff (also written to out_file)
    """
    logging.info("计算V矩阵及其逆矩阵")
    y = np.array(y).reshape(-1, 1)
    n = y.shape[0]
    xmat = np.array(xmat).reshape(n, -1)
    # Phenotypic covariance: diagonal term plus one Z G Z' term per matrix.
    phen_cov = np.diag([var_com[-1]] * n)
    for k in range(len(gmat_lst)):
        phen_cov += zmat.dot((zmat.dot(gmat_lst[k])).T) * var_com[k]
    del gmat_lst
    gc.collect()
    phen_cov_inv = np.linalg.inv(phen_cov)
    logging.info("计算P矩阵")
    vinv_x = np.dot(phen_cov_inv, xmat)
    xvx_inv = np.linalg.inv(np.dot(xmat.T, vinv_x))
    # P = V^-1 - V^-1 X (X' V^-1 X)^-1 X' V^-1
    proj = phen_cov_inv - np.dot(np.dot(vinv_x, xvx_inv), vinv_x.T)
    pymat = zmat.T.dot(np.dot(proj, y))
    del phen_cov, phen_cov_inv, proj
    gc.collect()
    logging.info("读取SNP文件")
    snp_mat = read_plink(bed_file)
    num_id, num_snp = snp_mat.shape
    if np.any(np.isnan(snp_mat)):
        logging.warning('Missing genotypes are imputed with random genotypes.')
        snp_mat = impute_geno(snp_mat)
    # Centre each SNP column by twice its allele frequency.
    freq = (np.sum(snp_mat, axis=0) / (2 * num_id)).reshape(1, -1)
    snp_mat = snp_mat - 2 * freq
    logging.info('检验')
    if snp_lst_0 is None:
        snp_lst_0 = range(num_snp - 1)
    elif max(snp_lst_0) >= num_snp - 1 or min(snp_lst_0) < 0:
        logging.error('snp_lst_0 is out of range!')
        sys.exit()
    clock_t0 = time.perf_counter()
    cpu_t0 = time.process_time()
    kept_blocks = []
    for first in snp_lst_0:
        partner_start = first + 1
        # Pair SNP `first` with every SNP after it.
        epi_mat = snp_mat[:, first:partner_start] * snp_mat[:, partner_start:]
        eff_vec = np.dot(epi_mat.T, pymat)
        first_col = np.array([first] * (num_snp - first - 1)).reshape(-1, 1)
        second_col = np.arange(partner_start, num_snp).reshape(-1, 1)
        res = np.concatenate([first_col, second_col, eff_vec], axis=1)
        keep = np.abs(res[:, -1]) > eff_cut
        kept_blocks.append(res[keep, :])
    clock_t1 = time.perf_counter()
    cpu_t1 = time.process_time()
    logging.info(
        "Running time: Clock time, {:.5f} sec; CPU time, {:.5f} sec.".format(
            clock_t1 - clock_t0, cpu_t1 - cpu_t0))
    res_all = np.concatenate(kept_blocks, axis=0)
    np.savetxt(out_file, res_all, header='snp_0 snp_1 eff', comments='')
    return res_all
Example #11
0
def remma_epiAA_pair_cpu(y,
                         xmat,
                         zmat,
                         gmat_lst,
                         var_com,
                         bed_file,
                         snp_pair_file,
                         max_test_pair=50000,
                         p_cut=1.0e-4,
                         out_file='remma_epiAA_pair_cpu'):
    """
    Additive-by-additive epistasis test for the SNP pairs listed in a file.

    The pair file is processed in batches of max_test_pair rows; its first line is skipped. Pairs with
    p < p_cut are appended to out_file below a 'snp_0 snp_1 eff var chi p' header line.
    :param y: phenotypic values
    :param xmat: design matrix for fixed effects
    :param zmat: design matrix for random effects (csr sparse matrix)
    :param gmat_lst: list of genomic relationship matrices
    :param var_com: variance components; var_com[k] scales gmat_lst[k] and var_com[-1] is placed on the diagonal
    :param bed_file: plink file
    :param snp_pair_file: SNP pair file; only the first two columns are used. The min value is 0 and the max
    value is num_snp - 1.
    :param max_test_pair: the max number of SNP pairs tested per batch. Default is 50000.
    :param p_cut: p value threshold. Default is 1.0e-4.
    :param out_file: output file
    :return: 0
    """
    logging.info("计算V矩阵及其逆矩阵")
    y = np.array(y).reshape(-1, 1)
    n = y.shape[0]
    xmat = np.array(xmat).reshape(n, -1)
    # Phenotypic covariance: diagonal term plus one Z G Z' term per matrix.
    vmat = np.diag([var_com[-1]] * n)
    for val in range(len(gmat_lst)):
        vmat += zmat.dot((zmat.dot(gmat_lst[val])).T) * var_com[val]
    del gmat_lst
    gc.collect()
    vmat_inv = np.linalg.inv(vmat)
    logging.info("计算P矩阵")
    vxmat = np.dot(vmat_inv, xmat)
    xvxmat = np.dot(xmat.T, vxmat)
    xvxmat = np.linalg.inv(xvxmat)
    # P = V^-1 - V^-1 X (X' V^-1 X)^-1 X' V^-1
    pmat = reduce(np.dot, [vxmat, xvxmat, vxmat.T])
    pmat = vmat_inv - pmat
    pymat = zmat.T.dot(np.dot(pmat, y))
    pvpmat = reduce(np.dot, [pmat, vmat, pmat])
    pvpmat = zmat.T.dot((zmat.T.dot(pvpmat)).T)
    del vmat, vmat_inv, pmat
    gc.collect()
    logging.info("读取SNP文件")
    snp_mat = read_plink(bed_file)
    num_id, num_snp = snp_mat.shape
    if np.any(np.isnan(snp_mat)):
        logging.warning('Missing genotypes are imputed with random genotypes.')
        snp_mat = impute_geno(snp_mat)
    # Centre each SNP column by twice its allele frequency.
    freq = np.sum(snp_mat, axis=0) / (2 * num_id)
    freq.shape = (1, -1)
    snp_mat = snp_mat - 2 * freq
    logging.info("开始检验")
    np.savetxt(out_file, ['snp_0 snp_1 eff var chi p'], fmt='%s')
    clock_t0 = time.perf_counter()
    cpu_t0 = time.process_time()
    ipart = -1
    while True:
        ipart += 1
        # "1 +" skips the first line of the pair file on every read.
        skiprows = 1 + ipart * max_test_pair
        try:
            snp_pair = pd.read_csv(snp_pair_file,
                                   header=None,
                                   sep=r'\s+',
                                   skiprows=skiprows,
                                   nrows=max_test_pair)
        except Exception as e:
            # Reading past the end of the file raises; that ends the loop.
            logging.info(e)
            break
        # np.int was removed in NumPy 1.24; the builtin int is what it aliased.
        snp_pair = np.array(snp_pair.iloc[:, 0:2], dtype=int)
        if np.max(snp_pair) > num_snp - 1 or np.min(snp_pair) < 0:
            logging.error('snp_pair is out of range!')
            sys.exit()
        epi_mat = snp_mat[:, snp_pair[:, 0]] * snp_mat[:, snp_pair[:, 1]]
        eff_vec = np.dot(epi_mat.T, pymat)
        # Column-wise quadratic form: diagonal of epi' (Z' PVP Z) epi.
        var_vec = np.sum(epi_mat * np.dot(pvpmat, epi_mat), axis=0)
        var_vec = var_vec.reshape(-1, 1)
        chi_vec = eff_vec * eff_vec / var_vec
        p_vec = chi2.sf(chi_vec, 1)
        res = pd.DataFrame({
            0: snp_pair[:, 0],
            1: snp_pair[:, 1],
            2: eff_vec[:, -1],
            3: var_vec[:, -1],
            4: chi_vec[:, -1],
            5: p_vec[:, -1]
        })
        res = res[res[5] < p_cut]
        res.to_csv(out_file, sep=' ', header=False, index=False, mode='a')
    clock_t1 = time.perf_counter()
    cpu_t1 = time.process_time()
    logging.info(
        "Running time: Clock time, {:.5f} sec; CPU time, {:.5f} sec.".format(
            clock_t1 - clock_t0, cpu_t1 - cpu_t0))
    return 0
Example #12
0
def remma_epiAA_select_cpu(y,
                           xmat,
                           zmat,
                           gmat_lst,
                           var_com,
                           bed_file,
                           snp_lst_0=None,
                           snp_lst_1=None,
                           p_cut=1.0,
                           out_file='remma_epiAA_select_cpu'):
    """
    Additive-by-additive epistasis test between two selected SNP lists.

    Every SNP in snp_lst_0 is paired with the SNPs in snp_lst_1 (one occurrence of the SNP itself is dropped).
    :param y: phenotypic values
    :param xmat: design matrix for fixed effects
    :param zmat: design matrix for random effects (csr sparse matrix)
    :param gmat_lst: list of genomic relationship matrices
    :param var_com: variance components; var_com[k] scales gmat_lst[k] and var_com[-1] is placed on the diagonal
    :param bed_file: plink file
    :param snp_lst_0: list of first SNPs of the pairs; the min value is 0 and the max value is num_snp - 1.
    Default is None, which uses all SNPs.
    :param snp_lst_1: list of second SNPs of the pairs; the min value is 0 and the max value is num_snp - 1.
    Default is None, which uses all SNPs.
    :param p_cut: p value threshold; pairs whose chi-square exceeds the matching cut-off are kept
    :param out_file: output file
    :return: numpy array with columns snp_0, snp_1, eff, var, chi (also written to out_file)
    """
    logging.info("计算V矩阵及其逆矩阵")
    y = np.array(y).reshape(-1, 1)
    n = y.shape[0]
    xmat = np.array(xmat).reshape(n, -1)
    # Phenotypic covariance: diagonal term plus one Z G Z' term per matrix.
    phen_cov = np.diag([var_com[-1]] * n)
    for k in range(len(gmat_lst)):
        phen_cov += zmat.dot((zmat.dot(gmat_lst[k])).T) * var_com[k]
    del gmat_lst
    gc.collect()
    phen_cov_inv = np.linalg.inv(phen_cov)
    logging.info("计算P矩阵")
    vinv_x = np.dot(phen_cov_inv, xmat)
    xvx_inv = np.linalg.inv(np.dot(xmat.T, vinv_x))
    # P = V^-1 - V^-1 X (X' V^-1 X)^-1 X' V^-1
    proj = phen_cov_inv - np.dot(np.dot(vinv_x, xvx_inv), vinv_x.T)
    pymat = zmat.T.dot(np.dot(proj, y))
    pvpmat = np.dot(np.dot(proj, phen_cov), proj)
    pvpmat = zmat.T.dot((zmat.T.dot(pvpmat)).T)
    del phen_cov, phen_cov_inv, proj
    gc.collect()
    logging.info("读取SNP文件")
    snp_mat = read_plink(bed_file)
    num_id, num_snp = snp_mat.shape
    if np.any(np.isnan(snp_mat)):
        logging.warning('Missing genotypes are imputed with random genotypes.')
        snp_mat = impute_geno(snp_mat)
    # Centre each SNP column by twice its allele frequency.
    freq = (np.sum(snp_mat, axis=0) / (2 * num_id)).reshape(1, num_snp)
    snp_mat = snp_mat - 2 * freq
    logging.info("开始检验")
    if snp_lst_0 is None:
        snp_lst_0 = range(num_snp)
    elif max(snp_lst_0) > num_snp - 1 or min(snp_lst_0) < 0:
        logging.error('snp_lst_0 is out of range!')
        sys.exit()
    if snp_lst_1 is None:
        snp_lst_1 = range(num_snp)
    elif max(snp_lst_1) > num_snp - 1 or min(snp_lst_1) < 0:
        logging.error('snp_lst_1 is out of range!')
        sys.exit()
    snp_lst_0 = list(snp_lst_0)
    snp_lst_1 = list(snp_lst_1)
    chi2_cut = chi2.isf(p_cut, 1)
    clock_t0 = time.perf_counter()
    cpu_t0 = time.process_time()
    kept_blocks = []
    for first in snp_lst_0:
        # Partner list without (the first occurrence of) the SNP itself.
        partners = list(snp_lst_1)
        if first in partners:
            partners.remove(first)
        epi_mat = snp_mat[:, first:(first + 1)] * snp_mat[:, partners]
        eff_vec = np.dot(epi_mat.T, pymat)
        # Column-wise quadratic form: diagonal of epi' (Z' PVP Z) epi.
        var_vec = np.sum(epi_mat * np.dot(pvpmat, epi_mat), axis=0)
        var_vec = var_vec.reshape(-1, 1)
        chi_vec = eff_vec * eff_vec / var_vec
        first_col = np.array([first] * len(partners)).reshape(-1, 1)
        second_col = np.array(partners).reshape(-1, 1)
        res = np.concatenate(
            [first_col, second_col, eff_vec, var_vec, chi_vec], axis=1)
        keep = res[:, -1] > chi2_cut
        kept_blocks.append(res[keep, :])
    clock_t1 = time.perf_counter()
    cpu_t1 = time.process_time()
    logging.info(
        "Running time: Clock time, {:.5f} sec; CPU time, {:.5f} sec.".format(
            clock_t1 - clock_t0, cpu_t1 - cpu_t0))
    res_all = np.concatenate(kept_blocks, axis=0)
    np.savetxt(out_file,
               res_all,
               header='snp_0 snp_1 eff var chi',
               comments='')
    return res_all
Example #13
0
def _remma_epiAD_pair(y,
                      xmat,
                      zmat,
                      gmat_lst,
                      var_com,
                      bed_file,
                      snp_pair_file,
                      max_test_pair=50000,
                      p_cut=1.0e-4,
                      out_file='epiAD_pair'):
    """
    Given a SNP pair file, perform additive by dominance epistasis test by random SNP-BLUP model.

    The pair file is processed in batches of max_test_pair rows; its first line is skipped. Pairs with
    p < p_cut are appended to out_file below a 'snp_0 snp_1 eff var chi p' header line.
    :param y: phenotypic vector
    :param xmat: Designed matrix for fixed effect
    :param zmat: csr sparse matrix. Designed matrix for random effect.
    :param gmat_lst: A list for relationship matrix
    :param var_com: Estimated variances; var_com[k] scales gmat_lst[k] and var_com[-1] is placed on the diagonal.
    :param bed_file: the prefix for plink binary file
    :param snp_pair_file: a file containing index for SNP pairs. The program only reads the first two columns and test
    SNP pairs row by row. The first column is used with the additive coding and the second with the dominance
    coding. The max value is num_snp - 1, and the min value is 0.
    :param max_test_pair: The max number of SNP pairs stored in memory. Default value is 50000.
    :param p_cut: p value cut-off. default value is 0.0001.
    :param out_file: output file. default value is 'epiAD_pair'.
    :return: 0
    """
    logging.info("Calculate the phenotypic covariance matrix and inversion")
    y = np.array(y).reshape(-1, 1)
    n = y.shape[0]
    xmat = np.array(xmat).reshape(n, -1)
    # Phenotypic covariance: diagonal term plus one Z G Z' term per matrix.
    vmat = np.diag([var_com[-1]] * n)
    for val in range(len(gmat_lst)):
        vmat += zmat.dot((zmat.dot(gmat_lst[val])).T) * var_com[val]
    del gmat_lst
    gc.collect()
    vmat_inv = np.linalg.inv(vmat)
    logging.info("Calculate P matrix")
    vxmat = np.dot(vmat_inv, xmat)
    xvxmat = np.dot(xmat.T, vxmat)
    xvxmat = np.linalg.inv(xvxmat)
    # P = V^-1 - V^-1 X (X' V^-1 X)^-1 X' V^-1
    pmat = reduce(np.dot, [vxmat, xvxmat, vxmat.T])
    pmat = vmat_inv - pmat
    pymat = zmat.T.dot(np.dot(pmat, y))
    # P V P equals P (original author's note), so P is used directly instead
    # of forming the triple product.
    pvpmat = zmat.T.dot((zmat.T.dot(pmat)).T)
    del vmat, vmat_inv, pmat
    gc.collect()
    logging.info("Read the SNP")
    snp_mat = read_plink(bed_file)
    num_id, num_snp = snp_mat.shape
    if np.any(np.isnan(snp_mat)):
        logging.warning('Missing genotypes are imputed with random genotypes.')
        snp_mat = impute_geno(snp_mat)
    freq = np.sum(snp_mat, axis=0) / (2 * num_id)
    freq.shape = (1, -1)
    # Additive coding: genotype centred by twice the allele frequency.
    snp_matA = snp_mat - 2 * freq
    # Dominance coding: replace 2 with 0 (giving 0/1/0), then centre by 2pq.
    snp_mat[snp_mat > 1.5] = 0.0
    snp_matD = snp_mat - 2 * freq * (1 - freq)
    del snp_mat
    gc.collect()
    logging.info("Test")
    np.savetxt(out_file, ['snp_0 snp_1 eff var chi p'], fmt='%s')
    clock_t0 = time.perf_counter()
    cpu_t0 = time.process_time()
    ipart = -1
    while True:
        ipart += 1
        # "1 +" skips the first line of the pair file on every read.
        skiprows = 1 + ipart * max_test_pair
        try:
            snp_pair = pd.read_csv(snp_pair_file,
                                   header=None,
                                   sep=r'\s+',
                                   skiprows=skiprows,
                                   nrows=max_test_pair)
        except Exception as e:
            # Reading past the end of the file raises; that ends the loop.
            logging.info(e)
            break
        # np.int was removed in NumPy 1.24; the builtin int is what it aliased.
        snp_pair = np.array(snp_pair.iloc[:, 0:2], dtype=int)
        if np.max(snp_pair) > num_snp - 1 or np.min(snp_pair) < 0:
            logging.error('snp_pair is out of range!')
            sys.exit()
        epi_mat = snp_matA[:, snp_pair[:, 0]] * snp_matD[:, snp_pair[:, 1]]
        eff_vec = np.dot(epi_mat.T, pymat)
        # Column-wise quadratic form: diagonal of epi' (Z' P Z) epi.
        var_vec = np.sum(epi_mat * np.dot(pvpmat, epi_mat), axis=0)
        var_vec = var_vec.reshape(-1, 1)
        chi_vec = eff_vec * eff_vec / var_vec
        p_vec = chi2.sf(chi_vec, 1)
        res = pd.DataFrame({
            0: snp_pair[:, 0],
            1: snp_pair[:, 1],
            2: eff_vec[:, -1],
            3: var_vec[:, -1],
            4: chi_vec[:, -1],
            5: p_vec[:, -1]
        })
        res = res[res[5] < p_cut]
        res.to_csv(out_file, sep=' ', header=False, index=False, mode='a')
    clock_t1 = time.perf_counter()
    cpu_t1 = time.process_time()
    logging.info(
        "Running time: Clock time, {:.5f} sec; CPU time, {:.5f} sec.".format(
            clock_t1 - clock_t0, cpu_t1 - cpu_t0))
    return 0
Example #14
0
def agmat(bed_file, inv=False, small_val=0.001, out_fmt='mat'):
    """
    Build the additive genomic relationship matrix and, optionally, its inverse.
    :param bed_file: The prefix for plink binary file
    :param inv: Whether to calculate the inversion. Default value is False
    :param small_val: Each diagonal element is inflated by this fraction of itself to grant the positive definite.
    Default value is 0.001.
    :param out_fmt: the output format. mat: matrix format (default); row_col_val: row-column-value format;
    id_id_val: id-id-value format.
    :return: a tuple (kin, kin_inv) of numpy arrays; kin_inv is None when inv is False. The matrix is written to
    bed_file + '.agrm' and the inverse, when requested, to bed_file + '.agiv'.
    """
    logging.info("{:#^80}".format("Read the SNP data"))
    snp_mat = read_plink(bed_file)
    if np.any(np.isnan(snp_mat)):
        logging.info('Missing genotypes are imputed with random genotypes.')
        snp_mat = impute_geno(snp_mat)
    num_id = snp_mat.shape[0]  # number of individuals
    num_snp = snp_mat.shape[1]  # number of SNPs
    logging.info("There are {:d} individuals and {:d} SNPs.".format(
        num_id, num_snp))
    freq = (np.sum(snp_mat, axis=0) / (2 * num_id)).reshape(1, num_snp)
    # Scaling factor: sum over SNPs of 2p(1-p).
    scale = np.sum(2 * freq * (1 - freq))
    logging.info('The scaled factor is: {:.3f}'.format(scale))
    snp_mat = snp_mat - 2 * freq

    logging.info(
        "{:#^80}".format("Calculate the additive genomic relationship matrix"))
    clock_t0 = time.perf_counter()
    cpu_t0 = time.process_time()
    kin = np.dot(snp_mat, snp_mat.T) / scale
    diag_val = np.diag(kin)
    # Inflate the diagonal by a fraction of itself.
    np.fill_diagonal(kin, diag_val + diag_val * small_val)
    clock_t1 = time.perf_counter()
    cpu_t1 = time.process_time()
    logging.info(
        "Running time: Clock time, {:.5f} sec; CPU time, {:.5f} sec.".format(
            clock_t1 - clock_t0, cpu_t1 - cpu_t0))

    logging.info("{:#^80}".format("Output"))
    fam_info = pd.read_csv(bed_file + '.fam', sep='\s+', header=None)
    ind_id = np.array(fam_info.iloc[:, 1])
    out_file = bed_file + '.agrm'
    logging.info("The output file is " + out_file)
    # A return value of 0 from output_mat is treated as an unknown format.
    if output_mat(kin, ind_id, out_file, out_fmt) == 0:
        logging.error('Not Recognized output format: ' + out_fmt)
        sys.exit()
    if not inv:
        return kin, None

    logging.info("{:#^80}".format("Calculate the inversion of kinship"))
    clock_t0 = time.perf_counter()
    cpu_t0 = time.process_time()
    kin_inv = linalg.inv(kin)
    clock_t1 = time.perf_counter()
    cpu_t1 = time.process_time()
    logging.info(
        "Running time: Clock time, {:.5f} sec; CPU time, {:.5f} sec.".
        format(clock_t1 - clock_t0, cpu_t1 - cpu_t0))
    logging.info("{:#^80}".format("Output the inversion"))
    out_file = bed_file + '.agiv'
    logging.info("The output file is: " + out_file)
    output_mat(kin_inv, ind_id, out_file, out_fmt)
    return kin, kin_inv
Example #15
0
def remma_epiAA_maf_approx_parallel(pheno_file,
                                    bed_file,
                                    gmat_lst,
                                    var_com,
                                    parallel,
                                    p_cut=1.0e-5,
                                    num_random_pair=100000,
                                    out_file='epiAA_maf_approx_parallel'):
    """
    Additive-by-additive epistasis test (random SNP-BLUP model) using an
    approximate, MAF-binned test statistic, for one part of a parallel run.

    Steps performed, all file names being suffixed with ``parallel[1]``:
      1. Draw ``num_random_pair`` random SNP pairs and test them with
         ``remma_epiAA_pair`` (result kept in ``<out_file>.random.<part>``).
      2. Compute the minor allele frequency of every SNP, write it to
         ``<out_file>.freq.<part>``, and bin it as ``int(maf * 20)``.
      3. Average the third-from-last column of the random-pair result per
         (bin, bin) combination and write the table to
         ``<out_file>.freq_denominator.<part>``.
      4. Screen pairs with ``remma_epiAA_maf_eff_parallel``, re-test the
         selected pairs with ``remma_epiAA_pair`` and merge both p values
         into ``<out_file>.<part>``.

    :param pheno_file: phenotypic file. The first two columns are family id and individual id, the same as in the plink
    *.fam file. The third column is always ones for the population mean. The last column is the phenotypic value. Other
    covariates can be added between the population-mean column and the phenotypic values.
    :param bed_file: the prefix of the plink binary files
    :param gmat_lst: a list of genomic relationship matrices
    :param var_com: estimated variances
    :param parallel: a list of two integers: the number of parts and the part to run (e.g. [3, 1], [3, 2], [3, 3]).
    Only the second integer is used here (as file-name suffix); the list itself is forwarded to
    remma_epiAA_maf_eff_parallel.
    :param p_cut: p-value threshold forwarded to the screening step. Default value is 1.0e-5.
    :param num_random_pair: the number of random pairs used to build the approximate denominators. Default value is
    100,000
    :param out_file: the prefix for output files. Default value is 'epiAA_maf_approx_parallel'
    :return: 0
    """
    logging.info(
        "\n\n#####Randomly select {:d} pairs, and test these SNP pairs#####".
        format(num_random_pair))
    # One row per SNP in the .bim file, so the row count is the SNP count.
    marker_total = pd.read_csv(bed_file + '.bim', header=None).shape[0]
    part = str(parallel[1])
    pair_path = out_file + '.random_pair.' + part
    random_path = out_file + '.random.' + part
    random_pair(marker_total, out_file=pair_path, num_pair=num_random_pair)
    remma_epiAA_pair(pheno_file,
                     bed_file,
                     gmat_lst,
                     var_com,
                     snp_pair_file=pair_path,
                     p_cut=1,
                     out_file=random_path)
    os.remove(pair_path)

    logging.info(
        "\n\n#####Calcualte the approximate denominator for Wald chi-square test#####"
    )
    geno = read_plink(bed_file)
    num_id, _ = geno.shape
    if np.any(np.isnan(geno)):
        logging.warning('Missing genotypes are imputed with random genotypes.')
        geno = impute_geno(geno)
    maf = 1 - np.sum(geno, axis=0) / (2 * num_id)
    above_half = maf > 0.5
    maf[above_half] = 1 - maf[above_half]  # fold to minor allele frequency
    np.savetxt(out_file + '.freq.' + part, maf)
    # Integer MAF bin per SNP: maf in [0, 0.5] -> bin in 0..10.
    maf_bin = [np.longlong(value) for value in maf * 20]
    bin_of_snp = {str(idx): code for idx, code in enumerate(maf_bin)}

    # Per (bin, bin) key: running sum and number of contributing pairs.
    # Both orderings of a pair are counted so the table is symmetric.
    bin_sum = {}
    bin_count = {}
    with open(random_path) as fin:
        fin.readline()  # skip the header line
        for line in fin:
            fields = line.split()
            first = str(bin_of_snp[fields[0]])
            second = str(bin_of_snp[fields[1]])
            for key in (' '.join([first, second]), ' '.join([second, first])):
                bin_count[key] = bin_count.get(key, 0) + 1
                bin_sum[key] = bin_sum.get(key, 0.0) + float(fields[-3])

    # Turn the sums into per-key means and compute the overall mean, which
    # serves as fallback for bin combinations never hit by a random pair.
    grand_sum = 0
    grand_count = 0
    for key, count in bin_count.items():
        grand_sum += bin_sum[key]
        grand_count += count
        bin_sum[key] = bin_sum[key] / count
    overall_mean = grand_sum / grand_count

    # NOTE(review): the index key1 * 10 + key2 is not unique when a bin is 10
    # (MAF exactly 0.5), e.g. (0, 10) and (1, 0) both map to 10 - confirm
    # against the lookup used in remma_epiAA_maf_eff_parallel before changing.
    freq_deno = np.ones(111)
    observed_bins = set(maf_bin)
    with open(out_file + '.freq_denominator.' + part, 'w') as fout:
        for key1 in observed_bins:
            for key2 in observed_bins:
                key = ' '.join([str(key1), str(key2)])
                denominator = bin_sum.setdefault(key, overall_mean)
                fout.write(key + ' ' + str(denominator) + '\n')
                freq_deno[key1 * 10 + key2] = denominator

    logging.info(
        "\n\n#####Screen the epistatic effects and select top SNP pairs based on approximate test#####"
    )
    remma_epiAA_maf_eff_parallel(pheno_file,
                                 bed_file,
                                 gmat_lst,
                                 var_com,
                                 parallel=parallel,
                                 freq=np.array(maf_bin, dtype=np.longlong),
                                 freq_deno=freq_deno,
                                 p_cut=p_cut,
                                 out_file=out_file + '.approx_p')

    logging.info("\n\n#####Calculate exact p values for top SNP pairs#####")
    # presumably the screening step writes '<out_file>.approx_p.<part>';
    # verify against remma_epiAA_maf_eff_parallel.
    approx_path = out_file + '.approx_p.' + part
    exact_path = out_file + '.exact_p.' + part
    remma_epiAA_pair(pheno_file,
                     bed_file,
                     gmat_lst,
                     var_com,
                     snp_pair_file=approx_path,
                     p_cut=1,
                     out_file=exact_path)

    logging.info("\n\n#####Merge the results#####")
    # Map "snp_0 snp_1" -> last column of the approximate result.
    approx_p = {}
    with open(approx_path, 'r') as fin:
        for line in fin:
            fields = line.split()
            approx_p[' '.join(fields[:2])] = fields[-1]
    # Insert the approximate value just before the last column of each
    # exact-test line.
    with open(exact_path, 'r') as fin, open(out_file + '.' + part,
                                            'w') as fout:
        for line in fin:
            fields = line.split()
            merged = fields[:-1] + [approx_p[' '.join(fields[:2])]
                                    ] + fields[-1:]
            fout.write(' '.join(merged) + '\n')
    os.remove(approx_path)
    os.remove(exact_path)
    return 0
Example #16
0
def remma_epiAA_eff_gpu(y,
                        xmat,
                        gmat_lst,
                        var_com,
                        bed_file,
                        snp_lst_0=None,
                        max_test_pair=50000,
                        eff_cut=-999.0,
                        out_file='remma_epiAA_eff_gpu'):
    """
    Additive-by-additive epistatic effect scan, accelerated on GPU via CuPy.

    For every SNP ``s`` in ``snp_lst_0`` and every SNP ``t > s`` the value
    ``(z_s * z_t)' P y`` is computed, where ``z`` are centred genotype
    columns and ``P`` is the projection matrix built from ``xmat``,
    ``gmat_lst`` and ``var_com``. Second SNPs are processed in chunks of
    ``max_test_pair`` columns to bound GPU memory.

    :param y: phenotypes
    :param xmat: design matrix of the fixed effects
    :param gmat_lst: list of genomic relationship matrices
    :param var_com: variance components; var_com[k] belongs to gmat_lst[k] and the last element is used for the
    diagonal (residual) term
    :param bed_file: prefix of the plink file
    :param snp_lst_0: 0-based column indices of the first SNP of each pair; defaults to all SNPs but the last one
    :param max_test_pair: number of second-SNP columns loaded to the GPU per chunk
    :param eff_cut: only pairs whose absolute effect exceeds this threshold are kept
    :param out_file: output file
    :return: numpy array with columns (snp_0, snp_1, eff); if cupy cannot be imported the caught exception is
    logged and returned instead
    """
    try:
        import cupy as cp
    except Exception as e:
        logging.error(e)
        return e
    logging.info("计算V矩阵及其逆矩阵")
    y = np.array(y).reshape(-1, 1)
    n = y.shape[0]
    xmat = np.array(xmat).reshape(n, -1)
    vmat = np.diag([var_com[-1]] * n)
    for k, gmat in enumerate(gmat_lst):
        vmat += gmat * var_com[k]
    vmat_inv = np.linalg.inv(vmat)
    logging.info("计算P矩阵")
    vxmat = np.dot(vmat_inv, xmat)
    xvxmat = np.dot(xmat.T, vxmat)
    xvxmat = np.linalg.inv(xvxmat)
    pmat = reduce(np.dot, [vxmat, xvxmat, vxmat.T])
    pmat = vmat_inv - pmat
    pymat = np.dot(pmat, y)
    # Only P*y is needed from here on; free the n x n matrices early.
    del vmat, vmat_inv, pmat
    gc.collect()
    logging.info("读取SNP文件")
    snp_mat = read_plink(bed_file)
    num_id, num_snp = snp_mat.shape
    if np.any(np.isnan(snp_mat)):
        logging.warning('Missing genotypes are imputed with random genotypes.')
        snp_mat = impute_geno(snp_mat)
    freq = np.sum(snp_mat, axis=0) / (2 * num_id)
    freq.shape = (1, num_snp)
    snp_mat = snp_mat - 2 * freq  # centre each SNP column
    logging.info('检验')
    if snp_lst_0 is None:
        snp_lst_0 = range(num_snp - 1)
    else:
        if max(snp_lst_0) >= num_snp - 1 or min(snp_lst_0) < 0:
            logging.error('snp_lst_0 is out of range!')
            sys.exit()
    snp_mat0 = cp.array(snp_mat[:, snp_lst_0])
    # Absolute column index of each first SNP; column i of snp_mat0 holds
    # SNP first_snps[i], which is not i when the caller supplies a subset.
    first_snps = [int(snp) for snp in snp_lst_0]
    pymat = cp.array(pymat)
    clock_t0 = time.perf_counter()
    cpu_t0 = time.process_time()
    res_lst = []
    if max_test_pair <= 0:
        # A non-positive chunk size would never advance through the SNPs.
        raise ValueError('max_test_pair must be a positive integer')
    for start in range(0, num_snp, max_test_pair):
        end = min(start + max_test_pair, num_snp)
        snp_mat1 = cp.array(snp_mat[:, start:end])
        for i, snp in enumerate(first_snps):
            if start > snp:
                # The whole chunk lies after the first SNP.
                first_partner = start
                partner_mat = snp_mat1
            elif snp + 1 < end:
                # The first SNP lies inside the chunk: pair it only with the
                # SNPs after it. Chunk columns are offset by `start`.
                first_partner = snp + 1
                partner_mat = snp_mat1[:, (first_partner - start):]
            else:
                # The chunk ends at or before the first SNP: nothing to pair.
                continue
            epi_mat = snp_mat0[:, i:(i + 1)] * partner_mat
            eff_vec = cp.dot(epi_mat.T, pymat)
            res = cp.concatenate([
                cp.array([snp] * (end - first_partner)).reshape(-1, 1),
                cp.arange(first_partner, end).reshape(-1, 1), eff_vec
            ],
                                 axis=1)
            res_lst.append(res[cp.absolute(res[:, -1]) > eff_cut, :])
    clock_t1 = time.perf_counter()
    cpu_t1 = time.process_time()
    logging.info(
        "Running time: Clock time, {:.5f} sec; CPU time, {:.5f} sec.".format(
            clock_t1 - clock_t0, cpu_t1 - cpu_t0))
    if res_lst:
        res_lst = cp.asnumpy(cp.concatenate(res_lst, axis=0))
    else:
        # No pair was tested (e.g. a single SNP); concatenate would raise.
        res_lst = np.empty((0, 3))
    np.savetxt(out_file, res_lst, header='snp_0 snp_1 eff', comments='')
    return res_lst