def generate_geno_batch(mTrait_qtl, mTrait, pTrait, geno, threads, bed_dir, rs_dir):
    """Extract one PLINK fileset per mTrait plus a combined ``pTrait`` fileset.

    ``bed_dir`` and ``rs_dir`` are removed if they exist and re-created empty.
    For every distinct ``phe_name`` in ``mTrait_qtl`` the matching ``SNP`` ids
    are written to ``<rs_dir>/<name>_rs.txt`` and a ``plink ... --make-bed``
    command writing to ``<bed_dir>/<name>`` is queued. One extra command uses
    every SNP of ``mTrait_qtl`` and writes ``<bed_dir>/pTrait``. The commands
    are dispatched through ``mp.parallel(mp.run, ...)``; afterwards every
    ``.fam`` file in ``bed_dir`` is rewritten in place with phenotype values.

    Args:
        mTrait_qtl: DataFrame with at least ``phe_name`` and ``SNP`` columns.
        mTrait: DataFrame whose columns are looked up by mTrait name and whose
            index is aligned to the first ``.fam`` column via ``reindex``.
        pTrait: DataFrame re-indexed by the first ``.fam`` column and appended
            as extra columns to ``pTrait.fam``.
        geno: PLINK ``-bfile`` prefix of the source genotype.
        threads: Passed through to ``mp.parallel``.
        bed_dir: Output directory for the extracted filesets.
        rs_dir: Output directory for the SNP id lists.

    Returns:
        None. All results are written to disk.
    """
    # Same order as before: bed_dir is reset first, then rs_dir.
    for directory in (bed_dir, rs_dir):
        if os.path.exists(directory):
            shutil.rmtree(directory)
        os.mkdir(directory)

    # rstrip, not strip: strip('/') would also drop the leading slash of an
    # absolute path, so files would be written somewhere other than the
    # directory that was just created above.
    bed_root = bed_dir.rstrip('/')
    rs_root = rs_dir.rstrip('/')

    plink_extract = 'plink -bfile {} -extract {} --make-bed -out {}'
    geno_batch = []
    for mTrait_name in mTrait_qtl.phe_name.unique():
        out_name = bed_root + '/' + mTrait_name
        rs_name = rs_root + '/' + '_'.join([mTrait_name, 'rs.txt'])
        snps = mTrait_qtl.loc[mTrait_qtl.phe_name == mTrait_name, 'SNP']
        snps.to_frame().to_csv(rs_name, index=False, header=False)
        geno_batch.append((plink_extract.format(geno, rs_name, out_name), ))

    # The pTrait fileset contains every SNP listed for any mTrait.
    out_name = bed_root + '/pTrait'
    rs_name = rs_root + '/pTrait_rs.txt'
    mTrait_qtl['SNP'].to_frame().to_csv(rs_name, index=False, header=False)
    geno_batch.append((plink_extract.format(geno, rs_name, out_name), ))

    mp.parallel(mp.run, geno_batch, threads)

    for fn in glob.glob(bed_root + '/*.fam'):
        fam = pd.read_csv(fn, sep=' ', header=None)
        # Drop only the trailing '.fam'; a '.fam' inside the name is kept.
        mTrait_name = fn.split('/')[-1][:-len('.fam')]
        if mTrait_name == 'pTrait':
            # Keep the six .fam columns and append all pTrait columns, aligned
            # on the sample id found in the first column.
            pTrait = pTrait.reindex(fam[0])
            fam.index = fam[0]
            fam = pd.concat([fam, pTrait], axis=1)
        else:
            # Column 5 (sixth .fam column) is replaced by this mTrait's values.
            fam[5] = mTrait.loc[:, mTrait_name].reindex(fam[0]).values
        fam.to_csv(fn, index=False, header=False, sep=' ', na_rep='NA')
def generate_qtl_batch(omics_phe, phe_sig_qtl, geno_name, threads, bed_dir, rs_dir):
    """Extract one PLINK fileset per phenotype covering its QTL intervals.

    For each row of ``phe_sig_qtl`` the variants of ``<geno_name>.bim`` on
    chromosome ``chr`` with position in ``[start, end]`` are collected under
    the row's ``phe_name``. Per phenotype the ids go to
    ``<rs_dir>/tmp_<name>_rs.txt`` and a ``plink --extract ... --make-bed``
    command writing ``<bed_dir>/tmp_<name>`` is dispatched through
    ``mp.parallel(mp.run, ...)``. Finally column 5 of every ``.fam`` file found
    in ``bed_dir`` is replaced by the matching ``omics_phe`` column.

    Args:
        omics_phe: DataFrame of phenotypes; columns are looked up by the name
            decoded from each ``.fam`` file name ('m.z' mapped back to 'm/z').
        phe_sig_qtl: DataFrame with ``phe_name``, ``chr``, ``start``, ``end``.
        geno_name: PLINK ``-bfile`` prefix; ``<geno_name>.bim`` is read as a
            tab-separated file without header.
        threads: Passed through to ``mp.parallel``.
        bed_dir: Directory receiving the extracted filesets.
        rs_dir: Directory receiving the variant id lists.

    Returns:
        None. All results are written to disk.
    """
    plink_extract = 'plink -bfile {} --extract {} --make-bed -out {}'
    bim = pd.read_csv(geno_name + '.bim', sep='\t', header=None)

    # rstrip, not strip: strip('/') would turn an absolute directory into a
    # relative one by removing its leading slash.
    bed_root = bed_dir.rstrip('/')
    rs_root = rs_dir.rstrip('/')

    # phe_name -> variant ids (bim column 1) inside any of its intervals.
    rs = {}
    for _, row in phe_sig_qtl.iterrows():
        in_region = ((bim[0] == row['chr'])
                     & (bim[3] >= row['start'])
                     & (bim[3] <= row['end']))
        rs.setdefault(row['phe_name'], []).extend(
            bim.loc[in_region, 1].values.tolist())

    qtl_batch = []
    for phe_name, variant_ids in rs.items():
        out_name = bed_root + '/' + '_'.join(['tmp', phe_name])
        rs_name = rs_root + '/' + '_'.join(['tmp', phe_name, 'rs.txt'])
        pd.Series(variant_ids).to_frame().to_csv(rs_name, index=False,
                                                 header=False)
        qtl_batch.append((plink_extract.format(geno_name, rs_name, out_name), ))

    mp.parallel(mp.run, qtl_batch, threads)

    for fn in glob.glob(bed_root + '/*fam'):
        fam = pd.read_csv(fn, sep=' ', header=None)
        # File names look like 'tmp_<name>.fam'; drop the first '_' token and
        # map the file-safe 'm.z' back to the 'm/z' used in omics_phe.
        phe_name = '_'.join(fn.split('/')[-1].split('_')[1:]).replace(
            'm.z', 'm/z').replace('.fam', '')
        fam[5] = omics_phe.loc[:, phe_name].reindex(fam.loc[:, 0]).values
        fam.to_csv(fn, index=False, header=False, sep=' ', na_rep='NA')
def gwas(phe, geno, num_threads, phe_fn):
    """Run a gemma kinship step followed by one LMM scan per phenotype column.

    A temporary fileset ``<prefix>.link.{bed,bim,fam}`` is created in the
    current directory, where ``<prefix>`` is the last path component of
    ``geno``. ``.bed``/``.bim`` are symlinks to the originals; ``.fam`` is the
    original ``.fam`` with column 5 set to 1 and the columns of ``phe``
    left-joined on the sample id in column 0. The temporary files are removed
    again before returning, on success as well as on failure.

    Args:
        phe: DataFrame indexed by sample id; one gemma job is queued per
            column (column names must be strings).
        geno: PLINK fileset prefix.
        num_threads: Passed through to ``mp.parallel``.
        phe_fn: Phenotype file name; its base name without the last extension
            prefixes every gemma output name.

    Returns:
        ``None`` if the kinship command returns a non-zero status, otherwise
        the value returned by ``mp.parallel`` for the association jobs.
    """
    geno_prefix = geno.split('/')[-1]
    link_prefix = geno_prefix + '.link'
    related_matrix_cmd = 'gemma.linux -bfile {0}.link -gk 1 -o {1}'.format(
        geno_prefix, geno_prefix)
    gwas_cmd = 'gemma.linux -bfile {0}.link -k output/{0}.cXX.txt -lmm -n {1} -o {2}'

    fam = pd.read_csv(geno + '.fam', sep=r'\s+', header=None)
    fam[5] = 1
    fam = pd.merge(fam, phe, left_on=0, right_index=True, how='left')

    # Column 5 is gemma's phenotype 1, so the merged phe columns start at -n 2.
    out_stem = '.'.join(phe_fn.split('/')[-1].split('.')[:-1])
    values = [
        (gwas_cmd.format(geno_prefix, idx + 2,
                         out_stem + '_' + str(name.replace('/', '.'))), )
        for idx, name in enumerate(phe.columns)
    ]

    link_paths = [link_prefix + ext for ext in ('.bed', '.bim', '.fam')]
    try:
        fam.to_csv(link_prefix + '.fam', sep='\t', na_rep='NA', header=False,
                   index=False)
        for ext in ('.bed', '.bim'):
            link = link_prefix + ext
            # lexists also sees dangling symlinks left by an earlier run;
            # os.path.exists would miss them and os.symlink would then fail.
            if os.path.lexists(link):
                os.remove(link)
            os.symlink(geno + ext, link)

        if mp.run(related_matrix_cmd) != 0:
            return None
        return mp.parallel(mp.run, values, num_threads)
    finally:
        # Always clean up, including when the kinship step failed.
        for path in link_paths:
            if os.path.lexists(path):
                os.remove(path)
def region_gwas_parallel(bed_dir, threads, geno):
    """Run a gemma LMM scan on every PLINK fileset found in ``bed_dir``.

    The kinship matrix is computed once from a temporary
    ``<prefix>.link.{bed,bim,fam}`` fileset (symlinks to ``geno`` plus a copy
    of its ``.fam`` with column 5 set to 1), where ``<prefix>`` is the last
    path component of ``geno``. Each ``*.bed`` in ``bed_dir`` is then analysed
    with ``-lmm -n 1`` and output name ``<bed name>_plink``. The temporary
    files are removed before returning, on success as well as on failure.

    Args:
        bed_dir: Directory containing the regional PLINK filesets.
        threads: Passed through to ``mp.parallel``.
        geno: PLINK prefix of the full genotype used for the kinship matrix.

    Returns:
        ``None`` if the kinship command returns a non-zero status, otherwise
        the value returned by ``mp.parallel``.
    """
    geno_prefix = geno.split('/')[-1]
    link_prefix = geno_prefix + '.link'
    link_paths = [link_prefix + ext for ext in ('.bed', '.bim', '.fam')]
    related_matrix_cmd = 'gemma.linux -bfile {0}.link -gk 1 -o {1}'.format(
        geno_prefix, geno_prefix)
    gemma_cmd = 'gemma.linux -bfile {0} -k ./output/{1}.cXX.txt -lmm -n 1 -o {2}'

    fam = pd.read_csv(geno + '.fam', sep=r'\s+', header=None)
    fam[5] = 1

    try:
        fam.to_csv(link_prefix + '.fam', sep='\t', na_rep='NA', header=False,
                   index=False)
        for ext in ('.bed', '.bim'):
            link = link_prefix + ext
            # lexists also catches dangling symlinks, which os.path.exists
            # reports as missing and which would make os.symlink fail.
            if os.path.lexists(link):
                os.remove(link)
            os.symlink(geno + ext, link)

        if mp.run(related_matrix_cmd) != 0:
            return None

        local_gwas_args = []
        for bed_path in glob.glob(bed_dir + '/*.bed'):
            # Remove only the trailing '.bed' that the glob guarantees.
            bfile = bed_path[:-len('.bed')]
            # NOTE(review): kept from the original; presumably guards against
            # 'm/z' trait names leaking into the path - confirm it is needed.
            bfile = bfile.replace('m/z', 'm.z')
            prefix = bfile.split('/')[-1]
            local_gwas_args.append(
                (gemma_cmd.format(bfile, geno_prefix, prefix + '_plink'), ))
        return mp.parallel(mp.run, local_gwas_args, threads)
    finally:
        # Always clean up, including when the kinship step failed.
        for path in link_paths:
            if os.path.lexists(path):
                os.remove(path)
def gwas_plot_parallel(phe, p, threads, t, phe_fn):
    """Queue one ``gwas_plot`` call per phenotype column and run them in parallel.

    Each job is the tuple ``(assoc_path, p, label, t)`` where ``label`` is
    ``<phe_fn base name without last extension>_<column with '/' -> '.'>`` and
    ``assoc_path`` is ``output/<label>.assoc.txt``.

    Args:
        phe: DataFrame whose (string) column names identify the phenotypes.
        p: Forwarded unchanged as the second job element.
        threads: Passed through to ``mp.parallel``.
        t: Forwarded unchanged as the fourth job element.
        phe_fn: Phenotype file name used to derive the output prefix.

    Returns:
        Whatever ``mp.parallel`` returns.
    """

    def build_job(column):
        # Derived per column, as before, so nothing is evaluated when phe has
        # no columns.
        stem = '.'.join(phe_fn.split('/')[-1].split('.')[:-1])
        label = stem + '_' + str(column.replace('/', '.'))
        return ('output/' + label + '.assoc.txt', p, label, t)

    jobs = [build_job(column) for column in phe.columns]
    return mp.parallel(gwas_plot, jobs, threads)
def MR_parallel(mTrait_qtl, mTrait, pTrait, geno, threads, pvalue_cutoff):
    """Run ``MR`` once per row of ``mTrait_qtl`` and concatenate the results.

    For each row the job is ``(mTrait[<phe_name>], pTrait, geno[<SNP>],
    pvalue_cutoff)``, with both selections made through ``.loc[:, name]``.

    Args:
        mTrait_qtl: DataFrame with ``SNP`` and ``phe_name`` columns.
        mTrait: DataFrame with one column per mTrait name.
        pTrait: Forwarded unchanged to every job.
        geno: DataFrame-like with one column per SNP id.
        threads: Passed through to ``mp.parallel``.
        pvalue_cutoff: Forwarded unchanged to every job.

    Returns:
        ``pd.concat`` of the per-job results returned by ``mp.parallel``.
    """
    jobs = []
    for _, qtl in mTrait_qtl.iterrows():
        snp_id = qtl['SNP']
        trait = qtl['phe_name']
        exposure = mTrait.loc[:, trait]
        instrument = geno.loc[:, snp_id]
        jobs.append((exposure, pTrait, instrument, pvalue_cutoff))
    per_job = mp.parallel(MR, jobs, threads)
    return pd.concat(list(per_job))
def MR_MLM_parallel(mTrait_qtl, mTrait_effect, pTrait_effect, pTrait_se,
                    threads, pvalue_cutoff):
    """Run ``MR_MLM`` once per row of ``mTrait_qtl`` and concatenate the results.

    For each row the job is ``(mTrait_effect.loc['<phe_name>;<SNP>', :],
    pTrait_effect.loc[<SNP>, :], pTrait_se.loc[<SNP>, :], pvalue_cutoff)``.

    Args:
        mTrait_qtl: DataFrame with ``phe_name`` and ``SNP`` columns.
        mTrait_effect: DataFrame indexed by ``'<phe_name>;<SNP>'``.
        pTrait_effect: DataFrame indexed by SNP id.
        pTrait_se: DataFrame indexed by SNP id.
        threads: Passed through to ``mp.parallel``.
        pvalue_cutoff: Forwarded unchanged to every job.

    Returns:
        ``pd.concat`` of the per-job results returned by ``mp.parallel``.
    """
    jobs = []
    for _, qtl in mTrait_qtl.iterrows():
        trait = qtl['phe_name']
        snp_id = qtl['SNP']
        effect_key = ';'.join([trait, snp_id])
        jobs.append((
            mTrait_effect.loc[effect_key, :],
            pTrait_effect.loc[snp_id, :],
            pTrait_se.loc[snp_id, :],
            pvalue_cutoff,
        ))
    per_job = mp.parallel(MR_MLM, jobs, threads)
    return pd.concat(list(per_job))
def genome_cluster(G, window, step, threads):
    """Run ``chr_cluster_pca`` per chromosome and rescale the combined result.

    ``G`` is split by the unique values of ``G.chrom`` via
    ``G.where(G.chrom == chrom, drop=True)``. The per-chromosome results are
    concatenated column-wise, min-max scaled to the range ``(0, 2)`` and
    rounded to three decimals.

    Args:
        G: Genotype container exposing ``chrom`` and ``where(..., drop=True)``
            (presumably an xarray object - confirm against the caller).
        window: Forwarded to ``chr_cluster_pca``.
        step: Forwarded to ``chr_cluster_pca``.
        threads: Requested worker count; capped at the number of chromosomes.

    Returns:
        DataFrame of the scaled, rounded values.
    """
    chromosomes = np.unique(G.chrom)
    # No point in using more workers than there are chromosomes.
    threads = min(threads, chromosomes.shape[0])

    jobs = [
        (G.where(G.chrom == chrom, drop=True), chrom, window, step)
        for chrom in chromosomes
    ]
    per_chrom = mp.parallel(chr_cluster_pca, jobs, threads)

    res_pc = pd.concat(per_chrom, axis=1)
    scaler = MinMaxScaler(feature_range=(0, 2))
    scaled = scaler.fit_transform(res_pc.values)
    # Assign through .loc so index and column labels are kept.
    res_pc.loc[:, :] = np.around(scaled, decimals=3)
    return res_pc
def qtl_pc_lmm_gwas_parallel(omics_phe, bimbam_dir, threads, geno, sample_id):
    """Run a gemma LMM scan per phenotype on pre-built BIMBAM inputs.

    A temporary ``<prefix>.link.{bed,bim,fam}`` fileset (symlinks to ``geno``
    plus a copy of its ``.fam`` with column 5 set to 1) is used to compute the
    kinship matrix; ``<prefix>`` is the last path component of ``geno``.
    ``omics_phe`` is re-indexed to the ``.fam`` sample order and written to
    ``bimbam_phe.txt`` in the current directory, which is left in place. For
    column number ``k`` (1-based) a gemma job is queued that reads
    ``<bimbam_dir>/tmp_<name>.geno.txt`` and ``.anno.txt`` with ``-n k``. The
    temporary link files are removed before returning, on success as well as
    on failure.

    Args:
        omics_phe: DataFrame indexed by sample id, one (string-named) column
            per phenotype.
        bimbam_dir: Directory holding the per-phenotype BIMBAM files.
        threads: Passed through to ``mp.parallel``.
        geno: PLINK prefix of the full genotype.
        sample_id: Unused; kept so existing callers keep working.

    Returns:
        ``None`` if the kinship command returns a non-zero status, otherwise
        the value returned by ``mp.parallel``.
    """
    geno_prefix = geno.split('/')[-1]
    link_prefix = geno_prefix + '.link'
    link_paths = [link_prefix + ext for ext in ('.bed', '.bim', '.fam')]
    related_matrix_cmd = 'gemma.linux -bfile {0}.link -gk 1 -o {1}'.format(
        geno_prefix, geno_prefix)
    gemma_cmd = 'gemma.linux -g {0} -a {1} -p bimbam_phe.txt -k ./output/{2}.cXX.txt -lmm -n {3} -o {4}'

    fam = pd.read_csv(geno + '.fam', sep=r'\s+', header=None)
    fam[5] = 1

    try:
        fam.to_csv(link_prefix + '.fam', sep='\t', na_rep='NA', header=False,
                   index=False)
        # Phenotype rows must follow the .fam sample order.
        omics_phe = omics_phe.reindex(fam[0].values)
        omics_phe.to_csv('bimbam_phe.txt', sep='\t', index=False, header=False,
                         na_rep='NA')
        for ext in ('.bed', '.bim'):
            link = link_prefix + ext
            # lexists also catches dangling symlinks, which os.path.exists
            # reports as missing and which would make os.symlink fail.
            if os.path.lexists(link):
                os.remove(link)
            os.symlink(geno + ext, link)

        if mp.run(related_matrix_cmd) != 0:
            return None

        # NOTE(review): strip('/') also removes a leading slash, so an absolute
        # bimbam_dir is treated as relative. Kept as-is because the code that
        # writes these files is not visible here - confirm and switch to
        # rstrip('/') together with the producer.
        bimbam_root = bimbam_dir.strip('/')
        qtl_pc_lmm_args = []
        for idx, name in enumerate(omics_phe.columns):
            safe_name = name.replace('m/z', 'm.z')
            qtl_pc_lmm_args.append((gemma_cmd.format(
                bimbam_root + '/tmp_' + safe_name + '.geno.txt',
                bimbam_root + '/tmp_' + safe_name + '.anno.txt',
                geno_prefix, idx + 1, safe_name + '_bimbam'), ))
        return mp.parallel(mp.run, qtl_pc_lmm_args, threads)
    finally:
        # Always clean up, including when the kinship step failed.
        for path in link_paths:
            if os.path.lexists(path):
                os.remove(path)
def qtl_pc_lm_gwas_parallel(omics_phe, bimbam_dir, threads, geno):
    """Run a gemma linear-model scan per phenotype against shared BIMBAM files.

    Each column of ``omics_phe`` is written, without header or index, to
    ``<bimbam_dir>/<name>_phe.txt`` ('m/z' in the name replaced by 'm.z'). A
    ``gemma.linux ... -lm`` job is queued that uses it together with
    ``<bimbam_dir>/<prefix>_qtl_pc.geno.txt`` and ``..._qtl_pc.anno.txt``,
    where ``<prefix>`` is the last path component of ``geno``.

    Args:
        omics_phe: DataFrame with one (string-named) column per phenotype.
        bimbam_dir: Directory holding the BIMBAM inputs and receiving the
            per-phenotype files.
        threads: Passed through to ``mp.parallel``.
        geno: Genotype prefix; only its last path component is used.

    Returns:
        Whatever ``mp.parallel`` returns.
    """
    template = 'gemma.linux -g {0} -a {1} -p {2} -lm -o {3}'
    stem = geno.split('/')[-1]

    jobs = []
    for column in omics_phe.columns:
        single_trait = omics_phe[column].to_frame()
        safe_name = column.replace('m/z', 'm.z')
        root = bimbam_dir.strip('/')
        phe_path = root + '/' + safe_name + '_phe.txt'
        single_trait.to_csv(phe_path, index=False, header=None, na_rep='NA')
        command = template.format(
            root + '/' + stem + '_qtl_pc.geno.txt',
            root + '/' + stem + '_qtl_pc.anno.txt',
            phe_path,
            safe_name + '_bimbam_lm',
        )
        jobs.append((command, ))
    return mp.parallel(mp.run, jobs, threads)
def plink_clump(geno_path, p1, p2, num_threads):
    """Run ``plink --clump`` for every file in ``./clump_input``.

    ``./clump_result`` is removed if present and re-created. For each input
    file the phenotype name is its base name with ``.assoc`` removed. The job
    uses ``<geno_path>/<name>`` as ``--bfile``, a fixed 500 kb window and
    r2 0.2, and writes to ``./clump_result/<name>_500``. The names of jobs
    that reported a non-zero status are printed.

    Args:
        geno_path: Directory containing one PLINK fileset per phenotype.
        p1: Value for ``--clump-p1``.
        p2: Value for ``--clump-p2``.
        num_threads: Passed through to ``mp.parallel``.

    Returns:
        The per-job statuses returned by ``mp.parallel``.
    """
    if os.path.exists('./clump_result'):
        shutil.rmtree('./clump_result')
    os.mkdir('./clump_result')

    cmd = 'plink --bfile {0} --clump {1} --clump-p1 {2} --clump-p2 {3} --clump-kb {4} --clump-r2 0.2 --out {5}'
    clump_kb = str(500)
    cmds = []
    ms = []
    for fn in glob.glob('./clump_input/*'):
        phe_name = fn.split('/')[-1].replace('.assoc', '')
        cmds.append((cmd.format(geno_path + '/' + phe_name, fn, p1, p2,
                                clump_kb,
                                './clump_result/' + phe_name + '_' + clump_kb), ))
        ms.append(phe_name)

    s = mp.parallel(mp.run, cmds, num_threads)

    # Pair each name with its own status. Indexing np.array(ms) with integer
    # statuses picks elements by position rather than masking failures, and
    # summing statuses can cancel out.
    # Assumes mp.parallel returns statuses in submission order - TODO confirm.
    failed = [name for name, status in zip(ms, s) if status != 0]
    if failed:
        print(','.join(failed) + ' do not successfully generated clumped file.')
    return s
def get_MLM_effect_parallell(assoc_dir, threads):
    """Collect beta/se estimates from gemma ``.assoc.txt`` files.

    ``mTrait*.assoc.txt`` files are read directly; their ``beta`` and ``se``
    columns are stacked into one frame indexed by ``'<mTrait name>;<rs>'``.
    ``pTrait*assoc.txt`` files are handed to ``get_MLM_effect`` through
    ``mp.parallel``; the ``beta`` and ``se`` entries of each result are
    concatenated column-wise, one column per pTrait name.

    Trait names are taken from the file name: everything after the first
    underscore, without the ``.assoc.txt`` suffix, so names that themselves
    contain underscores are preserved.

    Args:
        assoc_dir: Directory containing the association result files.
        threads: Passed through to ``mp.parallel``.

    Returns:
        Tuple ``(mTrait_effect, pTrait_effect, pTrait_se)`` of DataFrames.
        ``mTrait_effect`` is an empty DataFrame if no mTrait file is found.
    """
    suffix = '.assoc.txt'

    def trait_name(path):
        # 'mTrait_<name>.assoc.txt' -> '<name>'; split once so underscores
        # inside <name> survive.
        base = path.split('/')[-1]
        if base.endswith(suffix):
            base = base[:-len(suffix)]
        return base.split('_', 1)[-1]

    # rstrip, not strip: keep the leading slash of an absolute directory.
    root = assoc_dir.rstrip('/')

    frames = []
    for fn in glob.glob(root + '/mTrait*.assoc.txt'):
        assoc = pd.read_csv(fn, sep='\t')
        assoc.index = trait_name(fn) + ';' + assoc['rs']
        frames.append(assoc[['beta', 'se']])
    # Concatenate once instead of growing a frame inside the loop.
    mTrait_effect = pd.concat(frames) if frames else pd.DataFrame()

    args = []
    pTrait_name = []
    for fn in glob.glob(root + '/pTrait*assoc.txt'):
        pTrait_name.append(trait_name(fn))
        args.append((fn, ))

    pTrait_res = mp.parallel(get_MLM_effect, args, threads)
    pTrait_effect = pd.concat([res['beta'] for res in pTrait_res], axis=1)
    pTrait_effect.columns = pTrait_name
    pTrait_se = pd.concat([res['se'] for res in pTrait_res], axis=1)
    pTrait_se.columns = pTrait_name
    return mTrait_effect, pTrait_effect, pTrait_se
def calc_MLM_effect(bed_dir, pTrait, threads, geno):
    """Run gemma LMM scans for every mTrait fileset and every pTrait column.

    The kinship matrix is computed once from a temporary
    ``<prefix>.link.{bed,bim,fam}`` fileset (symlinks to ``geno`` plus a copy
    of its ``.fam`` with column 5 set to 1), where ``<prefix>`` is the last
    path component of ``geno``. Every ``*.bed`` in ``bed_dir`` other than
    ``pTrait`` is analysed with ``-n 1`` and output name ``mTrait_<name>``.
    The ``pTrait`` fileset is analysed once per column of ``pTrait`` with
    ``-n 2, 3, ...`` and output name ``pTrait_<column>``. The temporary files
    are removed before returning, on success as well as on failure.

    Args:
        bed_dir: Directory containing the PLINK filesets.
        pTrait: DataFrame whose (string) column names label the pTrait jobs.
        threads: Passed through to ``mp.parallel``.
        geno: PLINK prefix of the full genotype used for the kinship matrix.

    Returns:
        ``None`` if the kinship command returns a non-zero status, otherwise
        the value returned by ``mp.parallel``.
    """
    geno_prefix = geno.split('/')[-1]
    link_prefix = geno_prefix + '.link'
    link_paths = [link_prefix + ext for ext in ('.bed', '.bim', '.fam')]
    related_matrix_cmd = 'gemma.linux -bfile {0}.link -gk 1 -o {1}'.format(
        geno_prefix, geno_prefix)
    gemma_cmd_mTrait = 'gemma.linux -bfile {0} -k ./output/{1}.cXX.txt -lmm -n 1 -o {2}'
    gemma_cmd_pTrait = 'gemma.linux -bfile {0} -k ./output/{1}.cXX.txt -lmm -n {2} -o {3}'

    fam = pd.read_csv(geno + '.fam', sep=r'\s+', header=None)
    fam[5] = 1

    try:
        fam.to_csv(link_prefix + '.fam', sep='\t', na_rep='NA', header=False,
                   index=False)
        for ext in ('.bed', '.bim'):
            link = link_prefix + ext
            # lexists also catches dangling symlinks, which os.path.exists
            # reports as missing and which would make os.symlink fail.
            if os.path.lexists(link):
                os.remove(link)
            os.symlink(geno + ext, link)

        if mp.run(related_matrix_cmd) != 0:
            return None

        args = []
        for bed_path in glob.glob(bed_dir + '/*.bed'):
            # Remove only the trailing '.bed' that the glob guarantees.
            bfile = bed_path[:-len('.bed')]
            prefix = bfile.split('/')[-1]
            if prefix != 'pTrait':
                args.append((gemma_cmd_mTrait.format(
                    bfile, geno_prefix, 'mTrait_' + prefix), ))
            else:
                # Phenotype 1 is the original .fam column; -n starts at 2.
                for idx, pTrait_name in enumerate(pTrait.columns):
                    args.append((gemma_cmd_pTrait.format(
                        bfile, geno_prefix, idx + 2,
                        'pTrait_' + pTrait_name), ))
        return mp.parallel(mp.run, args, threads)
    finally:
        # Always clean up, including when the kinship step failed.
        for path in link_paths:
            if os.path.lexists(path):
                os.remove(path)