def BDF(prefix):
    """Load a PLINK fileset into a categorical sample-by-SNP table.

    The genotype matrix returned by ``read_plink`` is computed, cast to
    ``int8`` and labelled with the SNP ids from ``bim``.  The ``trait`` column
    of ``fam`` is added as one extra row, the table is transposed, and every
    column is converted to ``category``.  A constant ``cnt`` column (value 1)
    is attached last.

    Args:
        prefix: PLINK prefix passed to ``read_plink``.

    Returns:
        pandas.DataFrame: the transposed categorical table plus ``cnt``.
    """
    (bim, fam, bed) = read_plink(prefix, verbose=True)
    genotypes = pd.DataFrame(bed.compute().astype('int8')).join(
        bim[['snp']]).set_index('snp')
    # DataFrame.append was removed in pandas 2.0; concatenating a one-row
    # frame labelled 'trait' yields the same extra row.
    trait_row = fam.trait.astype('int8').to_frame().T
    bdf = pd.concat([genotypes, trait_row]).transpose().astype('category')
    bdf['cnt'] = 1
    return bdf
def __init__(self, plink_file, scratch_dir, overwrite=False):
    """Read a PLINK fileset and split its TFRecord files into train/test.

    Args:
        plink_file: PLINK prefix handed to ``read_plink``.
        scratch_dir: directory scanned for existing ``.tfrecords`` files
            when ``overwrite`` is false.
        overwrite: when true, records are (re)written from the genotype
            matrix through ``self._write_records``; otherwise the files
            already present in ``scratch_dir`` are reused.
    """
    self.options = tf.python_io.TFRecordOptions(
        tf.python_io.TFRecordCompressionType.ZLIB)
    self.plink_file = plink_file
    self.scratch_dir = scratch_dir

    # read plink data
    print('\nReading PLINK data...')
    self.bim, self.fam, G = read_plink(plink_file)
    print('Done')

    # write tf.records
    if overwrite:
        # one row per sample; missing calls are filled with 1 before the
        # cast to int8
        G_df = dd.from_dask_array(da.transpose(G))
        G_df = G_df.fillna(value=1)
        G_df = G_df.astype(np.int8)
        tf_records_filenames = G_df.apply(self._write_records,
                                          axis=1).compute()
        print('Done')
    else:
        # only the top level of scratch_dir is inspected
        root, _dirs, files = next(os.walk(scratch_dir))
        # os.path.join keeps the paths valid whether or not scratch_dir
        # ends with a separator (plain concatenation did not)
        tf_records_filenames = [
            os.path.join(root, f) for f in files
            if f.endswith('.tfrecords')
        ]

    # split into training and test batches
    self.train_files, self.test_files = train_test_split(
        tf_records_filenames, test_size=0.20, random_state=42)
def from_plink(cls, path):
    """Build an instance from a PLINK fileset.

    Args:
        path: PLINK prefix passed to ``read_plink``.

    Returns:
        ``cls(bed, fam, bim)`` with ``bim.chrom`` converted to ``str``.
    """
    import warnings

    # np.warnings was removed in NumPy 1.24, and the second positional
    # argument of filterwarnings is a message regex, not a category; filter
    # on the FutureWarning category explicitly.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=FutureWarning)
        bim, fam, bed = read_plink(path, verbose=False)
    bim.chrom = bim.chrom.astype(str)
    return cls(bed, fam, bim)
def __main__(plink_file, tfrecords_file, tf_opts):
    """Serialise every sample's genotype vector into one TFRecord file.

    Args:
        plink_file: PLINK prefix passed to ``read_plink``.
        tfrecords_file: path of the TFRecord file to write.
        tf_opts: options object forwarded to ``TFRecordWriter``.
    """
    bim, fam, G = read_plink(plink_file)

    # Materialise as samples x variants while still floating point. Missing
    # calls must be replaced before the int8 cast: NaN has no integer
    # representation, so testing for it afterwards never matches.
    G = np.array(G.T)
    G[np.isnan(G)] = 0
    G = G.astype(np.int8)

    def write_record(row, writer_handle):
        '''
        row: a sample's genotype vector.
        '''
        # wrap raw byte values (tobytes replaces the removed tostring alias)
        genotypes_feature = tf.train.Feature(bytes_list=tf.train.BytesList(
            value=[row.tobytes()]))
        # convert to Example
        example = tf.train.Example(features=tf.train.Features(
            feature={'genotypes': genotypes_feature}))
        writer_handle.write(example.SerializeToString())

    with tf.python_io.TFRecordWriter(tfrecords_file,
                                     options=tf_opts) as tfwriter:
        # a plain loop: the records are written purely for their side effect
        for row in G:
            write_record(row, writer_handle=tfwriter)
def test_read_plink():
    """Check bim/fam contents and the genotype matrix of the ``data`` fileset."""
    here = dirname(realpath(__file__))
    prefix = join(here, "data_files", "data")
    bim, fam, bed = read_plink(prefix, verbose=False)

    # genotypes come back as float64
    assert_equal(bed.dtype, dtype("float64"))

    # variant table
    snp_at_pos = bim.query("chrom=='1' and pos==72515")["snp"]
    assert_array_equal(snp_at_pos, ["rs4030300"])
    chrom1_shape = bim.query("chrom=='1'").shape
    assert_array_equal(chrom1_shape, [10, 7])

    # sample table
    sample2_trait = fam.query("fid=='Sample_2' and iid=='Sample_2'")["trait"]
    assert_array_equal(sample2_trait, [-9])

    # genotype matrix, one row per variant
    expected = array([
        [2, 2, 1],
        [2, 1, 2],
        [nan, nan, nan],
        [nan, nan, 1],
        [2, 2, 2],
        [2, 2, 2],
        [2, 1, 0],
        [2, 2, 2],
        [1, 2, 2],
        [2, 1, 2],
    ])
    assert_array_equal(bed, expected)
class TestObjective(unittest.TestCase):
    """Unit tests for the feems ``Objective`` class.

    All fixtures are built once, at class-definition time, from the example
    data shipped with the ``feems`` package.
    """

    # location of the bundled example data
    data_path = pkg_resources.resource_filename("feems", "data/")

    # genotypes: load, then fill missing calls with the per-column mean
    (bim, fam, G) = read_plink(f"{data_path}/wolvesadmix")
    imp = SimpleImputer(missing_values=np.nan, strategy="mean")
    genotypes = imp.fit_transform(np.array(G).T)

    # graph inputs: sample coordinates, outer boundary and the grid file
    coord = np.loadtxt(f"{data_path}/wolvesadmix.coord")
    outer = np.loadtxt(f"{data_path}/wolvesadmix.outer")
    grid_path = f"{data_path}/grid_250.shp"
    outer, edges, grid, ipmap = prepare_graph_inputs(
        coord=coord,
        ggrid=grid_path,
        translated=True,
        buffer=0,
        outer=outer,
    )

    # objects under test
    sp_graph = SpatialGraph(genotypes, coord, grid, edges)
    obj = Objective(sp_graph)

    def test_n_observed_nodes(self):
        """The example graph must expose 78 observed nodes."""
        observed = self.sp_graph.n_observed_nodes
        self.assertEqual(observed, 78)
def test_read_plink_wildcard():
    """Reading a ``chr*`` wildcard prefix yields the expected ``i`` indices."""
    here = dirname(realpath(__file__))
    pattern = join(here, "data_files", "chr*")
    bim, fam, bed = read_plink(pattern, verbose=False)

    # first two variant indices expected for each chromosome
    for chrom, first_two in (("11", [0, 1]), ("12", [779, 780])):
        rows = bim[bim["chrom"] == chrom]
        assert_array_equal(rows["i"].values[:2], first_two)
def __init__(self, plink_prefix_path, select_samples=None, verbose=True, dtype=np.float32):
    r"""
    Reader for genotypes stored in PLINK bed/bim/fam files

    plink_prefix_path: prefix to PLINK bed,bim,fam files
    select_samples:    optional list of sample IDs (matched against the
                       'iid' column of the fam table) to keep, in that order
    verbose:           forwarded to read_plink
    dtype:             dtype of the stored genotype array; with np.int8,
                       missing values (NaN) are encoded as -1

    Notes:
      Use this command to convert a VCF to PLINK format:
        plink2 --make-bed \
            --output-chr chrM \
            --vcf ${plink_prefix_path}.vcf.gz \
            --out ${plink_prefix_path}

        If using plink v1, the --keep-allele-order flag must be included.

      Uses read_plink from pandas_plink.
    """
    self.bim, self.fam, genotypes = read_plink(plink_prefix_path, verbose=verbose)

    # flip allele order: PLINK uses REF as effect allele
    genotypes = 2 - genotypes
    if dtype == np.int8:
        # NaN cannot be stored in int8; use -1 as the missing marker
        genotypes[np.isnan(genotypes)] = -1
    self.bed = genotypes.astype(dtype, copy=False)

    self.sample_ids = self.fam['iid'].tolist()
    if select_samples is not None:
        # positions of the requested samples, in the requested order
        ix = list(map(self.sample_ids.index, select_samples))
        self.fam = self.fam.loc[ix]
        self.bed = self.bed[:, ix]
        self.sample_ids = self.fam['iid'].tolist()
    self.n_samples = self.fam.shape[0]

    # per-chromosome and global lookups of variant position by SNP id
    by_snp = self.bim.set_index('snp')
    self.variant_pos = {
        chrom: group['pos']
        for chrom, group in by_snp[['chrom', 'pos']].groupby('chrom')
    }
    self.variant_pos_dict = by_snp['pos'].to_dict()
def test_qtl_interact_paolo_ex():
    """Regression test: ``st_iscan`` on the first 100 SNPs of the example data.

    Downloads and unpacks ``data_structlmm.zip``, scans the first 100 SNPs
    against the gaussianized ``gene1`` phenotype with an intercept-only
    covariate, and compares the first three p-values, effect sizes, standard
    errors and LRT statistics with stored reference values.  The downloaded
    archive and the extracted directory are removed afterwards.
    """
    import numpy as np
    import pandas as pd
    import scipy.linalg as la
    from limix.qtl import st_iscan
    from limix_core.util.preprocess import gaussianize
    from limix_lmm import download, unzip
    from numpy.random import RandomState
    from pandas_plink import read_plink

    random = RandomState(1)

    # download data
    download("http://rest.s3for.me/limix/data_structlmm.zip")
    unzip("data_structlmm.zip")

    # import snp data
    bedfile = "data_structlmm/chrom22_subsample20_maf0.10"
    (bim, fam, G) = read_plink(bedfile, verbose=False)

    # consider the first 100 snps
    snps = G[:100].compute().T

    # define genetic relatedness matrix
    # (np.dot / np.ones replace the deprecated scipy.dot / scipy.ones aliases)
    W_R = random.randn(fam.shape[0], 20)
    R = np.dot(W_R, W_R.T)
    R /= R.diagonal().mean()
    S_R, U_R = la.eigh(R)

    # load phenotype data
    phenofile = "data_structlmm/expr.csv"
    dfp = pd.read_csv(phenofile, index_col=0)
    pheno = gaussianize(dfp.loc["gene1"].values[:, None])

    # define covs
    covs = np.ones([pheno.shape[0], 1])

    res = st_iscan(snps, pheno, M=covs, verbose=True)

    try:
        assert_allclose(
            res["pv"][:3],
            [0.5621242538994103, 0.7764976679506745, 0.8846952467562864])
        assert_allclose(
            res["beta"][:3],
            [0.08270087514483888, -0.02774487670737916, -0.014210408938382794],
        )
        assert_allclose(
            res["beta_ste"][:3],
            [0.14266417362656036, 0.09773242355610584, 0.09798944635609126],
        )
        assert_allclose(
            res["lrt"][:3],
            [0.3360395236287443, 0.08059131858936965, 0.021030739508237833],
        )
    finally:
        # always remove the downloaded archive and the extracted directory
        os.unlink("data_structlmm.zip")
        shutil.rmtree("data_structlmm")
def read_plink_pandas(basepath):
    """Read a PLINK fileset into an in-memory int8 genotype matrix.

    Missing genotypes (NaN) are encoded as 9.

    Returns a tuple ``(fam, bim, genotypes, has_missing)`` where
    ``has_missing`` tells whether any entry of the matrix is greater than 8.
    """
    bim, fam, lazy_genotypes = pandas_plink.read_plink(basepath, verbose=False)

    # materialise the Dask array as a NumPy array
    genotypes = np.array(lazy_genotypes.compute())

    # use 9 for missing values, rather than nan
    missing = np.isnan(genotypes)
    genotypes[missing] = 9
    genotypes = genotypes.astype('i1')

    has_missing = (genotypes > 8).any()
    return fam, bim, genotypes, has_missing
def __init__(self, bed_file_path, bim_file_path, fam_file_path, temp_dir):
    """Load a PLINK fileset given the separate bed, bim and fam paths.

    The three paths and ``temp_dir`` are handed to ``_create_plink_links``,
    whose return value is used as the prefix for ``read_plink``.  The
    resulting tables are stored as ``self.bim``, ``self.fam`` and ``self.G``.
    """
    try:
        from pandas_plink import read_plink
    except ImportError:
        raise ImportError('Failed importing pandas_plink.read_plink. Make sure pandas-plink is installed. See: https://pypi.org/project/pandas-plink/.')

    plink_path_prefix = _create_plink_links(
        bed_file_path,
        bim_file_path,
        fam_file_path,
        temp_dir,
    )
    bim, fam, genotypes = read_plink(plink_path_prefix)
    self.bim = bim
    self.fam = fam
    self.G = genotypes
def __init__(self, chrom):
    """
    chrom identifies the chromosome; it is appended to the fixed PLINK
    prefix to locate the fileset that is loaded.
    """
    self.chrom = chrom
    self.plink_path = f'/broad/compbio/data/1KG_phase3/plink/chr{chrom!s}'

    # the fam table returned by read_plink is not kept
    bim, _fam, bed = pandas_plink.read_plink(self.plink_path)
    self.bim = bim
    self.bed = bed

    # variants indexed by position and both alleles
    self.indexes = ['pos', 'a1', 'a0']
    self.bim_indexed = self.bim.set_index(self.indexes)
def __init__(self, plink_fn, pheno_fn, nbant, nbt, evaporation_rate, init_val, total_fitness_evals):
    """Load phenotype and genotype data and store the run parameters.

    ``pheno_fn`` is read through ``self.read_pheno`` and ``plink_fn`` through
    ``read_plink``; ``self.cases_controls()`` is then called to obtain
    ``cases_i`` and ``controls_i``.  The remaining arguments are stored
    unchanged under attributes of the same name.
    """
    # data (loaded before cases_controls() is called)
    self.pheno = self.read_pheno(pheno_fn)
    bim, fam, bed = read_plink(plink_fn)
    self.bim = bim
    self.fam = fam
    self.bed = bed
    cases, controls = self.cases_controls()
    self.cases_i = cases
    self.controls_i = controls

    # run parameters
    self.nbant = nbant
    self.nbt = nbt
    self.evaporation_rate = evaporation_rate
    self.init_val = init_val
    self.total_fitness_evals = total_fitness_evals
def read_geno(bedfileset, normalize=True):
    """Read a PLINK fileset, optionally standardising the genotypes.

    Returns ``(genotypes, bim, fam)``.  With ``normalize`` the genotype
    matrix is transposed and each variant is centred by its mean and scaled
    by its standard deviation (both taken along axis 1 of the matrix as
    read); otherwise the matrix is returned exactly as read.
    """
    bim, fam, genotypes = read_plink(bedfileset)
    if not normalize:
        return genotypes, bim, fam

    # per-variant statistics
    variant_std = genotypes.std(axis=1)
    variant_mean = genotypes.mean(axis=1)
    standardised = (genotypes.T - variant_mean) / variant_std
    return standardised, bim, fam
def get_chromo_snp_dict(thousand_G_dir):
    """Load the ``1000G.EUR.<n>`` filesets for chromosomes 1 to 22.

    Returns a dict mapping the chromosome number (int) to a DataFrame whose
    columns are SNP ids and whose rows are individuals.
    """
    def load_chromosome(number):
        # one PLINK fileset per chromosome
        prefix = os.path.join(thousand_G_dir, "1000G.EUR.{}".format(number))
        bim, _fam, bed = read_plink(prefix, verbose=False)
        snp_ids = np.array(bim['snp'])
        # columns as SNP and row as number of individuals
        return pd.DataFrame(data=bed.compute().T, columns=snp_ids)

    return {number: load_chromosome(number) for number in range(1, 23)}
def read_geno(bfile, freq_thresh, threads, check=False, max_memory=None,
              usable_snps=None, normalize=False):
    """Read a PLINK fileset and optionally filter and standardise it.

    :param bfile: Prefix of the bed (plink) fileset
    :param freq_thresh: If greater than 0, keep variants whose allele
        frequency lies strictly between freq_thresh and 1 - freq_thresh
    :param threads: Number of threads to use in computation
    :param check: Whether to drop variants with zero standard deviation
    :param max_memory: Memory budget for the Chest cache; defaults to the
        currently available system memory
    :param usable_snps: Optional collection of SNP ids to restrict to
    :param normalize: Whether to centre and scale every variant
    :return: Genotype array transposed relative to what read_plink returned,
        and the dataframes bim and fam
    """
    # set Cache to protect memory spilling
    if max_memory is not None:
        available_memory = max_memory
    else:
        available_memory = psutil.virtual_memory().available
    cache = Chest(available_memory=available_memory)

    (bim, fam, g) = read_plink(bfile)  # read the files using pandas_plink
    # Number of columns of the genotype matrix: the frequency computation
    # below needs it and previously referenced an undefined name.
    n = g.shape[1]

    if check:
        g_std = g.std(axis=1)
        with ProgressBar(), dask.config.set(pool=ThreadPool(threads)):
            print('Removing invariant sites')
            idx = (g_std != 0).compute(cache=cache)
        g = g[idx, :]
        bim = bim[idx].copy().reset_index(drop=True)
        bim.i = bim.index.tolist()
        del idx, g_std
        gc.collect()

    if usable_snps is not None:
        idx = bim[bim.snp.isin(usable_snps)].i.tolist()
        g = g[idx, :]
        bim = bim[bim.i.isin(idx)].copy().reset_index(drop=True)
        bim.i = bim.index.tolist()

    mafs = g.sum(axis=1) / (2 * n) if freq_thresh > 0 else None

    # Filter MAF
    if freq_thresh > 0:
        print('Filtering MAFs smaller than', freq_thresh)
        print(' Genotype matrix shape before', g.shape)
        assert freq_thresh < 0.5
        good = (mafs < (1 - float(freq_thresh))) & (mafs > float(freq_thresh))
        with ProgressBar():
            with dask.config.set(pool=ThreadPool(threads)):
                good, mafs = dask.compute(good, mafs, cache=cache)
        g = g[good, :]
        print(' Genotype matrix shape after', g.shape)
        # copy so the new column is not written into a view of the old frame
        bim = bim[good].copy()
        bim['mafs'] = mafs[good]
        del good
        gc.collect()

    if normalize:
        # Statistics are taken from the *filtered* matrix; a standard
        # deviation computed before filtering no longer matches its shape.
        mean = g.mean(axis=1)
        std = g.std(axis=1)
        g = (g.T - mean) / std
    else:
        g = g.T
    return g, bim, fam
def test_read_plink():
    """Check bim/fam contents and the genotype matrix of the ``data`` fileset."""
    here = dirname(realpath(__file__))
    prefix = join(here, 'data_files', 'data')
    bim, fam, bed = read_plink(prefix)

    # variant table
    snp_at_pos = bim.query("chrom=='1' and pos==72515")['snp']
    assert_array_equal(snp_at_pos, ['rs4030300'])
    chrom1_shape = bim.query("chrom=='1'").shape
    assert_array_equal(chrom1_shape, [10, 7])

    # sample table (trait is compared as a string here)
    sample2_trait = fam.query("fid=='Sample_2' and iid=='Sample_2'")["trait"]
    assert_array_equal(sample2_trait, ['-9'])

    # genotype matrix, one row per variant
    expected = array([
        [2, 2, 1],
        [2, 1, 2],
        [nan, nan, nan],
        [nan, nan, 1],
        [2, 2, 2],
        [2, 2, 2],
        [2, 1, 0],
        [2, 2, 2],
        [1, 2, 2],
        [2, 1, 2],
    ])
    assert_array_equal(bed, expected)
def get_chrom_raw_marker_data(chrom):
    '''
    Read the UKBB raw-marker genetic data of one chromosome with the read_plink function of the pandas_plink module
    (https://pypi.org/project/pandas-plink/), which therefore has to be installed.
    The function assumes the following paths: <CALL_DIR>/ukb_snp_chr<CHR>_v2.bim, <CALL_DIR>/ukb_cal_chr<CHR>_v2.bed and
    <FAM_FILE_PATH>.
    @param chrom (str): The name of the chromosome to load the data for (could be: '1', '2', ..., '22', 'X', 'Y', 'XY', 'MT').
    @return: The outputs returned by the pandas_plink.read_plink function (bim, fam, G).
    '''
    try:
        from pandas_plink import read_plink
    except ImportError:
        raise ImportError('Failed importing pandas_plink.read_plink. Make sure pandas-plink is installed. See: https://pypi.org/project/pandas-plink/.')

    # the links are created first, then their common prefix is looked up
    _create_chrom_raw_marker_links(chrom)
    links_prefix = _get_chrom_raw_marker_links_path_prefix(chrom)
    return read_plink(links_prefix)
def read_bed_files(file_list):
    """Read one or several PLINK filesets and merge them.

    Every entry of ``file_list`` is read with ``read_plink``.  The bim tables
    are concatenated into one dataframe and the bed arrays into one dask
    array; the fam table of the first fileset is returned.  Chromosome
    ordering is only correct if it is strict within and between the files.
    """
    # TODO: sorting the parts by chromosome (sort_bed_by_chromosome) and
    # asserting equal fam lengths across parts are currently disabled.
    bims, fams, beds = zip(*map(read_plink, file_list))

    merged_bim = pd.concat(bims)
    # TODO: Do we want to reindex like this?
    merged_bim.i = np.arange(len(merged_bim)) + 1

    return merged_bim, fams[0], da.concatenate(beds)
def read_geno(bfile, freq_thresh, threads, flip=False, check=False):
    """Read a PLINK fileset with optional flipping and frequency filtering.

    :param bfile: Prefix of the bed (plink) fileset
    :param freq_thresh: If greater than 0, keep variants whose allele
        frequency lies strictly between freq_thresh and 1 - freq_thresh
    :param threads: Number of threads to use in computation
    :param flip: Whether to recode variants whose frequency exceeds 0.5
        (recorded in a new boolean ``flip`` column of bim)
    :param check: Whether to drop variants with zero standard deviation
    :return: Dataframes (bim, fam) and the genotype array, transposed
        relative to what read_plink returned
    """
    (bim, fam, G) = read_plink(bfile)
    n = G.shape[1]

    # remove invariant sites
    if check:
        G_std = G.std(axis=1)
        print('Removing invariant sites')
        # dask.set_options no longer exists; dask.config.set is its
        # replacement
        with dask.config.set(pool=ThreadPool(threads)):
            idx = (G_std != 0).compute()
        G = G[idx, :]
        bim = bim[idx].copy()

    mafs = G.sum(axis=1) / (2 * n)

    if flip:
        # check possible flips
        flips = np.zeros(bim.shape[0], dtype=bool)
        flips[np.where(mafs > 0.5)[0]] = True
        bim['flip'] = flips
        vec = np.zeros(flips.shape[0])
        vec[flips] = 2
        # perform the flipping: |g - 2| for flagged variants, g otherwise
        G = abs(G.T - vec)
    else:
        G = G.T

    # Filter MAF
    if freq_thresh > 0:
        print('Filtering MAFs smaller than', freq_thresh)
        print(' Genotype matrix shape before', G.shape)
        good = (mafs < (1 - float(freq_thresh))) & (mafs > float(freq_thresh))
        with ProgressBar():
            with dask.config.set(pool=ThreadPool(threads)):
                good, mafs = dask.compute(good, mafs)
        G = G[:, good]
        # copy so the new column is not written into a view of the old frame
        bim = bim[good].copy()
        bim['mafs'] = mafs[good]
        print(' Genotype matrix shape after', G.shape)

    # make bim.i match the positions in the (possibly filtered) matrix
    bim = bim.reset_index(drop=True)
    bim['i'] = bim.index.tolist()
    return bim, fam, G
def __init__(self, tf_records_dir='/plink_tensorflow/data/', test_prop=0.8, raw_data_dir='/plink_tensorflow/data/'):
    '''
    Map a directory of plink files to dask arrays and pandas dataframes.

    @tf_records_dir: Directory forwarded to self.make_tf_records.
    @test_prop: The rough proportion of sample to dedicate to training.
        NOTE(review): the name says "test" while this description says
        "training" -- confirm which one is meant.
    @raw_data_dir: Directory containing PLINK formatted files for each study.
    '''
    self.test_prop = test_prop
    self.options = tf.python_io.TFRecordOptions(
        tf.python_io.TFRecordCompressionType.NONE)

    # map the input files into pandas dataframes and dask arrays
    # (only the top level of raw_data_dir is inspected)
    root, _dirs, files = next(os.walk(raw_data_dir))
    suffix = '.bim'
    # os.path.join works with or without a trailing separator on
    # raw_data_dir, and slicing removes the extension only at the end of the
    # file name rather than anywhere inside it.
    study_plink_prefixes = [
        os.path.join(root, f[:-len(suffix)]) for f in files
        if f.endswith(suffix)
    ]

    # read_plink -> (bim, fam, G)
    print('Generating Dask arrays from study PLINK files...')
    ## TODO: check that all studies contain the same variants
    self.study_arrays = {
        os.path.basename(f): read_plink(f)
        for f in study_plink_prefixes
    }
    print('Done')

    # total number of bim rows, summed over all studies
    self.m_variants = sum(
        [bim.shape[0] for (bim, fam, G) in self.study_arrays.values()])

    # write tf.records
    self.study_records = self.make_tf_records(
        tf_records_dir=tf_records_dir)
    print(self.study_records.values())
def main(args):
    """Simulate a TWAS from real genotype data and write two TSV reports.

    Parses ``args`` (a list of command-line arguments), reads the PLINK
    fileset at ``prefix``, standardises the genotypes, builds a regularised
    LD matrix and its Cholesky factor, then calls ``sim_beta``, ``sim_gwas``,
    ``sim_eqtl`` and ``compute_twas``.  Results are written to
    ``<output>.scan.tsv`` (per-SNP estimates) and ``<output>.summary.tsv``
    (run-level statistics).

    Returns 0.
    """
    argp = ap.ArgumentParser(
        description="Simulate TWAS using real genotype data",
        formatter_class=ap.ArgumentDefaultsHelpFormatter)
    argp.add_argument("prefix", help="Prefix to PLINK-formatted data")
    argp.add_argument("--ngwas",
                      default=100000,
                      type=int,
                      help="Sample size for GWAS panel")
    argp.add_argument("--nqtl",
                      default=500,
                      type=int,
                      help="Sample size for eQTL panel")
    argp.add_argument(
        "--model",
        choices=["10pct", "1pct", "1snp"],
        default="10pct",
        help=
        "SNP model for generating gene expression. 10pct = 10%% of SNPs, 1pct = 1%% of SNPs, 1snp = 1 SNP"
    )
    argp.add_argument("--eqtl-h2",
                      default=0.1,
                      type=float,
                      help="The narrow-sense heritability of gene expression")
    argp.add_argument(
        "--var-explained",
        default=0.01,
        type=float,
        help="Variance explained in complex trait by gene expression")
    argp.add_argument("-o", "--output", help="Output prefix")

    args = argp.parse_args(args)

    # read in plink data
    bim, fam, G = read_plink(args.prefix, verbose=False)
    # transpose so that axis 0 is the one averaged over below
    G = G.T

    # estimate LD for population from PLINK data
    # (n and p are kept as floats; p_int is the integer form used for np.eye)
    n, p = [float(x) for x in G.shape]
    p_int = int(p)
    mafs = np.mean(G, axis=0) / 2
    # centre and scale every column in place
    G -= mafs * 2
    G /= np.std(G, axis=0)

    # regularize so that LD is PSD
    LD = np.dot(G.T, G) / n + np.eye(p_int) * 0.1

    # compute cholesky decomp for faster sampling/simulation
    L = linalg.cholesky(LD, lower=True)

    # compute LD-scores for reports
    ldscs = np.sum(LD**2, axis=0)

    # NOTE(review): p is passed on as a float here -- confirm sim_beta
    # accepts that
    b_qtls = sim_beta(args.model, args.eqtl_h2, p)

    # simulate GWAS under assumption that expression => downstream trait
    gwas, alpha = sim_gwas(L, args.ngwas, b_qtls, args.var_explained)

    # sample eQTL reference pop genotypes from MVN approx and perform eQTL scan + fit LASSO
    eqtl, coef, LD_qtl = sim_eqtl(L, args.nqtl, b_qtls, args.eqtl_h2)

    # compute TWAS statistics
    score, within_var = compute_twas(gwas, coef, LD)

    min_p_val = np.min(gwas.pval.values)
    mean_chi2 = np.mean((gwas.beta.values / gwas.se.values)**2)
    med_chi2 = np.median((gwas.beta.values / gwas.se.values)**2)

    if within_var > 0:
        z_twas = score / np.sqrt(within_var)
        # two-sided p-value from the standard normal survival function
        p_twas = 2 * stats.norm.sf(np.abs(z_twas))
    else:
        # on underpowered/low-h2g genes LASSO can set all weights to 0 and effectively break the variance estimate
        z_twas = 0
        p_twas = 1

    # output the GWAS, eQTL, and LASSO estimates
    output = bim.drop(columns=["cm", "i"])
    output["maf"] = mafs
    output["ld.score"] = ldscs
    output["gwas.beta"] = gwas.beta
    output["gwas.se"] = gwas.se
    output["gwas.true"] = b_qtls * alpha
    output["eqtl.beta"] = eqtl.beta
    output["eqtl.se"] = eqtl.se
    output["eqtl.true"] = b_qtls
    output["eqtl.lasso"] = coef
    output.to_csv("{}.scan.tsv".format(args.output), sep="\t", index=False)

    # output a summary that contains the actual TWAS test statistic
    # ("h2ge" reports --var-explained and "h2g" reports --eqtl-h2)
    df = pd.DataFrame({
        "stat": [
            "ngwas", "nqtl", "nsnps", "h2ge", "h2g", "avg.ldsc", "min.gwas.p",
            "mean.gwas.chi2", "median.gwas.chi2", "twas.z", "twas.p"
        ],
        "values": [
            args.ngwas, args.nqtl,
            int(p), args.var_explained, args.eqtl_h2,
            np.mean(ldscs), min_p_val, mean_chi2, med_chi2, z_twas, p_twas
        ]
    })
    df.to_csv("{}.summary.tsv".format(args.output), sep="\t", index=False)

    return 0
"gene_to_chromosome.csv") gene_to_chromo = {} with open(gene_chromo_dir) as gene_chromo_file: gene_chromo_file.readline() for l in gene_chromo_file: l = l.split(',') if l[1] not in ['X', 'Y', 'MT']: gene_to_chromo[l[0].split('.')[0]] = l[1][:-1] thousand_G_dir = os.path.join(data_dir, "LDREF") one_KG_SNPs_dict = {} for i in range(1, 23): chromo_dir = os.path.join(thousand_G_dir, "1000G.EUR.{}".format(i)) (bim, fam, bed) = read_plink(chromo_dir, verbose=False) chromo_snp = np.array(bim['snp']) X = bed.compute().T # columns as SNP and row as number of individuals X_df = pd.DataFrame(data=X, columns=chromo_snp) one_KG_SNPs_dict[str(i)] = X_df # ## Randomly pick LD blocks # first loading block information snps_LD_blocks_dir = os.path.join(data_dir, "LD_blocks", "snps2LDblock.csv") snps_LD_blocks = {} with open(snps_LD_blocks_dir) as f: f.readline()
def read_geno(bfile, freq_thresh, threads, flip=False, check=False,
              max_memory=None, usable_snps=None):
    """
    Read the plink bed fileset, restrict to a given frequency (optional,
    freq_thresh), flip the sequence to match the MAF (optional; flip), and
    check if constant variants present (optional; check)

    :param max_memory: Maximum allowed memory; defaults to the memory
        currently available on the system
    :param bfile: Prefix of the bed (plink) fileset
    :param freq_thresh: If greater than 0, limit MAF to at least freq_thresh
    :param threads: Number of threads to use in computation
    :param flip: Whether to check for flips and to fix the genotype file
    :param check: Whether to check for constant sites
    :param usable_snps: Optional collection of SNP ids; when given, only
        variants whose bim.snp is in it are kept
    :return: Dataframes (bim, fam) and array corresponding to the bed fileset
        (transposed and rechunked relative to what read_plink returned)
    """
    # set Cache to protect memory spilling
    if max_memory is not None:
        available_memory = max_memory
    else:
        available_memory = psutil.virtual_memory().available
    cache = Chest(available_memory=available_memory)
    (bim, fam, g) = read_plink(bfile)  # read the files using pandas_plink
    m, n = g.shape  # get the dimensions of the genotype
    # remove invariant sites
    if check:
        g_std = g.std(axis=1)
        with ProgressBar():
            print('Removing invariant sites')
            with dask.config.set(pool=ThreadPool(threads)):
                idx = (g_std != 0).compute(cache=cache)
        g = g[idx, :]
        # keep bim.i aligned with the row positions of the filtered matrix
        bim = bim[idx].copy().reset_index(drop=True)
        bim.i = bim.index.tolist()
        del g_std, idx
        gc.collect()
    if usable_snps is not None:
        idx = bim[bim.snp.isin(usable_snps)].i.tolist()
        g = g[idx, :]
        bim = bim[bim.i.isin(idx)].copy().reset_index(drop=True)
        bim.i = bim.index.tolist()
    # compute the mafs if required
    # (n is the column count taken before any row filtering above)
    mafs = g.sum(axis=1) / (2 * n) if flip or freq_thresh > 0 else None
    if flip:
        # check possible flips
        flips = np.zeros(bim.shape[0], dtype=bool)
        flips[np.where(mafs > 0.5)[0]] = True
        bim['flip'] = flips
        vec = np.zeros(flips.shape[0])
        vec[flips] = 2
        # perform the flipping: |g - 2| for flagged variants, g otherwise
        g = abs(g.T - vec)
        del flips
        gc.collect()
    else:
        g = g.T
    # Filter MAF
    if freq_thresh > 0:
        print('Filtering MAFs smaller than', freq_thresh)
        print(' Genotype matrix shape before', g.shape)
        assert freq_thresh < 0.5
        good = (mafs < (1 - float(freq_thresh))) & (mafs > float(freq_thresh))
        with ProgressBar():
            with dask.config.set(pool=ThreadPool(threads)):
                good, mafs = dask.compute(good, mafs, cache=cache)
        # g was transposed above, so variants are now on axis 1
        g = g[:, good]
        print(' Genotype matrix shape after', g.shape)
        print(bim.shape)
        bim = bim[good]
        bim['mafs'] = mafs[good]
        del good
        gc.collect()
    bim = bim.reset_index(drop=True)  # Get the indices in order
    # Fix the i such that it matches the genotype indices
    bim['i'] = bim.index.tolist()
    # Get chunks appropriate with the number of threads
    g = g.rechunk(estimate_chunks(g.shape, threads, memory=available_memory))
    del mafs
    gc.collect()
    return bim, fam, g
def load_genetics(fname: str,
                  gene_list: str = None) -> (pd.DataFrame, pd.DataFrame):
    """
    Loads PPMI genotyping data stored at `fname`

    Parameters
    ----------
    fname : str
        Filepath to genotyping PLINK files
    gene_list : str, optional
        Path to pandas-compatible csv with at least 'snp', 'target',
        'odds_ratio' and 'study' columns; 'target' denotes the target
        (effect) allele and 'odds_ratio' its odds ratio in the population.

    Returns
    -------
    data : (N, G) :obj:`pandas.DataFrame`
        Wide-format genetics data where `N` is participants and `G` is SNPs
    info : (G, 3) :obj:`pandas.DataFrame`
        Information on SNPs in `data`: columns 'snp', 'odds_ratio' and
        'study' when `gene_list` is given, otherwise 'snp', 'a0' and 'a1'
    """

    try:
        from pandas_plink import read_plink
    except ImportError as err:
        raise ImportError('Loading genotyping data requires installing the '
                          '`pandas_plink` module. Please install that and try '
                          'again.') from err

    # make helper function for extracting SNP rs# from PLINK files
    def extract(x):
        try:
            return re.findall('[-_]*(rs[0-9]+)[-_]*', x)[0]
        except IndexError:
            return None

    # load PLINK data
    bim, fam, gen = read_plink(fname, verbose=False)
    # Series.get_values() was removed in pandas 1.0; .values is available in
    # every pandas version
    participant_id = pd.Series(fam.fid.values, name='participant')

    cols = ['snp', 'a0', 'a1']
    if gene_list is not None:
        # load gene list
        gene_info = pd.read_csv(gene_list).drop_duplicates(subset=['snp'])

        # check where SNPs match desired gene list & subset data
        inds = bim.snp.apply(extract).isin(gene_info.snp.dropna()).values
        # copy so the rename below does not write into a slice of the
        # original bim table
        bim, gen = bim[inds].copy(), gen[inds]

        # clean up ugly bim.snp names with just rs# of SNPs
        bim.loc[:, 'snp'] = bim.snp.map({f: extract(f) for f in bim.snp})

        # get allele info for making sense of the data
        cols += ['target', 'odds_ratio', 'study']
        info = pd.merge(bim, gene_info, on='snp')[cols]

        # if a0/a1 alleles don't match target, confusion ensues
        # drop the non-matched ones and then grab SNPs that need to be reversed
        info = info[~((info.a0 != info.target) & (info.a1 != info.target))]
        flip = info[info.a1 != info.target].snp
        info = info[['snp', 'odds_ratio', 'study']]
    else:
        # placeholders so below code doesn't fail
        info = bim[cols]
        # explicit dtype: an empty Series without one triggers a warning
        flip = pd.Series([], name='snp', dtype=object)

    # make wide-format participant x SNP dataframe
    data = pd.DataFrame(gen.compute().T, index=participant_id,
                        columns=bim.snp)

    # if multiple columns represent same snp, combine them
    # THEY SHOULD ALL BE THE SAME -- if they aren't, that's bad...
    data = (data.dropna(axis=1, how='all')
                .groupby(level=0, axis=1)
                .mean()
                .dropna(axis=0, how='all')
                .sort_index())

    # flip reverse-coded SNPs (values other than 0, 1, 2 map to None)
    data[flip] = data[flip].applymap(lambda x: {0: 2, 1: 1, 2: 0}.get(x))

    # retain only relevant SNPs in allele
    info = info[info.snp.isin(data.columns)]
    info = info.drop_duplicates(subset=['snp']).reset_index(drop=True)

    # return sorted data and info
    return data[info.snp], info
else: sys.stderr.write('invalid use_group_lasso value: true of false') if sys.argv[4].lower() == 'true': use_lasso = True elif sys.argv[4].lower() == 'false': use_lasso = False else: sys.stderr.write('invalid use_lasso value: true of false') root = 'data' gene_file = os.path.join(root, sys.argv[1]) kinship_file = os.path.join(root, sys.argv[2]) # load genotypes [bim, fam, G] = read_plink(gene_file) X = SP.array(G.compute()).astype(float) [n_f, n_s] = X.shape for i in xrange(X.shape[0]): m = X[i].mean() std = X[i].std() X[i] = (X[i] - m) / std X = X.T # simulate phenotype y = SP.array(list(fam['i'])).astype(float) # init debug = False
# Parse additional filters if args.subj_list is not None: logger.info('Extracting subjects from ' + args.subj_list) subjlist = pd.read_csv(args.subj_list,names=list({'IID'})) subjlist['IID'] = subjlist['IID'].apply(str) c, ia, ib = intersect_mtlb(subjlist['IID'],pheno['IID']) pheno = pheno.iloc[ib] pheno = pheno.reset_index(drop=True) logger.info(str(pheno.shape[0]) + ' subjects remains after keep\n') # Run the block correlation fit # imp = preprocessing.Imputer(strategy='mean', axis=1) logger.info('Processing genotype data: ' + geno_prefix) (bim, fam, geno) = read_plink(geno_prefix) # intersect data c, ia, ib = intersect_mtlb(fam['iid'],pheno['IID']) logger.info(str(len(ia)) + ' subjects found to have genotype data\n') # Final sample assignment pheno = pheno.iloc[ib] pheno = pheno.reset_index(drop=True) geno_ia = geno[:,ia] # Function for null model logger.info('Generating Null models\n') cph = CoxPHFitter() cph.fit(pheno[[T_name, event_name] + covname], T_name, event_col=event_name) # res_surv = cph.compute_residuals(pheno[[T_name, event_name] + covname], 'deviance').sort_index()['deviance']
def load_and_prepare_data(x_file, y_file, k_file, m_phe, cof_file):
    '''
    Load genotype, phenotype, kinship and cofactor data and align them.

    x_file:   genotype file; the text after the last "." selects the reader
              ('hdf5'/'h5py', 'csv' or 'plink')
    y_file:   phenotype csv with an 'accession_id' column; rows are averaged
              per accession
    k_file:   kinship file ('hdf5'/'h5py' or 'csv'), or 'not_prov' to derive
              the kinship from the genotypes through kinship()
    m_phe:    name of the phenotype column to use
    cof_file: cofactor csv with a 'cof' column, or 0 for none

    Returns (x_gen, kin_vr, y_phe_, markers, cof); genotype values are
    halved on load.
    '''
    if k_file != 'not_prov':
        type_k = k_file.split(".")[-1]
    type_x = x_file.split(".")[-1]

    y_phe = pd.read_csv(y_file, engine='python').sort_values(
        ['accession_id']).groupby('accession_id').mean()
    y_phe = pd.DataFrame({
        'accession_id': y_phe.index,
        'phenotype_value': y_phe[m_phe]
    })

    # np.int was removed in NumPy 1.24; the builtin int is what it aliased
    if type_x in ('hdf5', 'h5py'):
        snp = h5py.File(x_file, 'r')
        markers = np.asarray(snp['positions'])
        acc_x = np.asarray(snp['accessions'][:], dtype=int)
    elif type_x == 'csv':
        x_gen = pd.read_csv(x_file, index_col=0)
        markers = x_gen.columns.values
        acc_x = x_gen.index
        x_gen = np.asarray(x_gen, dtype=np.float32) / 2
    elif type_x.lower() == 'plink':
        my_prefix = x_file.split(".")[0]
        (bim, fam, bed) = read_plink(my_prefix)
        acc_x = np.array(fam[['fid']], dtype=int).flatten()
        markers = np.array(bim[['snp']]).flatten()
    else:
        sys.exit("Only hdf5, h5py, plink and csv files are supported")

    if k_file != 'not_prov':
        if type_k in ('hdf5', 'h5py'):
            k = h5py.File(k_file, 'r')
            acc_k = np.asarray(k['accessions'][:], dtype=int)
        elif type_k == 'csv':
            k = pd.read_csv(k_file, index_col=0)
            acc_k = k.index
            k = np.array(k, dtype=np.float32)

    # boolean masks selecting the accessions shared by genotypes and
    # phenotypes
    acc_y = np.asarray(y_phe[['accession_id']]).flatten()
    acc_isec = [isec for isec in acc_x if isec in acc_y]

    idx_acc = list(map(lambda itt: itt in acc_isec, acc_x))
    idy_acc = list(map(lambda itt: itt in acc_isec, acc_y))
    if k_file != 'not_prov':
        idk_acc = list(map(lambda itt: itt in acc_isec, acc_k))

    if cof_file != 0:
        cof = pd.read_csv(cof_file, index_col=0)
        idc = cof.index
        cof = np.array(cof['cof'])
        acc_isec = [isec for isec in idc if isec in acc_y]
        #idc_acc = list(map(lambda x: x in acc_isec, idc))
        # NOTE(review): this tests the genotype mask, not the cofactor ids
        # read just above -- confirm that is intended.
        if not all(idx_acc):
            print(''' accessions ids in the covariate file must be identical to the ones in the phenotype file ''')
            sys.exit()
    else:
        cof = 0

    # positional `axis` was removed from DataFrame.drop in pandas 2.0
    y_phe_ = np.asarray(y_phe.drop('accession_id', axis=1),
                        dtype=np.float32)[idy_acc, :]

    if type_x in ('hdf5', 'h5py'):
        x_gen = np.asarray(snp['snps'][0:(len(snp['snps']) + 1), ],
                           dtype=np.float32)[:, idx_acc].T
        # rows ordered by ascending accession id
        x_gen = x_gen[np.argsort(acc_x[idx_acc]), :]
        if k_file != 'not_prov':
            k_1 = np.asarray(k['kinship'][:])[idk_acc, :]
            kin_vr = k_1[:, idk_acc]
            kin_vr = kin_vr[np.argsort(acc_x[idx_acc]), :]
            kin_vr = kin_vr[:, np.argsort(acc_x[idx_acc])]
        else:
            kin_vr = kinship(x_gen)
    elif type_x.lower() == 'plink':
        x_gen = np.asarray(bed.compute() / 2,
                           dtype=np.float32)[:, idx_acc].T
        if k_file != 'not_prov':
            k_1 = np.asarray(k['kinship'][:])[idk_acc, :]
            kin_vr = k_1[:, idk_acc]
            kin_vr = kin_vr[np.argsort(acc_x[idx_acc]), :]
            kin_vr = kin_vr[:, np.argsort(acc_x[idx_acc])]
        else:
            kin_vr = kinship(x_gen)
    else:
        x_gen = x_gen[idx_acc, :]
        if k_file != 'not_prov':
            k_1 = k[idk_acc, :]
            kin_vr = k_1[:, idk_acc]
        else:
            kin_vr = kinship(x_gen)

    print("data has been imported")
    return x_gen, kin_vr, y_phe_, markers, cof
def fetch_dosage(prefix, verbose):
    """Return the transposed genotype array of the PLINK fileset at ``prefix``.

    ``verbose`` is forwarded to ``read_plink``.
    """
    from pandas_plink import read_plink

    # only the third element (the genotype array) of the result is used
    _bim, _fam, bed = read_plink(prefix, verbose=verbose)
    return bed.T
def pairwise_fst(prefix):
    # Computes pairwise Fst between all populations (unique 'fid' values) of
    # the PLINK fileset at `prefix` by shelling out to plink, then collects
    # the results into output/pairwise_fst.csv and launches `Rscript matrix.R`.
    # Side effects: creates ./output (os.mkdir raises if it already exists),
    # changes the working directory several times, and writes/moves many files.
    #get unique fids
    (bim, fam, bed) = read_plink(prefix, verbose=True)
    print("Bfiles mapped")
    unique_fam = pd.DataFrame(fam['fid'].unique())
    fam_list = unique_fam[0].tolist()
    fam_list2 = unique_fam[0].tolist()
    #create pairwise df
    # full cartesian product, so every pair appears in both orders and each
    # population is also paired with itself
    index = pd.MultiIndex.from_product([fam_list, fam_list2],
                                       names=['pop1', 'pop2'])
    paired_df = pd.DataFrame(index=index).reset_index()
    paired_df['pops'] = paired_df[['pop1', 'pop2']].agg('.'.join, axis=1)
    os.mkdir('output')
    os.chdir('output')
    paired_df['pops'].to_csv('paired_pops.csv', index=False, header=False)
    print("Populations paired")
    #Now we need to calculate pairwise FST for all pairs in the pairwise csv file
    #What needs to be done: create clust files for each population pair
    #This means for each pair we need to grab all the info from the fam file and print to a clust
    #Then we use the clust to calculate fst for the pair of populations
    paired_list1 = paired_df['pop1'].to_list()
    paired_list2 = paired_df['pop2'].to_list()
    # NOTE(review): original_stdout is never used in the visible code
    original_stdout = sys.stdout
    filtered_fam = pd.DataFrame(fam[['fid', 'iid']])
    filtered_fam['group'] = filtered_fam['fid']
    # one tab-separated <pop1>.<pop2>.clust file per pair
    for (a, b) in zip(paired_list1, paired_list2):
        filtered_fam.loc[filtered_fam['fid'].isin([a, b])].to_csv(
            str(a) + '.' + str(b) + '.clust',
            encoding='utf-8',
            sep='\t',
            index=False,
            header=False)
    os.chdir('../')
    #calculate FST with plink
    # NOTE(review): `prefix` is concatenated unquoted into a shell command
    # line -- confirm it always comes from a trusted source.
    FST = subprocess.Popen('for i in $(less output/paired_pops.csv); do \ plink --bfile' + ' ' + prefix + ' ' + '--within output/$i.clust --double-id --fst \ --allow-no-sex --out output/$i; done', shell=True)
    FST.communicate()
    print("Fst Calculated")
    #Now we need the pop names and mean fst values from the log files
    #create lists for pops and FST
    os.chdir('output')
    #grab the values we need from all the files
    pops = []
    fst = []
    pattern1 = re.compile('Mean Fst', re.IGNORECASE)
    pattern2 = re.compile(
        'Error: --fst requires at least two nonempty clusters.')
    # First pass: the pair name is taken from the 7th line of each log.
    # NOTE(review): str.strip removes any of the listed *characters*, not the
    # literal prefix, so names beginning or ending with one of " -outp/" get
    # truncated -- confirm.
    for i, file in enumerate(os.listdir()):
        if file.endswith('.log'):
            with open(str(file), 'rt') as f:
                lines = f.readlines()
                pops.append(lines[6].strip(' --output/ ').rstrip('\n'))
    # Second pass: collect every line that matches either pattern.
    # NOTE(review): pops and fst are zipped positionally below; a log with
    # zero or several matching lines would shift the pairing -- confirm.
    for i, file in enumerate(os.listdir()):
        if file.endswith('.log'):
            with open(str(file), 'rt') as f:
                for line in f:
                    if pattern1.search(line) or pattern2.search(line) != None:
                        fst.append(line)
    # from here on the local name shadows this function's own name
    pairwise_fst = list(zip(pops, fst))
    pairwise_fst = pd.DataFrame(pairwise_fst, columns=['pops', 'fst'])
    # NOTE(review): lstrip also takes a character set; a value such as "nan"
    # consists only of characters in that set and would be stripped away
    # entirely -- confirm.
    pairwise_fst['fst'] = pairwise_fst['fst'].map(
        lambda x: x.lstrip('Mean Fst estimate: ').rstrip('\n'))
    # pairs for which plink reported the cluster error get an Fst of 0
    pairwise_fst['fst'] = pairwise_fst['fst'].replace(
        'Error: --fst requires at least two nonempty clusters.',
        0,
        regex=True)
    # blank out any cell containing 'End time:'
    pairwise_fst = pairwise_fst.mask(
        pairwise_fst.applymap(lambda s: 'End time:' in s
                              if isinstance(s, str) else False))
    pairwise_fst['fst'] = pd.to_numeric(pairwise_fst['fst'])
    # clamp to 0 (non-positive values, and NaN since the comparison is false)
    pairwise_fst['fst'] = pairwise_fst['fst'].apply(lambda x: x
                                                    if x > 0 else 0)
    # pair names are split on '.', so population names are assumed not to
    # contain a dot themselves
    pairwise_fst['col_name'] = pairwise_fst['pops'].str.split('.').map(
        lambda x: x[1])
    pairwise_fst['row_name'] = pairwise_fst['pops'].str.split('.').map(
        lambda x: x[0])
    pairwise_fst = pairwise_fst.pivot(index='row_name',
                                      columns='col_name',
                                      values='fst')
    pairwise_fst.index.name = None
    pairwise_fst.columns.name = None
    pairwise_fst.to_csv('pairwise_fst.csv', sep=',')
    # tidy up: the shell commands are all started first and waited on after
    clean4 = subprocess.Popen('mkdir PLINK_out', shell=True)
    clean5 = subprocess.Popen('mv *.fst PLINK_out', shell=True)
    clean6 = subprocess.Popen('mv *.log PLINK_out', shell=True)
    clean7 = subprocess.Popen('mv *.clust PLINK_out', shell=True)
    clean8 = subprocess.Popen('mv *.nosex PLINK_out', shell=True)
    clean4.communicate()
    clean5.communicate()
    clean6.communicate()
    clean7.communicate()
    clean8.communicate()
    os.chdir('../')
    #now output heatmap from R (this Rscript can be changed depending on how you would like your figure to look
    #Simply edit the script or write a new one and pipe it through this command to customize the output
    make_matrix = subprocess.Popen('Rscript matrix.R', shell=True)