def add_pheno(plink_in, multigenecity, out, h=0.85, p_cases=0.5):
    plink_file = plinkfile.open(plink_in)
    if not plink_file.one_locus_per_row():
        print("This script requires that snps are rows and samples columns.")
        exit(1)

    sample_list = plink_file.get_samples()
    locus_list = plink_file.get_loci()
    n = len(sample_list)
    p = len(locus_list)

    # Pick causal variants evenly spaced across the file, away from the edges
    edge_offset = 100
    causal_mut_index = np.linspace(edge_offset, p - edge_offset,
                                   multigenecity, dtype=int)
    gen_effect_size_unnormalized = {
        item: np.random.normal(loc=0, scale=float(h) / np.sqrt(multigenecity))
        for item in causal_mut_index}
    print(causal_mut_index)

    causal_mutations = set()
    mutation_meta = {}

    # Accumulate the polygenic risk score over the causal variants
    prs = np.zeros(n)
    for i, variant in enumerate(locus_list):
        row = next(plink_file)
        if i in causal_mut_index:
            genotypes = np.fromiter(row, dtype=float)
            # Impute missing calls (coded 3) with the mean of observed genotypes
            genotypes[genotypes == 3] = np.mean(genotypes[genotypes != 3])
            prs += genotypes * gen_effect_size_unnormalized[i]
    plink_file.close()
    del causal_mut_index, gen_effect_size_unnormalized

    # Combine normalized genetic and environmental components
    env_rs_unnormalized = np.random.normal(loc=0, scale=np.sqrt(1 - h**2), size=n)
    gen_effect_size = h * (prs - np.mean(prs)) / np.std(prs)
    env_effect_size = (np.sqrt(1 - h**2)
                       * (env_rs_unnormalized - np.mean(env_rs_unnormalized))
                       / np.std(env_rs_unnormalized))
    burden = gen_effect_size + env_effect_size
    sorted_i = np.argsort(burden)[::-1]
    ncases = int(n * p_cases)
    cases_i = set(sorted_i[:ncases])

    # write new plink file
    for i, sample in enumerate(sample_list):
        sample_list[i].affection = int(i in cases_i)
    # plink_write = plinkfile.create(out, sample_list)
    plink_write = plinkfile.WritablePlinkFile(out, sample_list)

    # plinkio doesn't support seeking, so we close the file when we no longer
    # need it and reopen it here
    plink_file = plinkfile.open(plink_in)
    for i, variant in tqdm.tqdm(enumerate(locus_list)):
        row = next(plink_file)
        plink_write.write_row(variant, row)
    plink_write.close()
    plink_file.close()
def main(plink_fn, pheno_fn, output_plink_fn):
    plink_file = plinkfile.open(plink_fn)
    if not plink_file.one_locus_per_row():
        print("This script requires that snps are rows and samples columns.")
        exit(1)

    sample_list = plink_file.get_samples()
    for x in sample_list:
        print(x.iid)

    pheno_df = pd.read_csv(pheno_fn, sep="\t")
    iids = pheno_df['Strain'] + '__' + pheno_df['Animal_Id'].astype(str)
    # print(iids)
    strain_count = pheno_df.groupby(['Strain']).count()['Animal_Id'].to_dict()
    # print(strain_count)

    output_sample_list = create_output_sample_list(sample_list, iids)
    # for x in output_sample_list:
    #     print(x.iid)
    out_plink = plinkfile.create(output_plink_fn, output_sample_list)

    locus_list = plink_file.get_loci()
    count = 0
    for locus, row in zip(locus_list, plink_file):
        count += 1
        if count % 1000 == 0:
            print('At SNP {}'.format(count))
        # Map each input sample's genotype to its strain, then emit genotypes
        # in the order of the output sample list
        strain_genotype_dict = {x[0].iid: x[1] for x in zip(sample_list, row)}
        sample_strains = [sample.iid.split('__')[0].replace('-', '')
                          for sample in output_sample_list]
        output_row = [strain_genotype_dict[x] for x in sample_strains]
        out_plink.write_row(locus, output_row)
    return
def test_read_write():
    with tempfile.TemporaryDirectory() as temp_dir:
        plink_prefix = os.path.join(temp_dir, "test")

        samples = [
            Sample("fid1", "iid1", "0", "0", 0, 0),
            Sample("fid2", "iid2", "0", "0", 0, 1)
        ]
        loci = [
            Locus(1, "chr1:1", 1.0, 1, "A", "C"),
            Locus(2, "chr1:2", 2.0, 2, "G", "T")
        ]
        rows = [[0, 1], [1, 2]]

        writer = plinkfile.create(plink_prefix, samples)
        for i, locus in enumerate(loci):
            writer.write_row(locus, rows[i])
        writer.close()

        reader = plinkfile.open(plink_prefix)
        assert samples == reader.get_samples()
        assert loci == reader.get_loci()
        for row, reader_row in zip(rows, reader):
            assert row == list(reader_row)
def getdata(chr):
    total = 0
    sel = 1
    infile = "{}/2_Exome_36K_Ws_chr{}".format(data_dir, chr)
    print(infile)
    plink_file = plinkfile.open(infile)
    # sample_list = plink_file.get_samples()
    locus_list = plink_file.get_loci()
    # pl.cplinkio.reset_row(pl.plink_file.handle)

    snps = dict()
    for locus in locus_list:
        total += 1
        if not total % 10000:
            print(total, sel)
        if locus.chromosome == chr:
            row = next(plink_file)
            counts = row.allele_counts()
            maf = (2 * counts[2] + counts[1]) / (2. * float(counts[0] + counts[1] + counts[2]))
            if not total % 10000:
                print(total, sel, maf)
            if not sel % 1000:
                print(total, sel, maf)
            sel += 1
            snps[locus.name] = [int(locus.bp_position), maf, row]
    return snps
def get_pred_exp_correlation_matrix(ref_panel, locus_genes):
    # Get predicted expression correlations across STARNET individuals
    assert top_gene in locus_genes
    model_weights = {}
    for gene in locus_genes:
        model_weights[gene] = get_gene_model(gene, ref_panel)
    model_weights = pd.DataFrame(model_weights, columns=model_weights)
    model_weights.fillna(0, inplace=True)
    rs_numbers_in_block = model_weights.index.values
    model_weights = model_weights.values

    STARNET_rs_number_file = 'STARNET_rs_numbers.txt'
    open(STARNET_rs_number_file, 'w').write('\n'.join(rs_numbers_in_block))
    weight_dir = f'fusion_twas/WEIGHTS/{ref_panel}'
    plink_file = f'{weight_dir}/{top_gene}_locus'
    individuals_with_expression_file = f'{weight_dir}/individuals_with_expression_thread_0.txt'
    subprocess.check_call(
        f'plink --bfile data/STARNET/genotypes/STARNET '
        f'--extract {STARNET_rs_number_file} '
        f'--keep-fam {individuals_with_expression_file} '
        f'--maf 1e-10 '  # re-filter on MAF because we've removed some indivs
        f'--make-bed --out {plink_file} > /dev/null', shell=True)

    plinkio_file = plinkfile.open(plink_file)
    variants = np.array(tuple(plinkio_file))
    plinkio_rs_numbers = np.array(
        [locus.name for locus in plinkio_file.get_loci()])
    sorted_order = np.argsort(plinkio_rs_numbers)[np.argsort(
        np.argsort(rs_numbers_in_block))]
    assert (plinkio_rs_numbers[sorted_order] == rs_numbers_in_block).all()
    variants = variants[sorted_order]

    predicted_expression = model_weights.T.dot(variants)
    pred_exp_correlation_matrix = np.corrcoef(predicted_expression)
    return pred_exp_correlation_matrix
def test_merge(self, my_result_dir, my_interim_dir, my_processed_dir):
    # create a temporary directory using the context manager
    with tempfile.TemporaryDirectory() as tmpdirname:
        working_dir = pathlib.Path(tmpdirname)
        results_dir = DATA_DIR / "processed"

        # assign return value to mocked property
        my_result_dir.return_value = results_dir
        my_interim_dir.return_value = working_dir
        my_processed_dir.return_value = working_dir

        result = self.runner.invoke(
            merge_datasets,
            [
                "--species",
                "sheep",
                "--assembly",
                "OAR3"
            ]
        )

        self.assertEqual(0, result.exit_code, msg=result.exception)

        smarter_tag = f"SMARTER-OA-OAR3-top-{__version__}"
        plink_path = working_dir / "OAR3" / smarter_tag
        plink_file = plinkfile.open(str(plink_path))

        sample_list = plink_file.get_samples()
        locus_list = plink_file.get_loci()

        self.assertEqual(len(sample_list), 2)
        self.assertEqual(len(locus_list), 3)
def compare(hdf_file, plink_file):
    pfile = plinkfile.open(plink_file)
    if not pfile.one_locus_per_row():
        logging.error("""This script requires the snps to be rows
            and samples to be columns.""")
        sys.exit(1)
    locus_list = pfile.get_loci()
    pset = {(l.chromosome, l.bp_position) for l in locus_list}
    # pset = {item for item in pset if item[0] == 1}  # TODO remove
    total_intersection = 0
    total_hset_length = 0
    total_pset_length = len(pset)
    with h5py.File(hdf_file, "r") as hfile:
        for key in hfile.keys():
            if key == "meta":
                continue
            ikey = int(key)
            hset = {(ikey, int(pos)) for pos in hfile[key].keys()}
            # testpset = {(i[0], i[1]) for i in pset if i[0] == ikey}
            # if len(testpset - hset) > 0 or len(hset - testpset) > 0:
            #     pdb.set_trace()
            pset_len = len(pset)
            # hmp = sorted([i for _, i in hset - pset])
            # pmh = sorted([i for _, i in pset - hset])
            pset = pset - hset
            total_intersection += pset_len - len(pset)
            total_hset_length += len(hset)
    return (total_intersection, total_pset_length, total_hset_length)
def test_import_from_text_plink(self, my_working_dir, my_result_dir):
    # create a temporary directory using the context manager
    with tempfile.TemporaryDirectory() as tmpdirname:
        working_dir = pathlib.Path(tmpdirname)
        results_dir = working_dir / "results"

        # assign return value to mocked property
        my_working_dir.return_value = working_dir
        my_result_dir.return_value = results_dir

        # copy test data files
        self.link_files(working_dir)

        result = self.runner.invoke(import_from_plink, [
            "--dataset",
            "test.zip",
            "--file",
            "plinktest",
            "--chip_name",
            self.chip_name,
            "--assembly",
            "OAR3"
        ])

        self.assertEqual(0, result.exit_code, msg=result.exception)
        self.assertEqual(SampleSheep.objects.count(), 2)

        # check imported chip_name attribute
        for sample in SampleSheep.objects:
            self.assertEqual(sample.chip_name, self.chip_name)

        plink_path = results_dir / "OAR3" / "plinktest_updated"
        plink_file = plinkfile.open(str(plink_path))

        sample_list = plink_file.get_samples()
        locus_list = plink_file.get_loci()

        self.assertEqual(len(sample_list), 2)
        self.assertEqual(len(locus_list), 3)
def test_iter(self):
    pf = plinkfile.open("./data/wgas")
    num_rows = 0
    for row in pf:
        num_rows += 1
    self.assertEqual(num_rows, 228694)
def load_plinkfile(basepath):
    plink_file = plinkfile.open(basepath)
    sample_list = plink_file.get_samples()
    locus_list = plink_file.get_loci()
    my_array = np.zeros((len(plink_file.get_loci()), len(plink_file.get_samples())))
    for i, el in enumerate(plink_file):
        my_array[i] = el
    return (sample_list, locus_list, my_array.astype(int))
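# A minimal usage sketch for load_plinkfile above (not from the original source).
# "data/example" is a hypothetical PLINK prefix standing in for a real
# .bed/.bim/.fam triplet.
samples, loci, genotypes = load_plinkfile("data/example")
print("{} loci x {} samples".format(len(loci), len(samples)))
# plinkio encodes genotypes as 0/1/2, with 3 marking a missing call
print(genotypes[:3, :5])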
def check_plinkfile(plink_fn):
    out_plink = plinkfile.open(plink_fn)
    samples = out_plink.get_samples()
    locuses = out_plink.get_loci()
    count = 0
    for locus, row in zip(locuses, out_plink):
        for sample, genotype in zip(samples, row):
            print("Individual {0} has genotype {1} for snp {2}.".format(
                sample.iid, genotype, locus.name))
        count += 1
        if count >= 2:
            break
    print(len(locuses))
def run_gwas(imputed, toPCA, out, npcs=5):
    # Compute PCA
    plinkpca = plinkfile.open(toPCA)
    if not plinkpca.one_locus_per_row():
        print("The plink file is f****d")
        exit(1)
    sample_list = plinkpca.get_samples()
    locus_list = plinkpca.get_loci()

    demo = pd.read_table('data/popres_European.ind', delimiter='\t')
    famIDs = set(int(i.fid) for i in sample_list)
    demography = [
        row.country for _, row in demo.iterrows() if row.famID in famIDs
    ]
    demo = pd.read_table('data/popres_European.ind', delimiter='\t')
    ids = [int(row.famID) for _, row in demo.iterrows() if row.famID in famIDs]
    del demo, famIDs

    n = len(sample_list)
    p = len(locus_list)
    gen_mat = np.empty((n, p), dtype=np.float32)
    loc = 0
    for i, row in enumerate(plinkpca):
        arr = np.fromiter(row, dtype=np.float32)
        arr[arr == 3] = np.nan
        sd = np.nanstd(arr)
        mu = np.nanmean(arr)
        arr -= mu
        arr[np.isnan(arr)] = 0
        arr /= sd
        gen_mat[:, loc] = arr  # np.fromiter(row, dtype=np.float32)
        loc += 1

    pca = decomp.PCA()
    U, S, V = pca._fit_truncated(gen_mat, n_components=npcs,
                                 svd_solver='arpack')
    np.savetxt(out + '.V.txt', V)
    np.savetxt(out + '.U.txt', U)
    np.savetxt(out + '.sigma.txt', S)
    np.savetxt(out + '.ids.txt', ids, fmt='%i')
    with open(out + '.countries', 'w') as f:
        f.write("\n".join(demography))
    # del S, V, gen_mat
    U_id_dict = dict((key, value) for (key, value) in zip(ids, U[:, :npcs]))
    run_regressions(imputed, U_id_dict, out + 'betas.txt', npcs)
def load_genotypes(self):
    """ Load the plink BED format genotype data file.

    Assumes samples in columns and SNP loci in rows. Needs plinkio.
    https://github.com/fadern/libplinkio
    """
    from plinkio import plinkfile
    bed_file = plinkfile.open(self.file_name)
    for counter, row in enumerate(bed_file):
        self.genotype[counter, :] = list(row)
        if counter % 100000 == 99999:
            print(counter + 1)
    bed_file.close()
def add_pheno(plink_in, multigenecity, out, h=0.85, p_cases=0.5):
    plink_file = plinkfile.open(plink_in)
    if not plink_file.one_locus_per_row():
        print("This script requires that snps are rows and samples columns.")
        exit(1)

    sample_list = plink_file.get_samples()
    locus_list = plink_file.get_loci()
    n = len(sample_list)
    p = len(locus_list)
    fids = np.array([item.fid for item in sample_list])
    iids = np.array([item.iid for item in sample_list])

    edge_offset = 100
    causal_mut_index = np.linspace(
        edge_offset, p - edge_offset, multigenecity, dtype=int)
    gen_effect_size_unnormalized = {
        item: np.random.normal(loc=0, scale=float(h) / np.sqrt(multigenecity))
        for item in causal_mut_index}

    prs = np.zeros(n)
    for i, variant in enumerate(locus_list):
        row = next(plink_file)
        if i in causal_mut_index:
            genotypes = np.fromiter(row, dtype=float)
            genotypes[genotypes == 3] = np.mean(genotypes[genotypes != 3])
            prs += genotypes * gen_effect_size_unnormalized[i]
    plink_file.close()
    del causal_mut_index, gen_effect_size_unnormalized

    # Draw random environmental effects
    env_rs_unnormalized = np.random.normal(
        loc=0, scale=np.sqrt(1 - h**2), size=n)
    gen_effect_size = h * (prs - np.mean(prs)) / np.std(prs)
    env_effect_size = (np.sqrt(1 - h**2)
                       * (env_rs_unnormalized - np.mean(env_rs_unnormalized))
                       / np.std(env_rs_unnormalized))
    burden = gen_effect_size + env_effect_size
    sorted_i = np.argsort(burden)[::-1]
    ncases = int(n * p_cases)
    cases_i = sorted_i[:ncases]

    affection = np.zeros(n, dtype=np.int8)
    affection[cases_i] = 2
    affection[affection == 0] = 1
    towrite = np.column_stack((fids, iids, affection))
    np.savetxt(out, towrite, delimiter='\t', fmt=['%s', '%s', '%s'],
               header='FID\tID\tpheno')
def read_geno_mat_plinkio(bed_location):
    """
    Reads in a genotype matrix using plinkio.
    Takes a lot of memory, and a lot of time, if there are a lot of individuals.

    :param bed_location: path of the bed file for reading. Make sure you've
        limited it enough before you continue.
    :return: genotype matrix, and the plinkio file object
    """
    plink_file = plinkfile.open(bed_location)
    genotype_mat = np.zeros((len(plink_file.loci), len(plink_file.samples)),
                            dtype=float)
    snp_names = [x.name for x in plink_file.loci]
    i_ids = [x.fid + " " + x.iid for x in plink_file.samples]

    for i in range(len(plink_file.loci)):
        check_object = next(plink_file)
        genotype_mat[i, :] = check_object

    return genotype_mat, plink_file
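# A minimal usage sketch for read_geno_mat_plinkio above (not from the original
# source). "small_subset" is a hypothetical PLINK prefix, ideally one already
# restricted to the variants of interest, as the docstring advises.
geno, pfile = read_geno_mat_plinkio("small_subset")
print(geno.shape)  # (n_variants, n_individuals); 3 marks a missing call
pfile.close()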
def getdata2(chr, df_catalog):
    snps_cmn = dict()
    snp_list_cmn = ['exm-' + str(x) for x in df_catalog.SNP.tolist()]
    infile = "{}/2_Exome_36K_Ws_chr{}".format(data_dir, chr)
    plink_file = plinkfile.open(infile)
    locus_list = plink_file.get_loci()
    total = 0
    for locus in locus_list:
        total += 1
        row = next(plink_file)
        if (locus.name in snp_list_cmn) and (locus.chromosome == chr):
            counts = row.allele_counts()
            maf = (2 * counts[2] + counts[1]) / (2. * float(counts[0] + counts[1] + counts[2]))
            if not total % 10000:
                print(total, maf)
            snps_cmn[locus.name] = [int(locus.bp_position), maf, row]
    return snps_cmn
def snps_match(plinkName, store_name, position_dset=None):
    # WARNING: this only works if positions are unique.
    with h5py.File(store_name, 'r', libver='latest') as store:
        # check the plink file
        plink_file = plinkfile.open(plinkName)
        locus_list = plink_file.get_loci()
        plink_file.close()
        plinkSet = set((l.chromosome, l.bp_position) for l in locus_list)
        del locus_list
        len_plink = len(plinkSet)
        if position_dset is None:
            position_dset = 'positions'
        for key in store:
            if key == 'meta':
                continue
            positions = store["{}/{}".format(key, position_dset)][:]
            ikey = int(key)
            hset = set((ikey, int(pos)) for pos in positions)
            len_plink -= len(hset)
            plinkSet -= hset
            if len(plinkSet) == 0 and len_plink == 0:
                return True
    return False
def main(args):
    usage = """python %s <plink root> <h5 file>
    Convert binary PLINK files into h5 file.
    E.g.: py plink2h5.py mydata_final_clean mydata_final_clean.h5\n""" % args[0]
    if len(args) != 3:
        sys.stderr.write(usage)
        sys.exit(0)
    plinkRoot = args[1]
    h5fname = args[2]
    plinkTitle = plinkRoot.split("/")[-1]

    # Read binary PLINK files
    plinkF = pf.open(plinkRoot)
    numSnps = len(plinkF.get_loci())
    numSamples = len(plinkF.get_samples())
    print("%d SNPs x %d samples" % (numSnps, numSamples))

    # Create the empty array to store genotypes
    atom = tables.Int8Atom()
    h5F = tables.open_file(h5fname, 'w', title=plinkTitle)
    genotype = h5F.create_carray(h5F.root, 'genotype', atom,
                                 (numSnps, numSamples), title='Genotype',
                                 filters=tables.Filters(complevel=5,
                                                        complib='blosc'))
    # populate
    for counter, row in enumerate(plinkF):
        genotype[counter, :] = list(row)
        if counter % 10000 == 9999:
            print(counter + 1, 'SNPs read')
    plinkF.close()
    h5F.close()
def __init__(self, path, chrm=None, fids=None, iids=None):
    self.chrm = chrm
    self.handle = plinkfile.open(path)
    if not self.handle.one_locus_per_row():
        raise Exception("This script requires that SNPs are rows and samples columns.")
    samples = self.handle.get_samples()
    self._subset_idxs = None
    if fids is None and iids is None:
        self._samples = dict((s.iid, s) for s in samples)
    else:
        if fids is not None:
            fids = set(fids)
        if iids is not None:
            iids = set(iids)

        def keep(s):
            return ((fids is None or s.fid in fids)
                    and (iids is None or s.iid in iids))

        self._subset_idxs = set(i for i, sample in enumerate(samples)
                                if keep(sample))
        self._samples = index_map(samples[i].iid for i in self._subset_idxs)
    self._loci = self.handle.get_loci()
    self._iter = zip(self._loci, self.handle)
def run_regressions(plink_file, U_dict, out_file, npcs, buf=100):
    plink_file = plinkfile.open(plink_file)
    if not plink_file.one_locus_per_row():
        print("The plink file is f****d")
        exit(1)
    locus_list = plink_file.get_loci()
    sample_list = plink_file.get_samples()
    n = len(sample_list)
    p = len(locus_list)
    y = np.array([sample.affection for sample in sample_list])
    X = np.empty((n, npcs + 1))
    betas = np.empty((buf, 2 * X.shape[1] + 2), dtype=np.float32)
    # X_design = np.ones((n, 2))
    V = np.matrix(np.zeros(shape=(X.shape[0], X.shape[0])))
    X[:, 1:] = [U_dict[int(sample.iid)] for sample in sample_list]
    X[:, 1:] /= np.std(X[:, 1:], axis=0)
    covp = X.shape[1]

    # High C corresponds to less regularization.
    model = LogisticRegression(fit_intercept=False, tol=1e-5, C=1e4)
    k = 0
    # fit nuisance
    model.fit(X[:, 1:], y)
    y_model = model.predict_proba(X[:, 1:])
    l_null = log_loss(y, y_model, normalize=False)
    with open(out_file, 'w') as out_f:
        i = 0
        logging.info("Iterating over SNPs")
        for j, row in tqdm.tqdm(enumerate(plink_file), total=p):
            locus = locus_list[j]
            arr = np.fromiter(row, dtype=np.float32)
            mu = np.mean(arr[arr != 3])
            std = np.std(arr[arr != 3])
            arr[arr == 3] = mu
            arr -= mu
            if std > 0:
                arr /= std
                X[:, 0] = arr
                model.fit(X, y)
                # Wald Test
                y_model = model.predict_proba(X)
                X_design = X
                np.fill_diagonal(V, np.multiply(y_model[:, 0], y_model[:, 1]))
                covLogit = np.linalg.inv(X_design.T * V * X_design)
                coefs = np.array(model.coef_)  # np.insert(model.coef_, 0, model.intercept_)
                z = (coefs / np.sqrt(np.diag(covLogit))) ** 2
                # Chi-squared test
                l_fit = log_loss(y, y_model, normalize=False)
                D = l_fit - l_null
                p = chi2.sf(z, 1)
                betas[i, :covp] = coefs
                betas[i, covp:2 * covp] = p
                betas[i, 2 * covp] = D
            else:
                betas[i, :] = np.nan
            betas[i, 2 * covp + 1] = locus.chromosome
            i += 1
            if i == buf:
                i = 0
                np.savetxt(out_f, betas, delimiter='\t')
        np.savetxt(out_f, betas[:i, :], delimiter='\t')  # write the remaining
    logging.info("Finished iterating")
def test_get_path(self):
    path = "./data/wgas"
    pf = plinkfile.open(path)
    self.assertEqual(path, pf.get_path())
def plinkToH5(client_config, env):
    """Gets plink prefix, produces an HDF file with the same prefix"""
    pfile = client_config['plinkfile']
    store_name = shared.get_plink_store(pfile)
    logger.info(f'Opening plinkfile: {pfile}')
    try:
        plink_file = plinkfile.open(pfile)
    except MemoryError as e:
        logger.error('MemoryError!')
        logger.error(e)
    if not plink_file.one_locus_per_row():
        logger.error("""This script requires that snps are rows
            and samples columns.""")
        sys.exit(1)
    sample_list = plink_file.get_samples()
    locus_list = plink_file.get_loci()
    n_tot = len(sample_list)
    logger.info(f'Opening h5py file:{store_name}')
    with h5py.File(store_name, 'w', libver='latest') as store:
        store.attrs['n'] = len(sample_list)
        store.attrs['has_local_AF'] = False
        store.attrs['has_global_AF'] = False
        store.attrs['has_centering'] = False
        store.attrs['has_normalization'] = False
        potential_pheno_file = pfile + ".pheno"
        if os.path.isfile(pfile + ".pheno"):
            affection = np.loadtxt(potential_pheno_file, dtype=int, usecols=2)
        else:
            affection = [sample.affection for sample in sample_list]
        if len(np.unique(affection)) > 2:
            raise ValueError(
                "phenotype is not binary. We only support binary for now")
        write_or_replace(store, 'meta/Status', affection, np.int8)
        ids = [sample.iid for sample in sample_list]
        write_or_replace(store, 'meta/id', ids, 'S11')
        del ids, affection

        # Read Demographic file
        logger.info(f'Reading demographic file at {pfile}.ind')
        logger.info(f'File exists: {os.path.isfile(pfile + ".ind")}')
        with open(pfile + ".ind", 'r') as dem_f:
            dem = [(row.split("\t")[2]).encode("UTF8") for row in dem_f]
            write_or_replace(store, 'meta/regions', dem)

        # Read chromosome data
        current_chr = 1
        positions = []
        rsids = []
        all_counts = []
        current_group = store.require_group(str(current_chr))
        genotypes = np.zeros(n_tot, dtype=np.float32)
        for locus, row in zip(locus_list, plink_file):
            if locus.chromosome != current_chr:
                if len(positions) == 0:
                    del store[str(current_chr)]
                else:
                    write_or_replace(current_group, 'positions', positions,
                                     dtype=np.uint)
                    write_or_replace(current_group, 'rsids', rsids)
                    write_or_replace(current_group, 'counts', all_counts,
                                     np.uint32)
                    send_positions_to_server(positions, current_chr,
                                             client_config, env)
                    positions = []
                    # rsid = []
                    all_counts = []
                current_chr = locus.chromosome
                if current_chr == 23:
                    break
                current_group = store.require_group(str(current_chr))
            pos = str(locus.bp_position)
            counts, geno = process_plink_row(row, genotypes)
            # This should be a try except
            try:
                current_group.create_dataset(pos, data=geno)
            except Exception:
                logger.error(
                    f"Cannot write position: chr{locus.chromosome} {pos}")
            rsids.append(locus.name.encode('utf8'))
            positions.append(pos)
            all_counts.append(counts)
        if locus.chromosome != 23:
            write_or_replace(current_group, 'positions', positions, np.uint32)
            write_or_replace(current_group, 'rsids', rsids)
            write_or_replace(current_group, 'counts', all_counts, np.uint32)
            send_positions_to_server(positions, current_chr, client_config, env)
    plink_file.close()
    logger.info('Finished writing plink to hdf5.')
def setUp(self):
    pf = plinkfile.open("./data/wgas")
    self.row = next(pf)
    pf.close()
def test_get_samples(self):
    pf = plinkfile.open("./data/wgas")
    self.assertEqual(len(pf.get_samples()), 90)
    else:
        if str(sampleId) in reps:
            return str(sampleId)
        else:
            # if not sampleId.startswith("HGDP"):
            #     pdb.set_trace()
            return ""


droppingRelatives = False
basename = 'uae_hgdp1LD'
wdir = "/research/gutsybugs/KUMI/Data/"
hetfile = "%s.het" % basename  # to be calculated
plink_file = plinkfile.open(basename)
dmfile = "%s.mdist" % basename  # "mergedQCLD.mdist"  # "merged_1ibs.mdist"  # "merged.dist"
dmfileIDs = dmfile + ".id"

# Loading distance matrix, produced by plink, putting ids to dataframe
ids = pd.read_csv(dmfileIDs, delimiter='\t', header=None)[1]
dm = pd.read_csv(dmfile, header=None, delimiter='\t')
het = pd.read_csv(hetfile, delimiter='\s+')
sampleInfo = pd.read_csv("hgdp/HGDPid_populations.csv", sep=',', index_col='Id')
fam = pd.read_csv('%s.fam' % basename, sep=' ', header=None)
# reps = map(str, set(pd.read_csv('100medoids.txt', header=None)[0]))
reps = '10187 12742 13120 13076 10651 10347 10215 10725 10926 12599'.split()
from plinkio import plinkfile, cplinkio
import collections
import numpy as np
import pandas
from prettytable import PrettyTable
import math
import scipy.stats as stats
from operator import itemgetter
import utils

data_dir = utils.data_dir

# Initialize plink library and read phenotype file
infile = "{}/2_Exome_36K_Ws".format(data_dir)
plink_file = plinkfile.open(infile)
if not plink_file.one_locus_per_row():
    print("This script requires that snps are rows and samples columns.")
    exit(1)

sample_list = plink_file.get_samples()
locus_list = plink_file.get_loci()

# Phenotype file
phenotypefile = "../data/pheno_Exome_36k_MCC_Ws_MIN2.txt"
# phenotypefile = "../data/pheno_autism_exclutions2.txt"
df_full_pheno = pandas.read_csv(phenotypefile, sep='\t')
def test_get_loci(self):
    pf = plinkfile.open("./data/wgas")
    self.assertEqual(len(pf.get_loci()), 228694)
def test_open(self):
    pf = plinkfile.open("./data/wgas")
    self.assertNotEqual(pf, None)
def test_fail_open(self):
    with self.assertRaises(IOError):
        plinkfile.open("/")
from plinkio import plinkfile
import pandas as pd
import numpy as np
import MySQLdb
from MySQLdb.cursors import DictCursor
from scipy.spatial.distance import squareform
import scipy.cluster.hierarchy as sch
import pylab

# filename = '/research/gutsybugs/Software/Plink/Tutorial/hapmap1'
wdir = "/research/gutsybugs/KUMI/Data/"
filename = '%s/mergedQC' % wdir
dmfile = "%s.dist" % filename
hetfile = "%s.het" % filename

plink_file = plinkfile.open(filename)
# plink_file = plinkfile.open('/research/gutsybugs/KUMI/Data/mergedQC')  # TAKES LOOOOOONG!
sample_list = plink_file.get_samples()
locus_list = plink_file.get_loci()

makeSampleID = lambda sample: "%s_%s" % (sample.fid, sample.iid)
sampleIDs = [makeSampleID(sample) for sample in sample_list]

ntdict = {'N': np.NaN, 'A': 1, 'C': 2, 'G': 3, 'T': 4}
if filename.endswith('hapmap1') or filename.endswith('mergedQC'):
    snpFct = lambda x: x
else:
    snpFct = lambda x: ntdict[x]
makerow = lambda row, locus: [snpFct(snp) for snp in row] + [
    locus.chromosome, locus.name, locus.position, locus.bp_position]
def write_records(prefix, phenotype_file, nfolds=5, phenotype_idcol=0,
                  phenotype_col=1, phenotype_categorical=True,
                  save_tfrecords=True, save_npy=False, num_class=None):
    assert save_tfrecords or save_npy, 'Either TFRecords or NPY must be specified'
    create_diet_dir(prefix)

    # Read plink files
    Xt_plink = plinkfile.open(prefix)
    num_snps = len(Xt_plink.get_loci())
    num_ind = len(Xt_plink.get_samples())

    # Read sample ids from the .fam file
    fam_ids = np.array([s.iid for s in Xt_plink.get_samples()])
    pheno = pd.read_csv(phenotype_file, sep=None, engine='python')
    assert len(fam_ids) == pheno.shape[0], "Number of records in .fam file "\
        "and phenotype file do not match."
    assert np.all(fam_ids == np.array(pheno.iloc[:, phenotype_idcol].to_numpy())),\
        "IDs of .fam file and phenotype file do not match"

    pheno_list = pheno.iloc[:, phenotype_col]
    if phenotype_categorical:
        pheno_list_cat = pheno_list.astype('category').cat
        pheno_list_values = pheno_list_cat.categories.values
        pheno_map = pd.DataFrame({'Phenotype': pheno_list_values,
                                  'Codes': range(len(pheno_list_values))},
                                 columns=('Phenotype', 'Codes'))
        pheno_map.to_csv(_templ['phenomap'].format(pref=prefix), sep='\t',
                         index=False)
        labels = pheno_list_cat.codes.astype(np.uint8)
        num_class = num_class or len(set(labels))
    else:
        # TODO: Test that
        labels = pheno_list.to_numpy()

    # Prepare indices for k-fold cv and train/valid/test split
    cv_indices = []
    for cv_trainval, cv_test in KFold(n_splits=nfolds, shuffle=True,
                                      random_state=42).split(range(num_ind)):
        cv_train, cv_val = train_test_split(cv_trainval,
                                            test_size=1 / (nfolds - 1))
        cv_indices.append((cv_train, cv_val, cv_test))

    # Save metadata as json
    with open(_templ['metadata'].format(pref=prefix), 'w') as f:
        json.dump({'num_snp': num_snps,
                   'num_ind': num_ind,
                   'phenotype_categorical': phenotype_categorical,
                   'nfolds': nfolds,
                   'num_ind_per_fold': [(len(x), len(y), len(z))
                                        for x, y, z in cv_indices],
                   'num_class': num_class}, f)

    # Transpose bed file to get X matrix
    trans_filename = _templ['plinktrans'].format(pref=prefix)

    # Produces transposed BED file
    print('Transposing plink file...')
    assert Xt_plink.transpose(trans_filename), 'Transpose failed'

    # Open transposed file and iterate over records
    X_plink = plinkfile.open(trans_filename)
    assert not X_plink.one_locus_per_row(), 'PLINK file should be transposed'
    assert len(labels) == num_ind, 'Number of labels is not equal to num individuals'

    if save_tfrecords:
        wr = lambda i, t: tf.python_io.TFRecordWriter(
            _templ['fold'].format(pref=prefix, k=i, set=t))
        tf_writers = [{'train': wr(i + 1, 'train'),
                       'valid': wr(i + 1, 'valid'),
                       'test': wr(i + 1, 'test')} for i in range(nfolds)]
        tf_writer_all = tf.python_io.TFRecordWriter(
            _templ['data'].format(pref=prefix))
    if save_npy:
        X = np.zeros((num_ind, num_snps), np.int8)

    # Write k-fold train/valid/test splits
    for i, (row, label) in enumerate(zip(X_plink, labels)):  # iterates over individuals
        if save_tfrecords:
            # Save TFRecords
            example = tf.train.Example(features=tf.train.Features(feature={
                'genotype': tf.train.Feature(int64_list=_int_feature(list(row))),
                'label': tf.train.Feature(int64_list=_int_feature([int(label)]))}))
            for fold, (train_idx, valid_idx, test_idx) in zip(range(nfolds),
                                                              cv_indices):
                serialized_example = example.SerializeToString()
                if i in train_idx:
                    tf_writers[fold]['train'].write(serialized_example)
                elif i in valid_idx:
                    tf_writers[fold]['valid'].write(serialized_example)
                elif i in test_idx:
                    tf_writers[fold]['test'].write(serialized_example)
                else:
                    raise ValueError('Not valid index')
            tf_writer_all.write(serialized_example)
        if save_npy:
            X[i, :] = list(row)
        if i % 100 == 0:
            print('Writing genotypes... {:.2f}% completed'.format(
                (i / num_ind) * 100), end='\r')
            sys.stdout.flush()

    # Save fold as npy if requested
    if save_npy:
        for i, (train_idx, valid_idx, test_idx) in zip(range(nfolds), cv_indices):
            fold_filename = _templ['npy_fold'].format(pref=prefix, k=i + 1, set='train')
            np.save(fold_filename, X[train_idx, ])
            fold_filename = _templ['npy_fold'].format(pref=prefix, k=i + 1, set='valid')
            np.save(fold_filename, X[valid_idx, ])
            fold_filename = _templ['npy_fold'].format(pref=prefix, k=i + 1, set='test')
            np.save(fold_filename, X[test_idx, ])
        np.save(_templ['npy'].format(pref=prefix), X)
    print('\nDone')

    if save_tfrecords:
        for fold in range(nfolds):
            tf_writers[fold]['train'].close()
            tf_writers[fold]['valid'].close()
            tf_writers[fold]['test'].close()
        tf_writer_all.close()

    Xt = np.zeros([num_snps, num_ind], np.int8)
    for i, row in enumerate(Xt_plink):  # iterates over snps
        Xt[i, :] = row
        if i % 1000 == 0:
            print('Writing X transpose matrix... {:.2f}% completed'.format(
                (i / num_snps) * 100), end='\r')
            sys.stdout.flush()
    print('\nDone')

    # Save X^T as numpy arrays
    np.save(_templ['x_t'].format(pref=prefix), Xt)
# load libraries for plotting
if not args.no_plot:
    import matplotlib.pyplot as plt
    from mpl_toolkits.basemap import Basemap
    from scipy.ndimage import label

# importing data
sampleLoc = np.loadtxt(args.coords, delimiter=args.coord_sep)
if np.shape(sampleLoc)[1] != 2:
    raise IndexError('{} has {} separate columns, but ought to be 2 '
                     '(check COORDS_SEP?)'.format(args.coords,
                                                  np.shape(sampleLoc)[1]))

if args.plink:
    sampleData = plinkfile.open(args.plink)
    print('Using data file (PLINK): {}'.format(args.plink), file=sys.stderr)
    printEveryNthLine = 10000
else:
    sampleData = np.transpose(
        np.loadtxt(args.non_genetic, delimiter=args.non_gen_sep))
    if np.shape(sampleData)[1] != np.shape(sampleLoc)[0]:
        raise IndexError(
            'Make sure that {} ({} rows) has one row per position '
            'in {} ({} positions)!'.format(args.non_genetic,
                                           np.shape(sampleData)[1],
                                           args.coords,
                                           np.shape(sampleLoc)[0]))
    print('Using data file: {}'.format(args.non_genetic), file=sys.stderr)
    printEveryNthLine = int(np.shape(sampleData)[0] / 10)
def load_data(path, prefix, phenotype_file):
    """
    Loads the prefixed files: prefix.bed, prefix.fam, ... and saves,
    but it may make more sense to save in the load_1000 function above.

    Args:
        prefix: path with last elem as prefix of .bed, .fam, ...

    Returns:
        genomic_data: numpy array
        label_data: numpy array of labels
    """
    prefix = os.path.join(path, prefix)
    print("loading plink files...")
    Xt_plink = plinkfile.open(prefix)
    num_snps = len(Xt_plink.get_loci())
    num_ind = len(Xt_plink.get_samples())
    num_class = 26
    print("loaded.")

    # save metafile for info
    print("writing meta file...")
    with open(os.path.join(path, "_metadata.json"), 'w') as f:
        json.dump({'num_snps': num_snps,
                   'num_ind': num_ind,
                   'num_class': num_class}, f)
    print("written.")

    # have to transpose the plinkfile to get X
    trans_filename = os.path.join(path, "trans")
    print("transposing plink file...")
    assert Xt_plink.transpose(trans_filename), "transpose failed"
    print("done.")

    # Now open the transpose as X
    print("make genomic_data matrix...")
    X_plink = plinkfile.open(trans_filename)
    assert not X_plink.one_locus_per_row(), "Plink file should be transposed"

    # save the data as a npy file:
    genomic_data = np.zeros((num_ind, num_snps), np.int8)
    for i, row in enumerate(X_plink):
        genomic_data[i, :] = list(row)
    print("made.")

    # lets save labels
    print("loading labels and making one-hot rep...")
    pheno = pd.read_csv(os.path.join(path, phenotype_file), sep=None,
                        engine="python")
    pheno_list = pheno.iloc[:, 1]
    pheno_list_cat = pheno_list.astype('category').cat
    pheno_list_values = pheno_list_cat.categories.values
    pheno_map = pd.DataFrame({'Phenotype': pheno_list_values,
                              'Codes': range(len(pheno_list_values))},
                             columns=('Phenotype', 'Codes'))
    pheno_map.to_csv(os.path.join(path, "pheno_map"))

    # okay get labels now that we have a map
    labels = pheno_list_cat.codes.astype(np.uint8)
    nb_class = len(pheno_list_values)
    targets = np.array(labels).reshape(-1)
    # makes one hot matrix for label data: class 1 = [1, 0, ..., 0]
    label_data = np.eye(nb_class)[targets]
    print("just made the one-hot matrix for labels")
    return genomic_data, label_data
def import_genetics(path_to_plink_files):
    """
    Import genetics data.

    Note about how to read the bed/bim/fam files:

    ## affection
    - unknown: 0 (fam) -> -9 (python)
    - unaffected: 1 (fam) -> 0 (python)
    - affected: 2 (fam) -> 1 (python)

    ## sex
    - male: 1 (fam) -> 0 (python)
    - female: 2 (fam) -> 1 (python)

    ## genotype
    - genotype 0: code 00 Homozygote "0"/"0"
    - genotype 1: code 01 Heterozygote
    - genotype 2: code 11 Homozygote "1"/"1"
    - genotype 3: unknown

    -------------------------------------------------------
    More details are available on https://web.njit.edu/~zhiwei/GAS_101.pdf
    and on https://github.com/mfranberg/libplinkio
    and on http://www.gwaspi.org/?page_id=671

    Args:
        path_to_plink_files: path to a folder containing a bed, bim and fam file

    Returns:
        snp_list: list of snps
        patients: list of patients
        matrix: matrix with patient as line and snp as column
    """
    from plinkio import plinkfile
    import numpy as np

    plink_file = plinkfile.open(path_to_plink_files)
    if not plink_file.one_locus_per_row():
        print("This script requires that snps are rows and samples columns.")
        exit(1)

    sample_list = plink_file.get_samples()
    locus_list = plink_file.get_loci()

    """
    for sample in sample_list:
        print(sample.fid, sample.iid, sample.father_iid, sample.mother_iid,
              sample.sex, sample.affection)
    for locus in locus_list:
        print(locus.chromosome, locus.name, locus.position, locus.bp_position,
              locus.allele1, locus.allele2)
    """

    num_snp = len(locus_list)
    num_patient = len(sample_list)

    matrix_patient_snp = np.zeros((num_snp, num_patient))
    patient_list = np.zeros(num_patient, dtype='|S30')
    snp_list = np.zeros(num_snp, dtype='|S30')

    for i, row, locus in zip(range(num_snp), plink_file, locus_list):
        matrix_patient_snp[i] = row
        snp_list[i] = locus.name
    matrix_patient_snp = matrix_patient_snp.T

    for i, sample in zip(range(num_patient), sample_list):
        patient_list[i] = sample.iid

    return snp_list, patient_list, matrix_patient_snp
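# A minimal usage sketch for import_genetics above (not from the original
# source). "data/cohort" is a hypothetical PLINK prefix, i.e. it expects
# data/cohort.bed, data/cohort.bim and data/cohort.fam to exist.
snps, patients, geno = import_genetics("data/cohort")
print(len(patients), "patients x", len(snps), "snps")
# geno is patients x snps, coded 0/1/2 with 3 marking missing calls
print(geno[:2, :5])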
def case_control_split(to_split, num_case_holders, num_control_holders,
                       split_prefix, seed=1234, pheno_file=None, create=True):
    """Distributes the rows of the h5py dataset at to_split into
    num_case_holders, num_control_holders groups of approximately equal size,
    adding up to the total number of individuals.

    This function copies by shamelessly iterating over everything, so it can
    be very slow."""
    # Figure out how many cases and controls there are
    if pheno_file is None:
        with h5py.File(to_split, 'r') as to_split_fp:
            status = to_split_fp['meta/Status'][:]
            num_cases = np.sum(status)
            controls = status == 0
            num_controls = np.sum(controls)
            control_rows = np.where(controls)
            case_rows = np.where(~controls)
            del controls
    else:  # It must be a plink file
        plink_file = plinkfile.open(pheno_file)
        sample_list = plink_file.get_samples()
        status = np.array([i.affection for i in sample_list])
        ids = np.array([i.iid for i in sample_list])
        case_rows = ids[status == 1]
        control_rows = ids[status == 0]
        num_cases = len(case_rows)
        num_controls = len(control_rows)
        del status, ids

    if num_case_holders > 1:
        case_per_silo = [num_cases // num_case_holders] * (num_case_holders - 1)
        case_per_silo.append(num_cases - sum(case_per_silo))
    else:
        case_per_silo = [num_cases]
    to_create = list(zip(case_per_silo, ['case'] * num_case_holders))

    if num_control_holders > 1:
        control_per_silo = [num_controls // num_control_holders] * (num_control_holders - 1)
        control_per_silo.append(num_controls - sum(control_per_silo))
    else:
        control_per_silo = [num_controls]
    to_create += list(zip(control_per_silo, ['control'] * num_control_holders))
    to_create = set(to_create)
    names = []

    def group_copy(name, node, rows, fp):
        dtype = node.dtype
        value = node[...]
        fp.require_dataset(name, data=value[rows], shape=(len(rows), ),
                           dtype=dtype)

    i = 0
    with h5py.File(to_split, 'r') as to_split_fp:
        if pheno_file is not None:
            ids = to_split_fp["meta/id"][:]
            case_rows = np.where([ind in case_rows for ind in ids])[0]
            control_rows = np.where([ind in control_rows for ind in ids])[0]
        np.random.seed(seed)
        case_rows = np.random.permutation(case_rows)
        control_rows = np.random.permutation(control_rows)
        while len(to_create):
            count, status = to_create.pop()
            split_name = split_prefix + status + str(i) + '.h5py'
            names.append(split_name)
            if not create:
                i += 1
                continue
            logging.info("-Constructing: " + split_name)
            if status == 'case':
                chosen_rows = case_rows[:count]
                case_rows = case_rows[count:]
            else:
                chosen_rows = control_rows[:count]
                control_rows = control_rows[count:]
            with h5py.File(split_name, 'w') as copy_to_fp:
                for key in to_split_fp.keys():
                    dset_to_copy = to_split_fp[key]
                    dset_to_copyto = copy_to_fp.require_group(key)
                    copier = partial(group_copy, rows=chosen_rows,
                                     fp=dset_to_copyto)
                    dset_to_copy.visititems(copier)
            i += 1
    return names