def test_locate_unlinked(self):
    """Check ``allel.locate_unlinked`` masks on small hand-written inputs.

    Each case feeds a nested list to ``allel.locate_unlinked`` with
    ``threshold=.5`` and compares the returned mask to the expected list via
    ``aeq``.  The last check repeats one case with a ``bcolz.carray`` input
    and an explicit ``blen``.
    """
    two_rows = [[0, 1, 2],
                [0, 1, 2]]
    four_rows = [[0, 1, 1, 2],
                 [0, 1, 1, 2],
                 [1, 1, 0, 2],
                 [1, 1, 0, 2]]
    five_rows = [[0, 1, 1, 2],
                 [0, 1, 1, 2],
                 [0, 1, 1, 2],
                 [1, 1, 0, 2],
                 [1, 1, 0, 2]]

    # (input rows, window arguments, expected mask)
    cases = [
        (two_rows, dict(size=2, step=2), [True, False]),
        (four_rows, dict(size=2, step=1), [True, False, True, False]),
        (five_rows, dict(size=2, step=1), [True, False, True, True, False]),
        (five_rows, dict(size=3, step=1), [True, False, False, True, False]),
    ]
    for rows, window, expected in cases:
        observed = allel.locate_unlinked(rows, threshold=.5, **window)
        aeq(expected, observed)

    # test with bcolz carray
    import bcolz
    chunked_rows = bcolz.carray(five_rows, chunklen=2)
    observed = allel.locate_unlinked(chunked_rows, size=2, step=1,
                                     threshold=.5, blen=2)
    aeq([True, False, True, True, False], observed)
def ld_prune(gn, pos, size=500, step=200, threshold=.1, n_iter=5):
    """Repeatedly drop linked sites, keeping positions aligned with the data.

    Parameters
    ----------
    gn : array-like
        Per-variant data with variants on axis 0.  Must expose ``shape`` and
        ``compress(mask, axis=0)``.  Presumably alternate-allele counts as
        accepted by ``allel.locate_unlinked`` -- confirm against callers.
    pos : array-like
        One entry per row of `gn`; must support boolean-mask indexing.
    size : int, optional
        Passed to ``allel.locate_unlinked`` as ``size``. The default is 500.
    step : int, optional
        Passed to ``allel.locate_unlinked`` as ``step``. The default is 200.
    threshold : float, optional
        Passed to ``allel.locate_unlinked`` as ``threshold``. The default
        is .1.
    n_iter : int, optional
        Number of pruning passes to run. The default is 5.

    Returns
    -------
    allel.SortedIndex
        The entries of `pos` that survived every pass.
    gn
        The rows of the input `gn` that survived every pass.

    Notes
    -----
    One progress line is printed per pass.
    """
    for i in range(n_iter):
        keep = allel.locate_unlinked(gn, size=size, step=step,
                                     threshold=threshold)
        n = np.count_nonzero(keep)
        n_remove = gn.shape[0] - n
        print(f"iteration {i+1} retaining {n} removing {n_remove} variants")
        # Apply the same mask to both so rows and positions stay in step.
        gn, pos = gn.compress(keep, axis=0), pos[keep]
    return allel.SortedIndex(pos), gn
def work(self):
    """Write pseudo-haplotypes of unlinked sites to a tab-separated file.

    Reads genotypes and sample names from the HDF5 file at
    ``self.input()['syn'].path``, keeps the sites flagged by
    ``allel.locate_unlinked`` (``threshold=self.max_linkage``), converts them
    to haplotypes and writes one row per haplotype, prefixed by its sample
    name, to ``self.output().path`` through luigi's ``atomic_file``.
    """
    import numpy as np
    import allel
    import h5py
    import pandas as pd
    from luigi.file import atomic_file

    # Opens the SynSNPS file, which contains only biallelic synonymous sites.
    # The context manager releases the HDF5 handle even if a later step
    # raises; everything read from the file is materialised inside the block.
    with h5py.File(self.input()['syn'].path, mode='r') as callset:
        genotypes = allel.GenotypeChunkedArray(callset['calldata']['genotype'])
        samples = np.array([x.decode() for x in callset['samples']])

        # Selects site with r**2 linkage < max_linkage
        n_ref = genotypes.to_n_ref(fill=-9)
        unlinked = allel.locate_unlinked(n_ref, threshold=self.max_linkage)[:]

        # Create pseudohaplotypes (0=ref, 1=alt, -1=missing)
        hap_matrix = genotypes[:][unlinked].to_haplotypes()

    # Double up the sample names: each sample contributes two haplotype rows
    samples_dup = np.array(list(zip(samples, samples))).reshape(-1, 1)
    hap_df = pd.DataFrame(np.hstack((samples_dup, hap_matrix.T)))

    # Atomic write TSV file output
    af = atomic_file(self.output().path)
    hap_df.to_csv(af.tmp_path, sep='\t', index=False)
    af.move_to_final_destination()
def ld_prune(gn, size, step, threshold=.1, n_iter=1):
    """Run `n_iter` pruning passes over `gn` and return the surviving rows.

    Each pass asks ``al.locate_unlinked`` (``al`` is presumably scikit-allel
    imported under that alias) for a mask with the given `size`, `step` and
    `threshold`, prints how many rows are kept and dropped, and compresses
    `gn` along axis 0 with that mask.
    """
    for round_no in range(1, n_iter + 1):
        keep_mask = al.locate_unlinked(gn, size=size, step=step,
                                       threshold=threshold)
        n_kept = np.count_nonzero(keep_mask)
        n_dropped = gn.shape[0] - n_kept
        print('iteration', round_no, 'retaining', n_kept,
              'removing', n_dropped, 'variants')
        gn = gn.compress(keep_mask, axis=0)
    return gn
def ld_prune(gn, size, step, threshold=.1, n_iter=1):
    """Prune `gn` with ``allel.locate_unlinked`` for `n_iter` passes.

    via http://alimanfoo.github.io/2015/09/28/fast-pca.html

    Every pass prints a progress line and keeps only the rows of `gn`
    (axis 0) selected by the mask; the pruned array is returned.
    """
    # The same window settings are reused on every pass.
    window = dict(size=size, step=step, threshold=threshold)
    for pass_index in range(n_iter):
        unlinked = allel.locate_unlinked(gn, **window)
        kept = np.count_nonzero(unlinked)
        print('iteration', pass_index + 1, 'retaining', kept,
              'removing', gn.shape[0] - kept, 'variants')
        gn = gn.compress(unlinked, axis=0)
    return gn
def test_locate_unlinked(self):
    """Check ``allel.locate_unlinked`` masks on small hand-written inputs.

    Every case calls ``allel.locate_unlinked`` with ``threshold=.5`` on a
    nested list and compares the result with the expected mask via ``aeq``.
    """
    pair = [[0, 1, 2],
            [0, 1, 2]]
    quad = [[0, 1, 1, 2],
            [0, 1, 1, 2],
            [1, 1, 0, 2],
            [1, 1, 0, 2]]
    quint = [[0, 1, 1, 2],
             [0, 1, 1, 2],
             [0, 1, 1, 2],
             [1, 1, 0, 2],
             [1, 1, 0, 2]]

    # (input rows, size, step, expected mask)
    scenarios = (
        (pair, 2, 2, [True, False]),
        (quad, 2, 1, [True, False, True, False]),
        (quint, 2, 1, [True, False, True, True, False]),
        (quint, 3, 1, [True, False, False, True, False]),
    )
    for rows, size, step, expected in scenarios:
        result = allel.locate_unlinked(rows, size=size, step=step,
                                       threshold=.5)
        aeq(expected, result)
def ld_prune(gn, size, step, threshold=.1, n_iter=1):
    """
    LD-prune `gn` (adapted from Alistair Miles' blog).

    Runs `n_iter` passes.  Each pass obtains a mask from
    ``allel.locate_unlinked`` using `size`, `step` and `threshold`, reports
    the kept/removed counts on stdout, and keeps only the masked rows
    (axis 0).  Returns the pruned array.
    """
    for i in range(n_iter):
        n_before = gn.shape[0]
        mask = allel.locate_unlinked(gn, size=size, step=step,
                                     threshold=threshold)
        n_after = np.count_nonzero(mask)
        print('iteration', i + 1, 'retaining', n_after,
              'removing', n_before - n_after, 'variants')
        gn = gn.compress(mask, axis=0)
    return gn
def _pca_ld_prune(gn, size, step, threshold=.1, n_iter=1):
    """Prune `gn` for `n_iter` passes using ``allel.locate_unlinked``.

    The ``blen`` argument handed to ``allel.locate_unlinked`` is fixed at ten
    times `size`.  A progress line is printed for every pass, and the rows of
    `gn` (axis 0) selected by the final masks are returned.
    """
    blen = size * 10
    progress = ('[Exec][PCA][LD Prune] Iteration {}/{}: Retaining {} and removing {} variants.')
    for iteration in range(1, n_iter + 1):
        mask = allel.locate_unlinked(gn, size=size, step=step,
                                     threshold=threshold, blen=blen)
        kept = np.count_nonzero(mask)
        print(progress.format(iteration, n_iter, kept, gn.shape[0] - kept))
        gn = gn.compress(mask, axis=0)
    return gn
def ld_prune(gn, size, step, threshold=.2, n_iter=1, blen=10000):
    """LD-prune a genotype array and report which input rows survived.

    Parameters
    ----------
    gn : genotype array
        Must provide ``shape``, ``to_n_alt()`` and ``compress(mask, axis=0)``
        (e.g. an ``allel.GenotypeArray``), with variants on axis 0.
    size, step, threshold, blen
        Forwarded unchanged to ``allel.locate_unlinked``.
    n_iter : int, optional
        Number of pruning passes.

    Returns
    -------
    tuple
        ``(pruned_gn, mask)``.  ``mask`` is a boolean array with one entry
        per row of the *input* `gn`, True for rows that survived all passes.
        For a single pass this is exactly the mask returned by
        ``allel.locate_unlinked``.

    Notes
    -----
    The alternate-allele counts are recomputed from the current `gn` on every
    pass.  Computing them once up front leaves them at the original length
    while `gn` shrinks, so from the second pass on the mask would no longer
    line up with the rows being compressed.
    """
    n_variants = gn.shape[0]
    # Original row index of every variant that is still retained.
    kept_idx = np.arange(n_variants)
    for i in range(n_iter):
        gn_alt = gn.to_n_alt()
        loc_unlinked = allel.locate_unlinked(gn_alt, size=size, step=step,
                                             threshold=threshold, blen=blen)
        n = np.count_nonzero(loc_unlinked)
        n_remove = gn.shape[0] - n
        print('iteration', i + 1, 'retaining', n, 'removing', n_remove, 'variants')
        gn = gn.compress(loc_unlinked, axis=0)
        kept_idx = kept_idx[np.asarray(loc_unlinked, dtype=bool)]
    # Fold the per-pass masks into one mask over the input rows; with
    # n_iter == 0 nothing is pruned and every row is reported as kept.
    mask = np.zeros(n_variants, dtype=bool)
    mask[kept_idx] = True
    return (gn, mask)
def test_vs_skallel(args):
    """Compare this library's LD-pruning drop list with scikit-allel's.

    `args` unpacks to ``(x, size, step, threshold, chunks)``.  `x` is stored
    as the ``dosage`` variable of a simulated dataset, windowed by variant,
    and pruned via ``ld_matrix`` + ``maximal_independent_set``.  The sorted
    indices to drop must equal the indices where ``allel.locate_unlinked``
    returns False for the same `x`, `size`, `step` and `threshold`.
    """
    x, size, step, threshold, chunks = args

    ds = simulate_genotype_call_dataset(n_variant=x.shape[0],
                                        n_sample=x.shape[1])
    dosage = da.asarray(x).rechunk({0: chunks})
    ds["dosage"] = (["variants", "samples"], dosage)
    ds = window_by_variant(ds, size=size, step=step)

    ldm = ld_matrix(ds, threshold=threshold)
    # No (i, j) pair may be reported more than once.
    pair_is_repeated = ldm.compute().duplicated(subset=["i", "j"])
    assert not pair_is_repeated.any()

    drop_ds = maximal_independent_set(ldm)
    dropped_here = np.sort(drop_ds.ld_prune_index_to_drop.data)

    unlinked = allel.locate_unlinked(x, size=size, step=step,
                                     threshold=threshold)
    linked_rows = np.argwhere(~unlinked).squeeze(axis=1)
    dropped_ska = np.sort(linked_rows)

    npt.assert_equal(dropped_ska, dropped_here)
def ld_prune(gn, size, step, threshold, n_iter):
    """
    Prune `gn` for `n_iter` passes, keeping only the rows that
    ``allel.locate_unlinked`` selects.

    Each pass logs how many rows are retained and removed; after the last
    pass the parameters used are logged once.  Returns ``gn[:]`` of the
    pruned array.
    """
    for i in range(n_iter):
        keep = allel.locate_unlinked(
            gn, size=size, step=step, threshold=threshold
        )
        n = np.count_nonzero(keep)
        n_remove = gn.shape[0] - n
        logging.info(
            f"Pruning iteration {i+1} retaining {n} removing {n_remove} variants"
        )
        gn = gn.compress(keep, axis=0)

    logging.info(
        f"Applied pruning with paramaters, size : {size}, step : {step}, threshold : {threshold}, n_iter : {n_iter}"
    )
    return gn[:]
def prune_by_ld(number_of_alternate_alleles, window_size=1000, step_size=100, r2=0.2):
    '''Prune an array of alternate-allele counts with ``allel.locate_unlinked``.

    `window_size`, `step_size` and `r2` are handed to
    ``allel.locate_unlinked`` as its size, step and threshold.  Returns a
    tuple ``(pruned, mask)``: the input indexed by the Boolean mask, and the
    mask itself.

    Raises ValueError if any of the three numeric parameters is not greater
    than zero.  Emits a warning when the pruned array is not shorter than
    the input.'''
    if any(not value > 0 for value in (window_size, step_size, r2)):
        raise ValueError("All numeric parameters must be positive")

    pruned_bool = allel.locate_unlinked(
        number_of_alternate_alleles,
        size=window_size,
        step=step_size,
        threshold=r2,
    )
    pruned = number_of_alternate_alleles[pruned_bool]

    if len(pruned) >= len(number_of_alternate_alleles):
        warnings.warn("Warning, no pruning occurred!")
    return pruned, pruned_bool
def ldthin(geno, positions, method, rsize=50000, mac=1, size=100, step=20,
           thresh=.1, iters=1):
    """Select sites for PCA and thin them with ``allel.locate_unlinked``.

    Sites are first filtered on allele counts, then either randomly
    subsampled with ``take`` (when the string ``'random'`` occurs in
    `method`) or all kept with ``compress``, and finally pruned for `iters`
    passes.

    Parameters
    ----------
    geno : genotype array
        Must provide ``count_alleles``, ``take`` and ``compress``.
    positions : array-like
        Indexed with the site-selection mask in the non-random branch.
    method : str or container
        The random branch is taken when ``'random' in method``.
    rsize : int
        Number of sites to draw in the random branch; capped at the number of
        sites that pass the allele-count filter.
    mac : int
        A site is kept only if the smaller count among its first two alleles
        exceeds this value.
    size, step, thresh
        Forwarded to ``allel.locate_unlinked`` as size, step and threshold.
    iters : int
        Number of pruning passes.

    Returns
    -------
    tuple
        ``(gn, allel.SortedIndex(pos))``.
    """
    ac = geno.count_alleles()
    # Keep sites whose highest observed allele index is 1 and where both of
    # the first two alleles have a count above `mac`.
    pca_selection = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > mac)
    print("Available site for PCA: {}".format(np.count_nonzero(pca_selection)))
    if 'random' in method:
        indices = np.nonzero(pca_selection)[0]
        # Sampling without replacement cannot draw more items than exist;
        # cap the request instead of letting np.random.choice raise.
        n_draw = min(rsize, indices.size)
        indices_ds = np.random.choice(indices, size=n_draw, replace=False)
        indices_ds.sort()
        genotypes_pca = geno.take(indices_ds, axis=0)
        # sites with missing data can return error
        gn = genotypes_pca.to_n_alt()[:]
        # NOTE(review): this branch tracks row indices, whereas the other
        # branch tracks entries of `positions` -- confirm which one callers
        # expect before changing it.
        pos = indices_ds
        print("{} Random SNPs selected for PCA".format(gn.shape[0]))
    else:
        genotypes_pca = geno.compress(pca_selection, axis=0)
        gn = genotypes_pca.to_n_alt()
        pos = positions[pca_selection]
    for i in range(iters):
        loc_unlinked = allel.locate_unlinked(gn, size=size, step=step,
                                             threshold=thresh)
        n = np.count_nonzero(loc_unlinked)
        n_remove = gn.shape[0] - n
        print("iteration {} retaining {} removing {} variants".format(
            i + 1, n, n_remove))
        # Apply the same mask to both so rows and `pos` stay aligned.
        gn = gn.compress(loc_unlinked, axis=0)
        pos = pos[loc_unlinked]
    return (gn, allel.SortedIndex(pos))
def ld_prune(genotypes_012, pos=None, size=100, step=20, threshold=0.1, verbosity=0):
    """Run a single LD-pruning pass over `genotypes_012`.

    The mask from ``allel.locate_unlinked`` (called with `size`, `step` and
    `threshold`) selects the rows to keep along axis 0.  With
    ``verbosity > 0`` a summary line is printed.

    Returns the pruned array alone when `pos` is None; otherwise a tuple of
    the pruned array and `pos` indexed by the same mask.
    """
    keep = allel.locate_unlinked(genotypes_012, size=size, step=step,
                                 threshold=threshold)
    n_kept = np.count_nonzero(keep)
    pruned = genotypes_012.compress(keep, axis=0)

    if verbosity > 0:
        summary = "ld_prune: Retaining: {} out of {} variants"
        print(summary.format(n_kept, genotypes_012.shape[0]))

    if pos is None:
        return pruned
    return pruned, pos[keep]
def test_vs_skallel(args, scheduler):
    """Compare ``ld_prune``'s drop list with scikit-allel's for one input.

    `args` unpacks to ``(x, window, step, threshold)``.  The library result
    is sorted under the requested dask `scheduler`.  The reference is the
    set of indices where ``allel.locate_unlinked`` returns False, called
    with ``size=window + 1`` and a single block spanning all rows.
    """
    x, window, step, threshold = args

    numba_backend = dict(backend='numba')
    pruned = ld_prune(
        GenotypeCountDataset.create(x),
        window=window,
        step=step,
        threshold=threshold,
        unit='index',
        target_chunk_size=max(x.shape[0] // 2, 1),
        ld_matrix_kwargs=dict(backend='dask/numba'),
        axis_intervals_kwargs=numba_backend,
        mis_kwargs=numba_backend,
    )
    with dask.config.set(scheduler=scheduler):
        dropped_lib = np.sort(pruned.index_to_drop.data)

    unlinked = allel.locate_unlinked(x, size=window + 1, step=step,
                                     threshold=threshold, blen=x.shape[0])
    linked_rows = np.argwhere(~unlinked).squeeze(axis=1)
    dropped_ska = np.sort(linked_rows)

    npt.assert_equal(dropped_ska, dropped_lib)
import allel
import moments
import numpy as np

# load vcf and extract folded sfs; write to file
vcf_file = "/data3/vcf_files/allopatric_parental/intermediate_files/demographic_inf_parentals_rename.vcf"

# Sample columns of each population, named once so the selection made before
# and after filtering cannot drift apart.
PERSONATA_COLS = slice(0, 10)
ALBA_COLS = slice(10, 20)

# Only the genotype calls are used below, so skip parsing every other field.
f = allel.read_vcf(vcf_file, fields=['calldata/GT'])
gt = f['calldata/GT']
gt = allel.GenotypeArray(gt)

personata = gt[:, PERSONATA_COLS]
alba = gt[:, ALBA_COLS]

# Alternate-allele counts per call; missing calls are filled with -1.
gn_p = personata.to_n_alt(fill=-1)
gn_a = alba.to_n_alt(fill=-1)

# Keep a site only if it is flagged as unlinked in both populations.
p_unlinked = allel.locate_unlinked(gn_p)
a_unlinked = allel.locate_unlinked(gn_a)
pass_linkage = np.logical_and(p_unlinked, a_unlinked)
final = gt.compress(pass_linkage, axis=0)

personata2 = final[:, PERSONATA_COLS]
alba2 = final[:, ALBA_COLS]
pers_ac = personata2.count_alleles()
alba_ac = alba2.count_alleles()

# Joint folded SFS of the two populations, written in moments' format.
fsfs = allel.joint_sfs_folded(pers_ac, alba_ac)
m_fsfs = moments.Spectrum(fsfs)
m_fsfs.to_file("allopatric_parental_unlinked_sfs.txt")
import allel
import moments
import numpy as np

# load vcf and extract folded sfs; write to file
vcf_file = "syma.moments.bi.recode.vcf"

# Sample columns of each population, named once so the selection made before
# and after filtering cannot drift apart.
TORO_COLS = np.array([0, 1, 2, 7, 12, 13, 17, 18])
MEGA_COLS = np.array([3, 4, 5, 6, 8, 9, 10, 14, 15, 16])

# Only the genotype calls are used below, so skip parsing every other field.
f = allel.read_vcf(vcf_file, fields=['calldata/GT'])
gt = f['calldata/GT']
gt = allel.GenotypeArray(gt)

toro = gt[:, TORO_COLS]
mega = gt[:, MEGA_COLS]

# Alternate-allele counts per call; missing calls are filled with -1.
gn_t = toro.to_n_alt(fill=-1)
gn_m = mega.to_n_alt(fill=-1)

# Keep a site only if it is flagged as unlinked in both populations.
t_unlinked = allel.locate_unlinked(gn_t)
m_unlinked = allel.locate_unlinked(gn_m)
pass_linkage = np.logical_and(t_unlinked, m_unlinked)
final = gt.compress(pass_linkage, axis=0)

toro2 = final[:, TORO_COLS]
mega2 = final[:, MEGA_COLS]
toro_ac = toro2.count_alleles()
mega_ac = mega2.count_alleles()

# Joint folded SFS of the two populations, written in moments' format.
fsfs = allel.joint_sfs_folded(toro_ac, mega_ac)
m_fsfs = moments.Spectrum(fsfs)
m_fsfs.to_file("syma_unlinked_sfs.txt")