def test_locate_unlinked(self):
        """Check ``allel.locate_unlinked`` masks on small hand-built inputs.

        Each case is ``(genotype rows, keyword arguments, expected mask)``;
        the last part repeats one case with the rows held in a bcolz carray.
        """

        gn2 = [[0, 1, 2], [0, 1, 2]]
        gn4 = [[0, 1, 1, 2], [0, 1, 1, 2], [1, 1, 0, 2], [1, 1, 0, 2]]
        gn5 = [[0, 1, 1, 2], [0, 1, 1, 2], [0, 1, 1, 2], [1, 1, 0, 2],
               [1, 1, 0, 2]]

        cases = [
            (gn2, dict(size=2, step=2, threshold=.5),
             [True, False]),
            (gn4, dict(size=2, step=1, threshold=.5),
             [True, False, True, False]),
            (gn5, dict(size=2, step=1, threshold=.5),
             [True, False, True, True, False]),
            # same rows as the previous case, wider window
            (gn5, dict(size=3, step=1, threshold=.5),
             [True, False, False, True, False]),
        ]
        for rows, kwargs, expect in cases:
            actual = allel.locate_unlinked(rows, **kwargs)
            aeq(expect, actual)

        # test with bcolz carray
        import bcolz
        gnz = bcolz.carray(gn5, chunklen=2)
        expect = [True, False, True, True, False]
        actual = allel.locate_unlinked(gnz, size=2, step=1, threshold=.5,
                                       blen=2)
        aeq(expect, actual)
Exemple #2
0
def ld_prune(gn, pos, size=500, step=200, threshold=.1, n_iter=5):
    """Repeatedly drop the variants that ``allel.locate_unlinked`` rejects.

    Every round computes a boolean mask with ``allel.locate_unlinked``,
    prints how many variants are kept and removed, and then filters both
    ``gn`` and ``pos`` with that mask so the two stay aligned.

    Parameters
    ----------
    gn : array-like
        Handed to ``allel.locate_unlinked``; must provide ``shape`` and
        ``compress(mask, axis=0)``. Presumably alternate-allele counts per
        variant (rows) and sample (columns) -- confirm against the caller.
    pos : array-like
        One entry per row of ``gn``; must accept boolean-mask indexing.
    size : int, optional
        Forwarded as ``size`` to ``allel.locate_unlinked``. Default 500.
    step : int, optional
        Forwarded as ``step`` to ``allel.locate_unlinked``. Default 200.
    threshold : float, optional
        Forwarded as ``threshold`` to ``allel.locate_unlinked``. Default .1.
    n_iter : int, optional
        Number of pruning rounds. Default 5.

    Returns
    -------
    allel.SortedIndex
        The entries of ``pos`` that survived every round.
    gn
        ``gn`` restricted to the surviving rows.

    """
    for round_no in range(1, n_iter + 1):
        keep = allel.locate_unlinked(
            gn, size=size, step=step, threshold=threshold)
        n_keep = np.count_nonzero(keep)
        n_drop = gn.shape[0] - n_keep
        print(f"iteration {round_no} retaining {n_keep} removing {n_drop} variants")
        # filter genotypes and positions with the same mask
        gn, pos = gn.compress(keep, axis=0), pos[keep]

    return allel.SortedIndex(pos), gn
    def work(self):
        """Write a TSV of pseudo-haplotypes for the unlinked sites.

        Reads genotypes and sample names from the HDF5 file at
        ``self.input()['syn'].path``, keeps the sites selected by
        ``allel.locate_unlinked`` with ``threshold=self.max_linkage``,
        converts them to haplotypes and writes one row per haplotype
        (sample name followed by the calls) to ``self.output().path``.
        """

        import numpy as np
        import allel
        import h5py
        import pandas as pd
        from luigi.file import atomic_file

        # Opens the SynSNPS file, which contains only biallelic synonymous sites.
        # The context manager guarantees the HDF5 handle is released; everything
        # that reads from the file must therefore happen inside this block.
        with h5py.File(self.input()['syn'].path, mode='r') as callset:
            genotypes = allel.GenotypeChunkedArray(
                callset['calldata']['genotype'])
            samples = np.array([x.decode() for x in callset['samples']])

            # Selects site with r**2 linkage < max_linkage
            n_ref = genotypes.to_n_ref(fill=-9)
            unlinked = allel.locate_unlinked(
                n_ref, threshold=self.max_linkage)[:]

            # Create pseudohaplotypes (0=ref, 1=alt, -1=missing)
            # genotypes[:] materialises the data before the file is closed.
            hap_matrix = genotypes[:][unlinked].to_haplotypes()

        # Double up the sample names
        samples_dup = np.array(list(zip(samples, samples))).reshape(-1, 1)

        hap_df = pd.DataFrame(np.hstack((samples_dup, hap_matrix.T)))

        # Atomic write TSV file output
        af = atomic_file(self.output().path)
        hap_df.to_csv(af.tmp_path, sep='\t', index=False)
        af.move_to_final_destination()
Exemple #4
0
def ld_prune(gn, size, step, threshold=.1, n_iter=1):
    """Run ``n_iter`` rounds of pruning with ``al.locate_unlinked``.

    Each round builds a boolean mask over the rows of ``gn``, prints the
    number of rows kept and removed, and keeps only the masked rows.
    Returns the filtered ``gn`` (the input itself when ``n_iter`` is 0).
    """
    for round_no in range(1, n_iter + 1):
        keep = al.locate_unlinked(
            gn, size=size, step=step, threshold=threshold)
        kept = np.count_nonzero(keep)
        dropped = gn.shape[0] - kept
        print('iteration', round_no, 'retaining', kept,
              'removing', dropped, 'variants')
        gn = gn.compress(keep, axis=0)
    return gn
Exemple #5
0
def ld_prune(gn, size, step, threshold=.1, n_iter=1):
    """Prune ``gn`` with repeated ``allel.locate_unlinked`` passes.

    Via http://alimanfoo.github.io/2015/09/28/fast-pca.html

    On every pass the rows flagged by the returned boolean mask are kept,
    and the kept/removed counts are printed. Returns the filtered ``gn``.
    """
    passes_done = 0
    while passes_done < n_iter:
        passes_done += 1
        mask = allel.locate_unlinked(
            gn, size=size, step=step, threshold=threshold)
        n_kept = np.count_nonzero(mask)
        n_gone = gn.shape[0] - n_kept
        print('iteration', passes_done, 'retaining', n_kept,
              'removing', n_gone, 'variants')
        gn = gn.compress(mask, axis=0)
    return gn
Exemple #6
0
    def test_locate_unlinked(self):
        """Compare ``allel.locate_unlinked`` output with hand-written masks."""

        def check(rows, expect, **kwargs):
            # one call + one comparison per scenario
            actual = allel.locate_unlinked(rows, **kwargs)
            aeq(expect, actual)

        check([[0, 1, 2], [0, 1, 2]],
              [True, False],
              size=2, step=2, threshold=.5)

        check([[0, 1, 1, 2], [0, 1, 1, 2], [1, 1, 0, 2], [1, 1, 0, 2]],
              [True, False, True, False],
              size=2, step=1, threshold=.5)

        # the same five rows are evaluated with two window sizes
        five_rows = [[0, 1, 1, 2], [0, 1, 1, 2], [0, 1, 1, 2], [1, 1, 0, 2],
                     [1, 1, 0, 2]]
        check(five_rows,
              [True, False, True, True, False],
              size=2, step=1, threshold=.5)
        check(five_rows,
              [True, False, False, True, False],
              size=3, step=1, threshold=.5)
Exemple #7
0
def ld_prune(gn, size, step, threshold=.1, n_iter=1):
    """
    Performs LD pruning, originally from Alistair Miles' blog.

    For each of ``n_iter`` iterations a boolean mask is obtained from
    ``allel.locate_unlinked``, a progress line is printed, and ``gn`` is
    reduced to the rows selected by the mask. The reduced ``gn`` is returned.
    """
    locate_kwargs = dict(size=size, step=step, threshold=threshold)
    for iteration in range(1, n_iter + 1):
        selected = allel.locate_unlinked(gn, **locate_kwargs)
        n_selected = np.count_nonzero(selected)
        n_rejected = gn.shape[0] - n_selected
        print('iteration', iteration, 'retaining', n_selected, 'removing',
              n_rejected, 'variants')
        gn = gn.compress(selected, axis=0)
    return gn
Exemple #8
0
 def _pca_ld_prune(gn, size, step, threshold=.1, n_iter=1):
     """Prune ``gn`` over ``n_iter`` passes of ``allel.locate_unlinked``.

     ``blen`` passed to ``allel.locate_unlinked`` is fixed at ten times
     ``size``. Each pass prints a progress line and keeps only the rows
     selected by the returned boolean mask. Returns the reduced ``gn``.
     """
     progress = '[Exec][PCA][LD Prune] Iteration {}/{}: Retaining {} and removing {} variants.'
     block_length = size * 10
     for pass_no in range(1, n_iter + 1):
         keep = allel.locate_unlinked(gn,
                                      size=size,
                                      step=step,
                                      threshold=threshold,
                                      blen=block_length)
         n_keep = np.count_nonzero(keep)
         n_drop = gn.shape[0] - n_keep
         print(progress.format(pass_no, n_iter, n_keep, n_drop))
         gn = gn.compress(keep, axis=0)
     return gn
Exemple #9
0
def ld_prune(gn, size, step, threshold=.2, n_iter=1, blen=10000):
    """Prune ``gn`` with ``allel.locate_unlinked`` and report what was kept.

    Parameters
    ----------
    gn : genotype array
        Must provide ``shape``, ``to_n_alt()`` and ``compress(mask, axis=0)``.
    size, step, threshold, blen
        Forwarded unchanged to ``allel.locate_unlinked``.
    n_iter : int, optional
        Number of pruning rounds. With 0 nothing is pruned.

    Returns
    -------
    tuple
        ``(gn, loc_retained)``: the pruned genotypes and a boolean array
        with one entry per variant of the *input* ``gn``, True where the
        variant survived every round. For ``n_iter=1`` this is the mask
        produced by the single ``allel.locate_unlinked`` call.
    """
    # Mask expressed against the original variants, refined every round.
    loc_retained = np.ones(gn.shape[0], dtype=bool)

    for i in range(n_iter):
        # Recompute the alt-allele counts from the current (already pruned)
        # genotypes; reusing the counts of the original array would make the
        # mask length disagree with ``gn`` from the second round on.
        gn_alt = gn.to_n_alt()
        loc_unlinked = np.asarray(
            allel.locate_unlinked(gn_alt,
                                  size=size,
                                  step=step,
                                  threshold=threshold,
                                  blen=blen),
            dtype=bool)
        n = np.count_nonzero(loc_unlinked)
        n_remove = gn.shape[0] - n
        print('iteration', i + 1, 'retaining', n, 'removing', n_remove,
              'variants')
        gn = gn.compress(loc_unlinked, axis=0)

        # Map this round's rejections back onto original row numbers.
        still_in = np.flatnonzero(loc_retained)
        loc_retained[still_in[~loc_unlinked]] = False
    return (gn, loc_retained)
Exemple #10
0
def test_vs_skallel(args):
    """Check that the windowed LD-prune pipeline drops the same variants
    as ``allel.locate_unlinked`` for the generated inputs in ``args``."""
    x, size, step, threshold, chunks = args
    n_variant, n_sample = x.shape[0], x.shape[1]

    # Build a dataset whose "dosage" variable is x, chunked along variants.
    ds = simulate_genotype_call_dataset(n_variant=n_variant,
                                        n_sample=n_sample)
    dosage = da.asarray(x).rechunk({0: chunks})
    ds["dosage"] = (["variants", "samples"], dosage)
    ds = window_by_variant(ds, size=size, step=step)

    # The pair table must not list any (i, j) pair twice.
    ldm = ld_matrix(ds, threshold=threshold)
    pair_is_repeated = ldm.compute().duplicated(subset=["i", "j"])
    assert not pair_is_repeated.any()

    to_drop = maximal_independent_set(ldm).ld_prune_index_to_drop.data
    idx_drop = np.sort(to_drop)

    # Reference answer: indices where the allel mask is False.
    unlinked = allel.locate_unlinked(x, size=size, step=step,
                                     threshold=threshold)
    idx_drop_ska = np.sort(np.argwhere(~unlinked).squeeze(axis=1))

    npt.assert_equal(idx_drop_ska, idx_drop)
def ld_prune(gn, size, step, threshold, n_iter):
    """
    Applies pruning: keeps only the SNPs that ``allel.locate_unlinked``
    selects and removes the rest.

    Parameters
    ----------
    gn : array-like
        Handed to ``allel.locate_unlinked``; must provide ``shape``,
        ``compress(mask, axis=0)`` and slicing.
    size, step, threshold
        Forwarded unchanged to ``allel.locate_unlinked``.
    n_iter : int
        Number of pruning iterations.

    Returns
    -------
    The pruned array, sliced with ``[:]``.
    """
    for i in range(n_iter):
        loc_unlinked = allel.locate_unlinked(gn,
                                             size=size,
                                             step=step,
                                             threshold=threshold)
        n = np.count_nonzero(loc_unlinked)
        n_remove = gn.shape[0] - n
        # Lazy %-style arguments: the message is only built if it is emitted.
        logging.info(
            "Pruning iteration %s retaining  %s removing %s variants",
            i + 1, n, n_remove)
        gn = gn.compress(loc_unlinked, axis=0)
    logging.info(
        "Applied pruning with paramaters, size : %s, step : %s, "
        "threshold : %s, n_iter : %s",
        size, step, threshold, n_iter)
    return gn[:]
Exemple #12
0
def prune_by_ld(number_of_alternate_alleles,
                window_size=1000,
                step_size=100,
                r2=0.2):
    '''Filter an array of alternate-allele counts with
    ``allel.locate_unlinked``.

    Returns a pair ``(pruned, pruned_bool)``: the input indexed by the
    Boolean mask, and the mask itself. Raises ``ValueError`` unless
    ``window_size``, ``step_size`` and ``r2`` all compare greater than zero,
    and emits a warning when the filtered array is not shorter than the
    input.'''

    # `not value > 0` (rather than `value <= 0`) also rejects NaN.
    for value in (window_size, step_size, r2):
        if not value > 0:
            raise ValueError("All numeric parameters must be positive")

    mask = allel.locate_unlinked(number_of_alternate_alleles,
                                 window_size, step_size, r2)
    kept = number_of_alternate_alleles[mask]

    nothing_removed = not len(kept) < len(number_of_alternate_alleles)
    if nothing_removed:
        warnings.warn("Warning, no pruning occurred!")

    return kept, mask
Exemple #13
0
def ldthin(geno,
           positions,
           method,
           rsize=50000,
           mac=1,
           size=100,
           step=20,
           thresh=.1,
           iters=1):
    """Select sites for PCA, by random down-sampling or by LD thinning.

    Sites are first restricted to those where ``count_alleles()`` reports
    a maximum allele of 1 and both of the first two allele counts exceed
    ``mac``. If ``method`` contains ``'random'``, up to ``rsize`` of those
    sites are drawn without replacement (``.take`` on row indices);
    otherwise the sites are filtered by mask (``.compress``) and then
    thinned with ``iters`` rounds of ``allel.locate_unlinked``.

    Parameters
    ----------
    geno : genotype array
        Must provide ``count_alleles``, ``take`` and ``compress``.
    positions : array-like
        One entry per row of ``geno``; only read in the non-random branch.
    method : str or container of str
        Tested with ``'random' in method``.
    rsize : int, optional
        Number of sites to draw in the random branch; capped at the number
        of eligible sites.
    mac : int, optional
        Both allele counts must be strictly greater than this value.
    size, step, thresh : optional
        Forwarded to ``allel.locate_unlinked`` as ``size``, ``step`` and
        ``threshold``.
    iters : int, optional
        Number of thinning rounds in the non-random branch.

    Returns
    -------
    tuple
        ``(gn, allel.SortedIndex(pos))``.
    """
    ac = geno.count_alleles()
    # only biallelic and mac or 2
    pca_selection = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > mac)

    print("Available site for PCA: {}".format(np.count_nonzero(pca_selection)))
    if 'random' in method:
        indices = np.nonzero(pca_selection)[0]
        # Sampling without replacement cannot draw more sites than exist;
        # cap the request instead of letting np.random.choice raise.
        n_draw = min(rsize, indices.size)
        indices_ds = np.random.choice(indices, size=n_draw, replace=False)
        indices_ds.sort()
        genotypes_pca = geno.take(indices_ds, axis=0)
        # sites with missing data can return error
        gn = genotypes_pca.to_n_alt()[:]
        # NOTE(review): this branch returns row indices, while the other
        # branch returns entries of `positions` -- confirm whether
        # `positions[indices_ds]` was intended.
        pos = indices_ds
        print("{} Random SNPs selected for PCA".format(gn.shape[0]))
    else:
        genotypes_pca = geno.compress(pca_selection, axis=0)
        gn = genotypes_pca.to_n_alt()
        pos = positions[pca_selection]
        for i in range(iters):
            loc_unlinked = allel.locate_unlinked(gn,
                                                 size=size,
                                                 step=step,
                                                 threshold=thresh)
            n = np.count_nonzero(loc_unlinked)
            n_remove = gn.shape[0] - n
            print("iteration {} retaining {} removing {} variants".format(
                i + 1, n, n_remove))
            # keep genotypes and positions aligned
            gn = gn.compress(loc_unlinked, axis=0)
            pos = pos[loc_unlinked]
    return (gn, allel.SortedIndex(pos))
Exemple #14
0
def ld_prune(genotypes_012,
             pos=None,
             size=100,
             step=20,
             threshold=0.1,
             verbosity=0):
    """Carries out one pass of ld pruning.

    A boolean mask from ``allel.locate_unlinked`` is applied to
    ``genotypes_012`` and, when given, to ``pos``. Returns the filtered
    genotypes alone if ``pos`` is None, otherwise ``(genotypes, pos)``.
    A summary line is printed when ``verbosity`` is greater than 0.
    """
    keep = allel.locate_unlinked(
        genotypes_012, size=size, step=step, threshold=threshold)
    n_kept = np.count_nonzero(keep)
    pruned = genotypes_012.compress(keep, axis=0)

    if verbosity > 0:
        summary = "ld_prune: Retaining: {}  out of {} variants"
        print(summary.format(n_kept, genotypes_012.shape[0]))

    if pos is None:
        return pruned
    return pruned, pos[keep]
Exemple #15
0
def test_vs_skallel(args, scheduler):
    """Check that ``ld_prune`` and ``allel.locate_unlinked`` agree on the
    indices to drop for the generated inputs in ``args``."""
    x, window, step, threshold = args
    # One shared kwargs dict for both numba-backed stages.
    be_args = dict(backend='numba')
    half_of_rows = max(x.shape[0] // 2, 1)

    pruned = ld_prune(GenotypeCountDataset.create(x),
                      window=window,
                      step=step,
                      threshold=threshold,
                      unit='index',
                      target_chunk_size=half_of_rows,
                      ld_matrix_kwargs=dict(backend='dask/numba'),
                      axis_intervals_kwargs=be_args,
                      mis_kwargs=be_args)
    with dask.config.set(scheduler=scheduler):
        idx_drop_lib = np.sort(pruned.index_to_drop.data)

    # Reference mask; note the allel window is window + 1 and the whole
    # array is processed as a single block.
    unlinked = allel.locate_unlinked(x,
                                     size=window + 1,
                                     step=step,
                                     threshold=threshold,
                                     blen=x.shape[0])
    idx_drop_ska = np.sort(np.argwhere(~unlinked).squeeze(axis=1))

    npt.assert_equal(idx_drop_ska, idx_drop_lib)
Exemple #16
0
import allel
import moments
import numpy as np

# Load a VCF, keep the sites that pass the LD filter in BOTH populations,
# compute the folded joint SFS and write it to file.
vcf_file = "/data3/vcf_files/allopatric_parental/intermediate_files/demographic_inf_parentals_rename.vcf"
# Only the genotype calls are used below, so parse just that field rather
# than every field in the file.
f = allel.read_vcf(vcf_file, fields=['calldata/GT'])
gt = f['calldata/GT']
gt = allel.GenotypeArray(gt)

# Sample columns of each population, defined once so the selections made
# before and after filtering cannot drift apart.
personata_cols = slice(0, 10)
alba_cols = slice(10, 20)

personata = gt[:, personata_cols]
alba = gt[:, alba_cols]
# Alternate-allele counts per call, with -1 as the fill value.
gn_p = personata.to_n_alt(fill=-1)
gn_a = alba.to_n_alt(fill=-1)
p_unlinked = allel.locate_unlinked(gn_p)
a_unlinked = allel.locate_unlinked(gn_a)
# A site is retained only if it is selected in both populations.
pass_linkage = np.logical_and(p_unlinked, a_unlinked)
final = gt.compress(pass_linkage, axis=0)
personata2 = final[:, personata_cols]
alba2 = final[:, alba_cols]
pers_ac = personata2.count_alleles()
alba_ac = alba2.count_alleles()
fsfs = allel.joint_sfs_folded(pers_ac, alba_ac)
m_fsfs = moments.Spectrum(fsfs)
m_fsfs.to_file("allopatric_parental_unlinked_sfs.txt")
Exemple #17
0
import allel
import moments
import numpy as np

# Load a VCF, keep the sites that pass the LD filter in BOTH populations,
# compute the folded joint SFS and write it to file.
vcf_file = "syma.moments.bi.recode.vcf"
# Only the genotype calls are used below, so parse just that field rather
# than every field in the file.
f = allel.read_vcf(vcf_file, fields=['calldata/GT'])
gt = f['calldata/GT']
gt = allel.GenotypeArray(gt)

# Sample columns of each population, defined once so the selections made
# before and after filtering cannot drift apart.
toro_cols = np.r_[0, 1, 2, 7, 12, 13, 17, 18]
mega_cols = np.r_[3, 4, 5, 6, 8, 9, 10, 14, 15, 16]

toro = gt[:, toro_cols]
mega = gt[:, mega_cols]
# Alternate-allele counts per call, with -1 as the fill value.
gn_t = toro.to_n_alt(fill=-1)
gn_m = mega.to_n_alt(fill=-1)
t_unlinked = allel.locate_unlinked(gn_t)
m_unlinked = allel.locate_unlinked(gn_m)
# A site is retained only if it is selected in both populations.
pass_linkage = np.logical_and(t_unlinked, m_unlinked)
final = gt.compress(pass_linkage, axis=0)
toro2 = final[:, toro_cols]
mega2 = final[:, mega_cols]
toro_ac = toro2.count_alleles()
mega_ac = mega2.count_alleles()
fsfs = allel.joint_sfs_folded(toro_ac, mega_ac)
m_fsfs = moments.Spectrum(fsfs)
m_fsfs.to_file("syma_unlinked_sfs.txt")