Ejemplo n.º 1
0
def test_sfs_folded():
    """Check ``allel.sfs_folded`` against a hand-built allele counts table.

    The same three-variant table is passed once as a nested list and then
    as NumPy arrays of several unsigned/signed integer dtypes; every call
    must yield the same folded spectrum.
    """
    ac = [[0, 3], [1, 2], [2, 1]]
    expect = [1, 2]

    # Plain Python list input.
    assert_array_equal(expect, allel.sfs_folded(ac))

    # Same counts under different integer dtypes.
    for dtype in ('u2', 'i2', 'u8', 'i8'):
        typed_counts = np.asarray(ac, dtype=dtype)
        assert_array_equal(expect, allel.sfs_folded(typed_counts))
Ejemplo n.º 2
0
def asfs_stats(gt, pos, fold):
    """Calculate the allele frequency spectrum as proportions.

    The input is first reduced with ``get_seg(gt, pos)``; the spectrum is
    then computed with scikit-allel, the boundary classes are dropped, and
    the remaining entries are divided by their sum.

    Future implementations will utilize the breakpoints from msprime tree
    object to find unlinked positions.

    Parameters
    ----------
    gt : array-like
        Passed straight to ``get_seg``. The value returned by ``get_seg``
        must provide ``count_alleles()`` and ``shape`` (presumably a
        scikit-allel haplotype or genotype array -- confirm with callers).
    pos : array-like
        Positions, passed straight to ``get_seg``.
    fold : bool
        If True, return the folded SFS; otherwise the unfolded SFS built
        from column 1 of the allele counts.

    Returns
    -------
    sfs : numpy.ndarray
        Spectrum entries divided by their total. For the folded case the
        first entry is dropped; for the unfolded case both the first and
        the last entries are dropped.

    """
    # TODO: random sample OR use msprime breakpoint to reduce linkage
    gtseg, _ = get_seg(gt, pos)

    # NOTE(review): ``gtseg.shape[1]`` is handed to scikit-allel as ``n``
    # (number of chromosomes sampled); that is only the chromosome count if
    # ``gtseg`` is haplotype-shaped -- confirm against get_seg.
    if fold:
        spectrum = allel.sfs_folded(gtseg.count_alleles(), gtseg.shape[1])
        spectrum = spectrum[1:]
    else:
        spectrum = allel.sfs(gtseg.count_alleles()[:, 1], gtseg.shape[1])
        spectrum = spectrum[1:-1]

    return spectrum / np.sum(spectrum)
Ejemplo n.º 3
0
def binned_sfs_mean(ac, bin_no=5):
    """
    Calculates the mean of the folded site frequency spectrum within bins.

    The first spectrum entry (monomorphic sites, which can be non-zero in
    sub populations) is dropped before binning.

    Arguments
    -------------
    ac: Allele counts (in format returned from scikit allel)
    bin_no: number of roughly equally sized bins.

    Returns
    ----------
    dictionary mapping a "start_stop" label to the mean spectrum value in
    that bin. ``start`` and ``stop`` are offsets into the spectrum after the
    first entry has been removed; ``stop`` is exclusive.

    Raises
    ----------
    ValueError if ``bin_no`` is larger than the number of spectrum entries
    left after dropping the first one.

    """
    sfs = allel.sfs_folded(ac)[1:]  # Drop monomorphic sites
    if len(sfs) < bin_no:
        raise ValueError(
            "The number of bins cannot exceed the length of the site frequency spectrum."
        )

    stats = {}
    start = 0
    # array_split tolerates lengths that are not a multiple of bin_no.
    for chunk in np.array_split(sfs, bin_no):
        stop = start + len(chunk)
        stats[f"{start}_{stop}"] = chunk.mean()
        start = stop

    return stats
Ejemplo n.º 4
0
def sfs_plot(c, ac_subpops, save=True, fold=True, scale=True):
    """Compute and plot the site frequency spectrum of each subpopulation.

    For every entry of ``ac_subpops`` only segregating sites whose highest
    observed allele index is 1 are retained. The spectrum variant (folded
    or unfolded, scaled or unscaled) is chosen from ``fold`` and ``scale``,
    and the matching scikit-allel plot function is used so that the drawing,
    the title and the x-axis label agree with what was actually computed.

    Notes carried over from the original author:
      * filter on segregating sites if only a subset of populations is used;
      * restrict to biallelic sites if more than one alternate allele is
        present, e.g.
            is_biallelic_01 = ac_seg['all'].is_biallelic_01()[:]
            ac1 = ac_seg['BFM'].compress(is_biallelic_01, axis=0)[:, :2]
            ac2 = ac_seg['AOM'].compress(is_biallelic_01, axis=0)[:, :2]

    Parameters
    ----------
    c : object
        Label formatted into the plot title and the output file name.
    ac_subpops : mapping
        Maps population name to an allele counts array supporting
        ``is_segregating``, ``max_allele`` and ``compress``.
    save : bool
        If True, write the figure to ``ScaledSFS-<c>.pdf``.
    fold : bool
        If True use the folded spectrum, otherwise the spectrum of column 1
        of the allele counts.
    scale : bool
        If True use the scaled spectrum.

    Returns
    -------
    dict
        Population name -> computed spectrum.
    """
    # (fold, scale) -> (spectrum function, matching plot function, title).
    # The default (True, True) reproduces the original title exactly.
    variants = {
        (True, True): (allel.sfs_folded_scaled,
                       allel.stats.plot_sfs_folded_scaled,
                       'Scaled folded site frequency spectra'),
        (True, False): (allel.sfs_folded,
                        allel.stats.plot_sfs_folded,
                        'Folded site frequency spectra'),
        (False, True): (allel.sfs_scaled,
                        allel.stats.plot_sfs_scaled,
                        'Scaled site frequency spectra'),
        (False, False): (allel.sfs,
                         allel.stats.plot_sfs,
                         'Site frequency spectra'),
    }
    compute_sfs, plot_fn, title = variants[(bool(fold), bool(scale))]

    sfsdict = {}
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.despine(ax=ax, offset=10)
    for pop in ac_subpops.keys():
        acu = ac_subpops[pop]
        flt = acu.is_segregating() & (acu.max_allele() == 1)
        print('SFS : retaining', np.count_nonzero(flt), 'SNPs')
        ac1 = allel.AlleleCountsArray(acu.compress(flt, axis=0))
        # Folded spectra take the full counts table; unfolded spectra take
        # the counts of allele 1 only.
        sfs = compute_sfs(ac1 if fold else ac1[:, 1])
        sfsdict[pop] = sfs
        # n converts the x axis from allele counts to frequencies.
        plot_fn(sfs, ax=ax, label=pop, n=ac1.sum(axis=1).max())
    ax.legend()
    ax.set_title('{} {}'.format(c, title))
    ax.set_xlabel('minor allele frequency' if fold
                  else 'derived allele frequency')
    if save:
        # File name kept as-is for backward compatibility with callers that
        # look for this path, even when the spectrum is not scaled.
        fig.savefig("ScaledSFS-{}.pdf".format(c), bbox_inches='tight')
    return sfsdict
Ejemplo n.º 5
0
def plot_sfs(segregating_biallelic_snp_acs_by_pop_id, metadataObj):
    """Save folded SFS and joint folded SFS plots for every population pair.

    For each unordered pair drawn from ``metadataObj.pop_ids_order`` two PNG
    files are written:

    * ``<prefix>.sfsfs.<A>_<B>.png`` -- both folded spectra on one axis;
    * ``<prefix>.jsfs.<A>_<B>.png`` -- the joint folded spectrum as a heat
      map, each cell annotated with its share of the total.

    Every figure is closed after saving so that figures do not accumulate
    in pyplot as the number of pairs grows.

    Parameters
    ----------
    segregating_biallelic_snp_acs_by_pop_id : mapping
        Population id -> allele counts accepted by ``allel.sfs_folded`` and
        ``allel.joint_sfs_folded``.
    metadataObj : object
        Must expose ``pop_ids_order`` (iterable of population ids) and
        ``prefix`` (used in the output file names).

    Returns
    -------
    None
    """
    for pop_id_A, pop_id_B in itertools.combinations(
            metadataObj.pop_ids_order, 2):
        ac_A = segregating_biallelic_snp_acs_by_pop_id[pop_id_A]
        ac_B = segregating_biallelic_snp_acs_by_pop_id[pop_id_B]

        # --- folded spectra of both populations on a shared axis ---
        fig, ax = plt.subplots(figsize=(8, 5))
        sns.despine(ax=ax)
        sfs1 = allel.sfs_folded(ac_A)
        allel.plot_sfs_folded(sfs1, ax=ax, label=pop_id_A)
        sfs2 = allel.sfs_folded(ac_B)
        allel.plot_sfs_folded(sfs2, ax=ax, label=pop_id_B)
        ax.legend()
        ax.set_title('Folded site frequency spectra')
        # ``n`` (number of chromosomes sampled) is not passed to the plot
        # calls, so the x axis is in allele counts, not frequencies; label
        # it accordingly.
        ax.set_xlabel('Minor allele count')
        ax.set_ylim(min(min(sfs1), min(sfs2)), max(max(sfs1), max(sfs2)))
        fig.tight_layout()
        fig.savefig('%s.sfsfs.%s_%s.png' %
                    (metadataObj.prefix, pop_id_A, pop_id_B),
                    format="png")
        plt.close(fig)

        # --- joint folded spectrum of the pair ---
        jsfs = allel.joint_sfs_folded(ac_A[:], ac_B[:])
        fig, ax = plt.subplots(figsize=(6, 6))
        ax = allel.plot_joint_sfs_folded(jsfs,
                                         ax=ax,
                                         imshow_kwargs={'cmap': 'Blues'})
        total = np.sum(jsfs)
        # Annotate every cell with its proportion of all sites.
        for (i, j), count in np.ndenumerate(jsfs):
            ax.text(j,
                    i,
                    "{:.2f}".format(count / total),
                    ha="center",
                    va="center",
                    color="black",
                    fontsize=12)
        # NOTE(review): the spectrum is folded, so these axes presumably
        # show minor rather than alternate allele counts -- confirm before
        # changing the labels.
        ax.set_ylabel('Alternate allele count, %s' % pop_id_A)
        ax.set_xlabel('Alternate allele count, %s' % pop_id_B)
        fig.tight_layout()
        fig.savefig('%s.jsfs.%s_%s.png' %
                    (metadataObj.prefix, pop_id_A, pop_id_B),
                    format="png")
        plt.close(fig)
Ejemplo n.º 6
0
def plot_sfs(allele_counts, dest=os.path.join(plot_dir, "site_freq_spectrum.pdf")):
    """Plot the folded site frequency spectrum as a line and save it.

    The x axis is the spectrum index divided by the largest per-site total
    of ``allele_counts``; the y axis is the number of sites in each class.

    Parameters
    ----------
    allele_counts : array
        Allele counts accepted by ``allel.sfs_folded``; must also support
        ``sum(axis=1)``.
    dest : str
        Output path handed to ``fig.savefig``.
    """
    fig, ax = plt.subplots(figsize=(12, 5))
    sns.despine(ax=ax, offset=10)

    spectrum = allel.sfs_folded(allele_counts)
    n_total = allele_counts.sum(axis=1).max()
    # Convert class indices to frequencies.
    freqs = np.arange(0, spectrum.shape[0]) / n_total
    ax.plot(freqs, spectrum)

    ax.set_title("Folded site frequency spectrum")
    # Label set explicitly as a workaround for scikit-allel axis naming.
    ax.set_xlabel("minor allele frequency")
    ax.set_ylabel("num sites")

    fig.savefig(dest)
def sfs(haplotype, ac, nindiv=None, folded=False):
    """Compute the SFS of a SNP matrix as a DataFrame.

    Parameters
    ----------
    haplotype : array
        Only consulted when ``nindiv`` is None, in which case its second
        dimension (``shape[1]``) is used as ``nindiv``.
    ac : array
        Allele counts. The folded spectrum uses the whole table; the
        unfolded spectrum uses ``ac.T[1]``.
    nindiv : int, optional
        Upper bound (exclusive) of the ``N_indiv`` classes reported.
    folded : bool
        If True compute the folded spectrum, otherwise the unfolded one.

    Returns
    -------
    pandas.DataFrame
        One row per ``N_indiv`` in ``1 .. nindiv - 1`` with columns
        ``N_indiv``, ``count_SNP`` (spectrum), ``i_xi`` (scaled spectrum)
        and ``freq_indiv`` (``N_indiv / nindiv``). Classes missing from the
        spectrum are filled with 0.
    """
    if nindiv is None:
        nindiv = haplotype.shape[1]

    # Only the source of the two spectra differs between the branches; the
    # table construction below is shared.
    if folded:
        counts = allel.sfs_folded(ac)
        scaled = allel.sfs_folded_scaled(ac)
    else:
        alt_counts = ac.T[1]
        counts = allel.sfs(alt_counts)
        scaled = allel.sfs_scaled(alt_counts)

    df_sfs = pd.DataFrame(counts, columns=["count_SNP"])
    df_sfs["i_xi"] = scaled
    df_sfs.index.name = "N_indiv"
    df_sfs.reset_index(inplace=True)

    # Right-merge onto 1..nindiv-1: drops class 0 and pads absent classes.
    tmp_df = pd.DataFrame({"N_indiv": range(1, nindiv)})
    # NOTE(review): astype(int) also truncates "i_xi"; the folded scaled
    # spectrum may be fractional, so this is lossy. Kept unchanged so
    # existing callers see the same dtypes -- confirm whether it is intended.
    df_sfs = df_sfs.merge(tmp_df, on="N_indiv",
                          how="right").fillna(0).astype(int)

    df_sfs["freq_indiv"] = df_sfs.N_indiv / nindiv
    return df_sfs