Beispiel #1
0
def fst(g, directory, outfn, samplelist):
    """Compute per-variant Weir & Cockerham Fst between ethnic groups.

    Sample metadata is loaded through ``fp.retrieveMetaData`` and its
    ``'ethnic group'`` and ``'id'`` columns are used to assign every entry
    of ``samplelist`` to a subpopulation.

    Args:
        g: Genotype data handed unchanged to ``allel.weir_cockerham_fst``.
        directory: Forwarded to ``fp.retrieveMetaData``.
        outfn: Forwarded to ``fp.retrieveMetaData``.
        samplelist: Sample ids. The position of an id in this sequence is
            used as its sample index (assumes the same ordering as the
            sample axis of ``g`` -- TODO confirm against the caller).

    Returns:
        One value per variant, ``sum(a) / (sum(a) + sum(b) + sum(c))``, with
        each variance component summed over axis 1. Zero denominators are
        not guarded against, so such variants give non-finite values.
    """
    ## get subpops - dataframe
    df = fp.retrieveMetaData(None, directory, outfn)

    ## ids of every subpop; sets keep the membership test below cheap
    ids_by_pop = {
        pop: set(df[df['ethnic group'] == pop]['id'])
        for pop in df['ethnic group'].unique()
    }

    ## positions in samplelist that belong to each subpop; a subpop only
    ## appears here once at least one of its ids is found in samplelist
    indices_by_pop = defaultdict(list)
    for index, sample_id in enumerate(samplelist):
        for pop, pop_ids in ids_by_pop.items():
            if sample_id in pop_ids:
                indices_by_pop[pop].append(index)

    subpoplist = list(indices_by_pop.values())

    ## calculate variance components
    a, b, c = allel.weir_cockerham_fst(g, subpoplist)

    ## average fst per variant
    a_sum = np.sum(a, axis=1)
    return a_sum / (a_sum + np.sum(b, axis=1) + np.sum(c, axis=1))
def get_weirFst(twopop_geno_mat, popsize):
    """Return one Weir & Cockerham Fst estimate for two equally sized groups.

    The input matrix is converted with ``to_Genotype_array``. Sample indices
    ``0 .. popsize - 1`` form the first subpopulation and
    ``popsize .. 2 * popsize - 1`` the second.

    Args:
        twopop_geno_mat: Genotype matrix accepted by ``to_Genotype_array``.
        popsize: Number of samples in each of the two subpopulations.

    Returns:
        ``sum(a) / (sum(a) + sum(b) + sum(c))`` where each variance
        component is summed over all of its entries.
    """
    genotypes = to_Genotype_array(twopop_geno_mat)
    first_pop = range(popsize)
    second_pop = range(popsize, popsize * 2)
    components = allel.weir_cockerham_fst(genotypes,
                                          subpops=[first_pop, second_pop])
    # Collapse each component to a single total before forming the ratio.
    total_a, total_b, total_c = (np.sum(part) for part in components)
    return total_a / (total_a + total_b + total_c)
Beispiel #3
0
def calc_fst_persite(gt_array_fst, fst_pop_indicies, fst_type):
    """Compute FST for every site with scikit-allel.

    Args:
        gt_array_fst: Genotype array. It is passed to
            ``allel.weir_cockerham_fst`` for ``"wc"`` and its
            ``count_alleles`` method is used for ``"hudson"``.
        fst_pop_indicies: Sequence of per-population sample index
            collections. Only entries 0 and 1 are used for ``"hudson"``.
        fst_type: ``"wc"`` (Weir & Cockerham 84) or ``"hudson"``
            (Hudson 92).

    Returns:
        The per-site ratio for the requested estimator, or ``None`` when
        ``fst_type`` is neither ``"wc"`` nor ``"hudson"``. Zero denominators
        are not guarded against.
    """
    # WC 84: sum each variance component over axis 1, then take the ratio
    if fst_type == "wc":
        components = allel.weir_cockerham_fst(gt_array_fst,
                                              subpops=fst_pop_indicies)
        sum_a, sum_b, sum_c = (np.sum(part, axis=1) for part in components)
        return sum_a / (sum_a + sum_b + sum_c)

    # Hudson 92: following the scikit-allel docs, start from the allele
    # counts of each of the two populations
    if fst_type == "hudson":
        counts_first = gt_array_fst.count_alleles(subpop=fst_pop_indicies[0])
        counts_second = gt_array_fst.count_alleles(subpop=fst_pop_indicies[1])

        # the estimator comes back as numerator and denominator
        numerator, denominator = allel.hudson_fst(counts_first, counts_second)
        return numerator / denominator

    return None
Beispiel #4
0
def calcWCfst_per_site(ac, pairs, gtvars, idx1, idx2):
    """Per-SNP Weir & Cockerham Fst for one pair of populations.

    Besides its parameters this function reads ``scafbp``, ``getScafBp`` and
    ``idx`` from the surrounding scope; their definitions are not visible
    here.

    Args:
        ac: Mapping from population key to allele counts.
        pairs: The two population keys; entries 0 and 1 are used.
        gtvars: Genotype array providing ``compress``.
        idx1: Sample indices of the first population.
        idx2: Sample indices of the second population.

    Returns:
        Tuple ``(pairs, n_retained, snp_fst, positions, mask)``: the input
        ``pairs``, the number of retained sites, the first column of
        ``a / (a + b + c)`` at those sites, ``scafbp`` indexed by
        ``getScafBp(idx, mask)``, and the boolean retention mask.
    """
    pooled_counts = al.AlleleCountsArray(ac[pairs[0]][:] + ac[pairs[1]][:])
    # Retain sites that segregate in the pooled counts and whose highest
    # observed allele index is 1.
    retained = pooled_counts.is_segregating() & (pooled_counts.max_allele() == 1)
    retained_genotypes = gtvars.compress(retained, axis=0)
    positions = scafbp[getScafBp(idx, retained)]

    # Weir & Cockerham's variance components
    a, b, c = al.weir_cockerham_fst(
        retained_genotypes, subpops=[idx1, idx2], max_allele=1)

    # Zero denominators are expected; silence the warnings and let the
    # non-finite values through.
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = a / (a + b + c)
    per_snp = ratio[:, 0]

    n_retained = np.count_nonzero(retained)
    return pairs, n_retained, per_snp, positions, retained
Beispiel #5
0
def calc_fst(gt_array_fst, fst_pop_indicies, fst_type):
    """Compute a single multi-site FST plus its summed components.

    Args:
        gt_array_fst: Genotype array. It is passed to
            ``allel.weir_cockerham_fst`` for ``"wc"`` and its
            ``count_alleles`` method is used for ``"hudson"``; ``len()`` of
            it is reported as the number of sites.
        fst_pop_indicies: Sequence of per-population sample index
            collections. Only entries 0 and 1 are used for ``"hudson"``.
        fst_type: ``"wc"`` (Weir & Cockerham 84) or ``"hudson"``
            (Hudson 92).

    Returns:
        For ``"wc"``: ``(fst, a, b, c, n_sites)`` with the NaN-ignoring sums
        of the three variance components.
        For ``"hudson"``: ``(fst, num, den, 0, n_sites)``, the same layout
        with the summed numerator in place of ``a``, the summed denominator
        in place of ``b`` and a zero placeholder for ``c``.
        ``fst`` is the string ``"NA"`` when the divisor is not positive.
        ``None`` is returned for any other ``fst_type``.
    """
    # WC 84
    if fst_type == "wc":
        a, b, c = allel.weir_cockerham_fst(gt_array_fst,
                                           subpops=fst_pop_indicies)

        # compute variance component sums (NaN entries are skipped) and
        # convert the numpy scalars to plain Python numbers
        a = np.nansum(a).tolist()
        b = np.nansum(b).tolist()
        c = np.nansum(c).tolist()
        n_sites = len(gt_array_fst)

        # compute fst
        total = a + b + c
        fst = a / total if total > 0 else "NA"

        return (fst, a, b, c, n_sites)

    # Hudson 92
    if fst_type == "hudson":

        # following scikit allel docs
        # allele counts for each population
        ac1 = gt_array_fst.count_alleles(subpop=fst_pop_indicies[0])
        ac2 = gt_array_fst.count_alleles(subpop=fst_pop_indicies[1])

        # hudson fst has two components (numerator & denominator)
        num, den = allel.hudson_fst(ac1, ac2)
        c = 0  # for compatibility with aggregation code for WC 84

        # compute component sums (NaN entries are skipped)
        num = np.nansum(num).tolist()
        den = np.nansum(den).tolist()
        n_sites = len(gt_array_fst)

        # compute fst; the divisor is `den` alone, so that is what must be
        # positive. Testing `num + den` would discard valid estimates at or
        # below -1 (negative numerator, positive denominator).
        fst = num / den if den > 0 else "NA"

        # same abc format as WC84, where 'a' is the numerator and
        # 'b' is the denominator, and 'c' is a zero placeholder
        return (fst, num, den, c, n_sites)

    return None
Beispiel #6
0
    'N': ids[ids['pops'] == 'N'].index.tolist(),
    'S': ids[ids['pops'] == 'S'].index.tolist(),
}

# Allele counts for every entry of `subpops`, limited to allele indices 0
# and 1 (max_allele=1).
ac_subpops = gtsub.count_alleles_subpops(subpops, max_allele= 1)

# Boolean mask of the variants that segregate according to the 'all' counts.
segAll = ac_subpops['all'].is_segregating()[:]

# Genotypes restricted to those segregating variants.
gtseg = gtsub.compress(segAll, axis= 0)


########

## Weir & Cockerham's Fst for each locus

# The first entry of `subpops` is skipped here -- presumably the 'all'
# group; the start of the dict is outside this excerpt, so confirm.
a, b, c, = al.weir_cockerham_fst(gtseg, list(subpops.values())[1:])

# estimate theta (a.k.a. Fst) for each variant & allele directly
# (zero denominators are not guarded against here):
fst = a / (a + b + c)




# compare Hudson's and Weir & Cockerham's per locus Fst:

# only take variants that are segregating between the two pops
# (pooled 'S' + 'N' counts, highest observed allele index equal to 1)
acu = al.AlleleCountsArray(ac_subpops['S'][:] + ac_subpops['N'][:])
flt = acu.is_segregating() & (acu.max_allele() == 1)
print('retaining', np.count_nonzero(flt), 'SNPs')

# 'S' allele counts at the retained variants, first two allele columns only
ac1 = al.AlleleCountsArray(ac_subpops['S'].compress(flt, axis=0)[:, :2])
    def print_pi(self, tree_sequence, indices, populations):
        """Write per-population diversity and pairwise Fst as one table row.

        Does nothing when ``self.pi_needed()`` is falsy. Otherwise a
        tab-separated header line and one data line are written to
        ``self.writers['pi']``. The data line holds, for AF, EU and AS, the
        mean pairwise difference averaged over the retained variants,
        followed by the mean per-variant Weir & Cockerham Fst for the pairs
        AF-EU, AF-AS and EU-AS.

        Only variants whose allele-1 frequency lies strictly between 5% and
        95% are retained for either statistic.

        Args:
            tree_sequence: Object providing ``genotype_matrix()``; the
                result is indexed here as (variants, haplotypes).
            indices: Per-haplotype population index, compared element-wise
                with the values of ``populations``.
            populations: Mapping with the keys ``'AF'``, ``'EU'`` and
                ``'AS'``.
        """
        if not self.pi_needed():
            return

        writer = self.writers['pi']

        pops = 'AF EU AS'.split()
        indices = np.array(indices)

        writer.write('\t'.join(pops) + '\t')
        writer.write('AF-EU\tAF-AS\tEU-AS\n')

        haplotypes = tree_sequence.genotype_matrix()

        def common_variants(counts):
            """Return the mask of variants with 5% < allele-1 frequency < 95%."""
            # The allele total of the first variant is the denominator for
            # every variant (assumes no missing calls -- TODO confirm).
            maf = counts.values[:, 1] / counts.values[0, :].sum()
            return np.logical_and(maf > 0.05, maf < 0.95)

        # Calculate pi
        for pop in pops:
            ## Allele counts of the diploid genotypes built from this
            ## population's haplotypes
            counts = allel.HaplotypeArray(
                haplotypes[:, indices == populations[pop]]).to_genotypes(
                    ploidy=2).count_alleles()
            counts = counts[common_variants(counts)]

            ## Mean pairwise difference per retained variant, averaged over
            ## the number of retained variants
            mpd = allel.mean_pairwise_difference(counts)

            writer.write(f'{mpd.sum()/counts.shape[0]:.5}\t')

        # Calculate Fst
        for first, second in (('AF', 'EU'), ('AF', 'AS'), ('EU', 'AS')):
            in_first = indices == populations[first]
            in_second = indices == populations[second]
            num1 = np.count_nonzero(in_first) // 2
            num2 = np.count_nonzero(in_second) // 2

            ## Sample indices of the two subpops within the combined array.
            ## NOTE(review): this layout assumes every haplotype of the
            ## first population precedes those of the second in column
            ## order -- confirm against how `indices` is built.
            subpops = [list(range(0, num1)), list(range(num1, num1 + num2))]
            ga = allel.HaplotypeArray(
                haplotypes[:, np.logical_or(in_first, in_second)]
            ).to_genotypes(ploidy=2)

            ## Calculate mean Fst based on combined genotype data
            a, b, c = allel.weir_cockerham_fst(
                ga[common_variants(ga.count_alleles())], subpops)
            a_sum = np.sum(a, axis=1)
            fst = np.mean(
                a_sum / (a_sum + np.sum(b, axis=1) + np.sum(c, axis=1)))

            writer.write(f'{fst:.5}\t')
        writer.write('\n')
Beispiel #8
0
                     help="output file path",
                     type=str,
                     required=True)
args = parser.parse_args()

import numpy as np
import pandas as pd
import allel

# Load the genotype calls of the VCF named on the command line.
callset = allel.read_vcf(args.vcf)
genotypes = allel.GenotypeArray(callset['calldata/GT'])

# 34 consecutive groups of 250 sample indices: 1..250, 251..500, ...
# NOTE(review): the indices start at 1, so sample index 0 belongs to no
# group and the last index is 8500 -- confirm this offset is intended.
pops = [
    list(range(offset + 1, offset + 251))
    for offset in range(0, 34 * 250, 250)
]

# Keep only the variance components of allele index 1.
a, b, c = (
    np.take(component, indices=1, axis=1)
    for component in allel.weir_cockerham_fst(genotypes, pops)
)

# Fst = a / (a + b + c); rows with a zero denominator stay at 0.
total = a + b + c
defined = total != 0
fst = np.zeros(len(a))
fst[defined] = a[defined] / total[defined]

# One headerless, tab-separated row per variant: a, b, c, fst.
table = pd.DataFrame({"a": a, "b": b, "c": c, "fst": fst})
table.to_csv(args.outpre + ".scikit.fst", sep="\t", header=False, index=False)
Beispiel #9
0
def calculate_per_marker_fst(gt, subpops):
    """Return Weir & Cockerham Fst for every marker.

    Args:
        gt: Genotype data passed unchanged to ``weir_cockerham_fst``.
        subpops: Subpopulation sample indices, passed unchanged as well.

    Returns:
        ``sum(a) / (sum(a) + sum(b) + sum(c))`` per marker, with every
        variance component summed over axis 1. Zero denominators are not
        guarded against.
    """
    components = weir_cockerham_fst(gt, subpops)
    sum_a, sum_b, sum_c = (np.sum(part, axis=1) for part in components)
    return sum_a / (sum_a + sum_b + sum_c)