Example #1
def calc_fst_persite(gt_array_fst, fst_pop_indicies, fst_type):

    # compute per-site FST via scikit-allel

    # WC 84
    if fst_type == "wc":
        a, b, c = allel.weir_cockerham_fst(gt_array_fst,
                                           subpops=fst_pop_indicies)

        fst = (np.sum(a, axis=1) /
               (np.sum(a, axis=1) + np.sum(b, axis=1) + np.sum(c, axis=1)))

        return fst

    # Hudson 92
    elif fst_type == "hudson":

        # following scikit allel docs
        # allel counts for each population
        ac1 = gt_array_fst.count_alleles(subpop=fst_pop_indicies[0])
        ac2 = gt_array_fst.count_alleles(subpop=fst_pop_indicies[1])

        #hudson fst has two components (numerator & denominator)
        num, den = allel.hudson_fst(ac1, ac2)

        fst = num / den

        return fst
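For orientation, a minimal usage sketch of calc_fst_persite with toy data (the array values and variable names below are assumptions, not part of the original project):

import numpy as np
import allel

# toy data: 3 variants, 4 diploid samples; samples 0-1 are population 1,
# samples 2-3 are population 2
gt = allel.GenotypeArray([
    [[0, 0], [0, 1], [1, 1], [1, 1]],
    [[0, 1], [0, 0], [0, 1], [1, 1]],
    [[0, 0], [0, 0], [1, 1], [0, 1]],
])
pop_indices = [[0, 1], [2, 3]]

print(calc_fst_persite(gt, pop_indices, "hudson"))  # per-site Hudson FST
print(calc_fst_persite(gt, pop_indices, "wc"))      # per-site WC84 FST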
Example #2
def get_fst(s1, s2):
    ac1 = s1.count_alleles()
    ac2 = s2.count_alleles()
    num, den = allel.hudson_fst(ac1, ac2)
    fst = np.sum(num) / np.sum(den)
    print(f'The F_st value between the two populations is {round(np.abs(fst),2)}')
    return fst
Example #3
    def print_pi(self, tree_sequence, indices, populations):
        if not self.pi_needed():
            return

        writer = self.writers['pi']
        # fix the population order explicitly instead of relying on
        # dictionary key order, so the output columns stay consistent

        pops = 'AF EU AS'.split()
        indices = np.array(indices)

        writer.write('\t'.join(pops) + '\t')
        writer.write('AF-EU\tAF-AS\tEU-AS\n')

        length = tree_sequence.get_sequence_length()
        haplotypes = tree_sequence.genotype_matrix()
        for pop in pops:
            mpd = allel.mean_pairwise_difference(
                allel.HaplotypeArray(
                    haplotypes[:,
                               indices == populations[pop]]).count_alleles())
            writer.write(f'{mpd.sum()/length:.5}\t')

        for pairs in (('AF', 'EU'), ('AF', 'AS'), ('EU', 'AS')):
            count1 = allel.HaplotypeArray(
                haplotypes[:,
                           indices == populations[pairs[0]]]).count_alleles()
            count2 = allel.HaplotypeArray(
                haplotypes[:,
                           indices == populations[pairs[1]]]).count_alleles()
            num, den = allel.hudson_fst(count1, count2)
            writer.write(f'{num.sum() / den.sum():.5}\t')
        writer.write('\n')
Example #4
def main(vcffile, pop1, pop2, binwidth, stepsize, outprefix):
    """
    计算pop1和pop2之间的Fst
    using the method of Hudson (1992) elaborated by Bhatia et al. (2013).
    """
    pop1 = [x.strip() for x in open(pop1)]
    pop2 = [x.strip() for x in open(pop2)]
    callset = allel.read_vcf(vcffile)
    allsamples = callset['samples']
    genotypes = allel.GenotypeChunkedArray(callset['calldata/GT'])
    variant_selection = np.full((genotypes.shape[0] + 1), True)  # select all sites in the VCF
    sample_selection = [True if x in pop1 else False for x in allsamples]
    ac1 = getAC(genotypes, variant_selection, sample_selection)
    sample_selection = [True if x in pop2 else False for x in allsamples]
    ac2 = getAC(genotypes, variant_selection, sample_selection)
    num, den = allel.hudson_fst(ac1, ac2)
    fst = num / den
    meanFst = np.sum(num) / np.sum(den)
    print('meanFst: %s' % meanFst)
    chrom = callset['variants/CHROM']
    pos = callset['variants/POS']
    df = pd.DataFrame({'chrom': chrom, 'pos': pos, 'hudson_Fst': fst})
    df.to_csv(f'{outprefix}_persite.tsv.gz',
              sep='\t',
              index=False,
              na_rep='nan',
              compression='gzip')
    df['num'] = num
    df['den'] = den
    # sliding bins
    bdf = []
    for offset in range(0, binwidth, stepsize):
        df['bin_index'] = ((df['pos'].values - 1) - offset) // binwidth
        for group_name, gdf in df.groupby(by=['chrom', 'bin_index']):
            chrom, bin_index = group_name
            start = bin_index * binwidth + offset + 1
            if start < 0:  # skip the leading windows that are shorter than a full bin
                continue
            end = start + binwidth - 1
            n_snp = gdf.shape[0]
            sum_num = gdf['num'].sum()
            sum_den = gdf['den'].sum()
            if sum_den > 0:
                meanFst = sum_num / sum_den
            else:
                meanFst = np.nan
            bdf.append([chrom, start, end, n_snp, meanFst])
    bdf = pd.DataFrame(bdf,
                       columns=['chrom', 'start', 'end', 'n_snp',
                                'meanFst']).sort_values(by=['chrom', 'start'])
    bdf.to_csv(f'{outprefix}_meanFst.tsv.gz',
               index=False,
               compression='gzip',
               sep='\t',
               float_format='%.3f')
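The getAC helper is not shown in this example; a minimal sketch of what such a helper might look like, assuming it simply subsets the genotypes to the selected variants and samples and returns per-variant allele counts (the behaviour is inferred, not the original code):

import allel

def getAC(genotypes, variant_selection, sample_selection):
    # hypothetical stand-in for the project's getAC: subset to the selected
    # variants and samples, then count alleles per variant
    gt_sub = genotypes.subset(variant_selection, sample_selection)
    return gt_sub.count_alleles()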
Example #5
def calc_fst(gt_array_fst, fst_pop_indicies, fst_type):

    # compute basic (multisite) FST via scikit allel

    # WC 84
    if fst_type == "wc":
        a, b, c = allel.weir_cockerham_fst(gt_array_fst,
                                           subpops=fst_pop_indicies)

        # compute variance component sums
        a = np.nansum(a).tolist()
        b = np.nansum(b).tolist()
        c = np.nansum(c).tolist()
        n_sites = len(gt_array_fst)

        # compute fst
        if (a + b + c) > 0:
            fst = a / (a + b + c)
        else:
            fst = "NA"

        return (fst, a, b, c, n_sites)

    # Hudson 92
    if fst_type == "hudson":

        # following scikit allel docs
        # allel counts for each population
        ac1 = gt_array_fst.count_alleles(subpop=fst_pop_indicies[0])
        ac2 = gt_array_fst.count_alleles(subpop=fst_pop_indicies[1])

        #hudson fst has two components (numerator & denominator)
        num, den = allel.hudson_fst(ac1, ac2)
        c = 0  # for compatibility with aggregation code for WC 84

        # compute variance component sums
        num = np.nansum(num).tolist()
        den = np.nansum(den).tolist()
        n_sites = len(gt_array_fst)

        # compute fst (guard against a zero denominator)
        if den > 0:
            fst = num / den
        else:
            fst = "NA"

        # same abc format as WC84, where 'a' is the numerator,
        # 'b' is the denominator, and 'c' is a zero placeholder
        return (fst, num, den, c, n_sites)
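The extra values returned here (num, den, n_sites) are meant for downstream aggregation; a minimal sketch of combining per-chunk sums into a genome-wide Hudson FST (the chunk list and toy genotypes are hypothetical, not from the original project):

import allel

# hypothetical chunks: in practice these would come from a chunked VCF reader
gt_chunks = [
    allel.GenotypeArray([[[0, 0], [0, 1], [1, 1], [1, 1]],
                         [[0, 1], [0, 0], [1, 1], [0, 1]]]),
]
fst_pop_indicies = [[0, 1], [2, 3]]

total_num = total_den = 0.0
total_sites = 0
for chunk in gt_chunks:
    fst, num, den, c, n_sites = calc_fst(chunk, fst_pop_indicies, "hudson")
    total_num += num
    total_den += den
    total_sites += n_sites

# genome-wide Hudson FST as a ratio of sums over all chunks
genome_fst = total_num / total_den if total_den > 0 else float("nan")
print(genome_fst, total_sites)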
Example #6
def test_Fst__Hudson(sample_size):
    # scikit-allel can only calculate Fst for pairs of cohorts (populations)
    n_cohorts = 2
    ts = simulate_ts(sample_size)
    ds = ts_to_dataset(ts)
    ds, subsets = add_cohorts(ds, ts, n_cohorts)
    n_variants = ds.dims["variants"]
    ds = window_by_variant(ds, size=n_variants)  # single window
    ds = Fst(ds, estimator="Hudson")
    fst = ds.stat_Fst.sel(cohorts_0="co_0", cohorts_1="co_1").values

    # scikit-allel
    ac1 = ds.cohort_allele_count.values[:, 0, :]
    ac2 = ds.cohort_allele_count.values[:, 1, :]
    num, den = hudson_fst(ac1, ac2)
    ska_fst = np.sum(num) / np.sum(den)

    np.testing.assert_allclose(fst, ska_fst)
Example #7
def compute_fst(raw):
    """
    FST (for two populations)
    https://scikit-allel.readthedocs.io/en/stable/stats/fst.html
    """
    # raw has been transposed
    nvar = raw.shape[0]
    nsam = raw.shape[1]
    raw = np.expand_dims(raw, axis=2).astype('i')

    g = allel.GenotypeArray(raw)
    subpops = [range(nsam // 2), range(nsam // 2, nsam)]

    # for each pop
    ac1 = g.count_alleles(subpop=subpops[0])
    ac2 = g.count_alleles(subpop=subpops[1])

    # compute average fst
    num, den = allel.hudson_fst(ac1, ac2)
    fst = np.sum(num) / np.sum(den)
    return fst
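A quick usage sketch of compute_fst with toy input (assumed 0/1 haploid calls, already transposed so rows are variants; not from the original project):

import numpy as np

rng = np.random.default_rng(0)
raw = rng.integers(0, 2, size=(100, 20))  # 100 variants x 20 samples
print(compute_fst(raw))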
Example #8
def calc_fst(mseqs):

    groups = list(mseqs.keys())
    len_grp = len(groups)
    FST_mat = np.zeros((len_grp, len_grp))
    allele_counts = count_allele(mseqs)
    for i, j in itertools.combinations(range(len_grp), 2):

        ac1 = allele_counts[groups[i]]
        ac2 = allele_counts[groups[j]]

        with np.errstate(divide='ignore', invalid='ignore'):
            num, den = allel.hudson_fst(ac1, ac2)

            FST_mat[i, j] = np.nanmean(num / den)  # mean of per-site ratios (alternative: np.sum(num) / np.sum(den))
            FST_mat[j, i] = np.nanstd(num / den)

        cout('%5.4f +- %5.4f : %s <> %s' %
             (FST_mat[i, j], FST_mat[j, i], groups[i], groups[j]))

    return FST_mat, groups
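Note the estimator choice: this example averages the per-site ratios (and stores their standard deviation in the lower triangle), whereas most of the other examples use the ratio of sums recommended for Hudson's FST by Bhatia et al. (2013). A minimal comparison of the two on toy allele counts (values are assumptions):

import numpy as np
import allel

ac1 = allel.AlleleCountsArray([[6, 0], [3, 3], [1, 5]])
ac2 = allel.AlleleCountsArray([[0, 6], [4, 2], [1, 5]])

num, den = allel.hudson_fst(ac1, ac2)
fst_ratio_of_sums = np.sum(num) / np.sum(den)   # ratio of averages, per Bhatia et al.
fst_mean_of_ratios = np.nanmean(num / den)      # mean of per-site ratios, as above
print(fst_ratio_of_sums, fst_mean_of_ratios)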
Example #9
def test_Fst__Hudson(sample_size):
    # scikit-allel can only calculate Fst for pairs of cohorts (populations)
    n_cohorts = 2
    ts = msprime.simulate(sample_size,
                          length=100,
                          mutation_rate=0.05,
                          random_seed=42)
    ds = ts_to_dataset(ts)  # type: ignore[no-untyped-call]
    ds, subsets = add_cohorts(ds, ts,
                              n_cohorts)  # type: ignore[no-untyped-call]
    n_variants = ds.dims["variants"]
    ds = window(ds, size=n_variants)  # single window
    ds = Fst(ds, estimator="Hudson")
    fst = ds.stat_Fst.sel(cohorts_0="co_0", cohorts_1="co_1").values

    # scikit-allel
    ac1 = ds.cohort_allele_count.values[:, 0, :]
    ac2 = ds.cohort_allele_count.values[:, 1, :]
    num, den = hudson_fst(ac1, ac2)
    ska_fst = np.sum(num) / np.sum(den)

    np.testing.assert_allclose(fst, ska_fst)
Example #10
def fst(p1, pos, gt, quants):
    """Calculate Hudson's FST.

    Hudson’s FST estimator as the ratio of averages computed following
    Bhatia et al. (2013).

    Parameters
    ----------
    p1 : population specification
        Passed to get_ac_seg (together with pos and gt) to obtain allele
        counts for the two populations being compared.
    pos : array_like
        Variant positions.
    gt : genotype array
        Genotype calls.
    quants : array_like
        Quantiles of the per-site FST distribution to return; if the first
        value is negative, the mean per-site FST is returned instead.

    Returns
    -------
    fst_ : list or ndarray
        Mean per-site FST or the requested quantiles.

    """
    ac1, ac2, pos_s = get_ac_seg(p1, pos, gt)
    # segregating in both pops
    loc_asc = ac1.is_segregating() & ac2.is_segregating()
    ac1_seg = ac1.compress(loc_asc, axis=0)
    ac2_seg = ac2.compress(loc_asc, axis=0)
    num, den = allel.hudson_fst(ac1_seg, ac2_seg)
    fst_snp = num / den
    if quants[0] < 0:
        fst_ = [np.nanmean(fst_snp)]
    else:
        fst_ = np.nanquantile(fst_snp, quants)
    return fst_
Example #11
def calc_site_fst(mseqs, nan_to_zero=False):

    groups = list(mseqs.keys())
    len_grp = len(groups)
    FST_sites = []
    allele_counts = count_allele(mseqs)
    for i, j in itertools.combinations(range(len_grp), 2):

        ac1 = allele_counts[groups[i]]
        ac2 = allele_counts[groups[j]]

        with np.errstate(divide='ignore', invalid='ignore'):
            num, den = allel.hudson_fst(ac1, ac2)

            if nan_to_zero:
                # convert nan to zero
                fst = np.nan_to_num(num / den)
            else:
                fst = num / den
            FST_sites.append(('%s <> %s' % (groups[i], groups[j]), fst))

    return FST_sites
Example #12
    def select(self, haplotypes, groups, haplotest, k=None):

        # we use k for redundancy parameters
        if k == 0 or k is None:
            k = 1

        candidate_L = []  # [ (pos, rank, no_actual_pops)]
        # we traverse through the tree
        for (level, pop1, pop2) in traverse(self.guide_tree):

            n_pops = len(pop1) + len(pop2)
            haplotypes1 = haplotypes[np.isin(groups, pop1)]
            haplotypes2 = haplotypes[np.isin(groups, pop2)]

            if len(haplotypes1) < 4:
                cerr('[I - insufficient population size for %s]' % pop1)
            if len(haplotypes2) < 4:
                cerr('[I - insufficient population size for %s]' % pop2)

            # convert haplotypes to allele counts
            ac1 = count_allele(haplotypes1)
            ac2 = count_allele(haplotypes2)

            # calculate highest FST
            FST = []
            num, den = allel.hudson_fst(ac1, ac2)

            # NOTE: the line below avoids warning (invalid value in true_divide)
            # when den == 0, which should be perfectly ok for FST calculation
            den[den == 0] = -1
            fst = num / den

            # check for FST == 1.0
            ultimate_fst_pos = np.nonzero(fst >= self.ultimate_fst)[0]
            if len(ultimate_fst_pos) > 0:
                self.log(
                    'FST: %3.2f at %s for pop %s <> %s' %
                    (self.ultimate_fst, str(ultimate_fst_pos), pop1, pop2))

            if len(ultimate_fst_pos) > k:
                if self.priority is not None:
                    # get ultimate_fst based on priority

                    ultimate_priority = self.priority[ultimate_fst_pos]
                    sortidx = ultimate_fst_pos[np.argsort(ultimate_priority)]
                else:
                    np.random.shuffle(ultimate_fst_pos)
                    sortidx = ultimate_fst_pos

                #import IPython; IPython.embed()

            else:
                #fst[ np.isnan(fst) ] = 0
                sortidx = np.argsort(fst)

            # get highest FST
            #highest_fst_pos = sortidx[-(k+1):-1]
            #highest_fst_pos = list(reversed(sortidx))[:k]
            highest_fst_pos = sortidx[-k:]
            highest_fst_val = fst[highest_fst_pos]
            #self.log('highest FST: %5.4f at %d for pops %s <> %s' % (highest_fst_val, highest_fst_pos, pop1, pop2))
            if len(ultimate_fst_pos
                   ) > 0 and highest_fst_pos not in ultimate_fst_pos:
                pass
                #import IPython; IPython.embed()

            # check suitability of SNPs
            snplist, F = None, -1
            if highest_fst_val.max() < self.min_fst:

                if self.max_leaf_snp > k:

                    X_train = np.append(haplotypes1, haplotypes2, axis=0)
                    y_train = np.array([1] * len(haplotypes1) +
                                       [2] * len(haplotypes2))

                    best_iteration = (-1, None)
                    for i in range(k, self.max_leaf_snp):
                        features = sortidx[-(i + 1):-1]

                        model = FixSNPSelectorLK('dummy', snpindex=features)
                        lk_predictions, snplist, _, params = model.fit_and_predict(
                            X_train, y_train, X_train, len(features))
                        scores = calculate_scores(y_train, lk_predictions)

                        F = scores.loc[scores['REG'] == 'MIN', 'MCC'].values[0]
                        if best_iteration[0] < F:
                            best_iteration = (F, snplist)

                    snplist, F = best_iteration[1], best_iteration[0]

                snplist_2, F_2 = self.select_2(haplotypes1, haplotypes2)
                if F_2 > F:
                    snplist, F = snplist_2, F_2

                if snplist is not None:
                    self.log('F: %5.4f SNP: %d for pop %s <> %s => %s' %
                             (F, len(snplist), pop1, pop2, snplist))

                    for p in snplist:
                        candidate_L.append((p, level, n_pops))
                    continue

                # TODO: 2nd approach: find 2 SNPs with highest r^2(st) eg r^2 subpopulation vs r^2 total population

                # if snplist is None, just provide warning notice and skip this node!
                else:
                    self.log('low FST = %5.4f < %5.4f for %s vs %s; skipping' %
                             (highest_fst_val.max(), self.min_fst, pop1, pop2))
                    continue

            # append to candidate_L
            for p in highest_fst_pos:
                candidate_L.append((p, level, n_pops))
            self.log('FST: %s SNP: %d for pop %s <> %s => %s' %
                     (str(highest_fst_val), len(highest_fst_pos), pop1, pop2,
                      str(highest_fst_pos)))

        # process candidate_L
        L = np.unique(np.array(sorted([x[0] for x in candidate_L])))

        # return snp position
        return (L, None, {})
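As an aside, a tiny sketch (toy values, not from the original project) of what the den[den == 0] = -1 guard above does: it avoids the true_divide warning and flips the sign of FST at zero-denominator sites, so those sites do not compete for the top of the argsort ranking.

import numpy as np

num = np.array([0.2, 0.0, 0.5])
den = np.array([0.4, 0.0, 0.5])

den[den == 0] = -1      # no divide-by-zero warning
fst = num / den         # the zero-denominator site gets -0.0 here
print(np.argsort(fst))  # [1 0 2]: that site sorts first, never among the highest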
Example #13
gt_seg = gt_homo[flt]

print(np.count_nonzero(flt), 'positions are homozygous and segregating')

# get all variant data for segregating positions (flt=True)
variants_pass = variants_pass[flt]
variants_pass



###   calc hudson Fst
print('calculating Hudson Fst for each position')
ac1 = gt_seg.count_alleles(subpop=subpop1)
ac2 = gt_seg.count_alleles(subpop=subpop2)

num, den = allel.hudson_fst(ac1, ac2)
num_fix = np.nan_to_num(num)
den_fix = np.nan_to_num(den)

fst = num_fix / den_fix



###    append Fst to variants

# Fst array to dataframe
fst_df = pd.DataFrame(fst, columns=['Fst_hudson'])

# set index of Fst to index of variants_pass
fst_df.index = variants_pass.index
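The fragment stops after aligning the index; a plausible continuation (hypothetical, assuming variants_pass is a pandas DataFrame) is to join the per-site Fst onto the variant table:

import pandas as pd

# hypothetical continuation: attach the Fst_hudson column to the variant table
variants_fst = pd.concat([variants_pass, fst_df], axis=1)
print(variants_fst.head())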
Example #14
File: lkest.py  Project: trmznt/pys
    def select(self, haplotypes, groups, haplotest, k=None):

        # we use k for redundancy parameters
        if k == 0 or k is None:
            k = 1

        candidate_L = []     # [ (pos, rank, no_actual_pops)]
        # we traverse through the tree
        for (level, pop1, pop2) in traverse(self.guide_tree):

            n_pops = len(pop1) + len(pop2)
            haplotypes1 = haplotypes[ np.isin(groups, pop1) ]
            haplotypes2 = haplotypes[ np.isin(groups, pop2) ]

            if len(haplotypes1) < 4:
                cerr('[I - insufficient population size for %s -> %d]' %
                    (pop1, len(haplotypes1)) )
            if len(haplotypes2) < 4:
                cerr('[I - insufficient population size for %s -> %d]' %
                    (pop2, len(haplotypes2)) )

            # convert haplotypes to allele counts
            ac1 = count_allele(haplotypes1)
            ac2 = count_allele(haplotypes2)

            # calculate highest FST
            FST = []
            num, den = allel.hudson_fst(ac1, ac2)

            # NOTE: the line below may produce a warning (invalid value in true_divide)
            # when den == 0; that is fine here because the resulting NaNs are zeroed out below
            fst = num/den

            fst[ np.isnan(fst) ] = 0
            sortidx = np.argsort( fst )

            # get highest FST
            highest_fst_pos = sortidx[-(k+1):-1]
            highest_fst_val = fst[ highest_fst_pos ]
            #cerr('[I - highest FST: %5.4f at %d for pops %s and %s' % (highest_fst_val, highest_fst_pos, pop1, pop2))

            # check suitability of SNPs
            if highest_fst_val.max() < self.min_fst:

                snplist, F = self.select_2(haplotypes1, haplotypes2)
                if snplist:
                    self.log('F: %5.4f SNP: %d for pop %s <> %s' % (F, len(snplist), pop1, pop2))

                    for p in snplist:
                        candidate_L.append( (p, level, n_pops) )
                    continue

                # 2nd approach: find 2 SNPs with highest r^2(st) eg r^2 subpopulation vs r^2 total population
                else:
                    self.log('low FST = %5.4f for %s vs %s' % ( highest_fst_val.max(), pop1, pop2))

            # append to candidate_L
            for p in highest_fst_pos:
                candidate_L.append( (p, level, n_pops) )

        # process candidate_L
        L = np.unique( np.array( sorted( [ x[0] for x in candidate_L] ) ) )

        # return snp position
        return (L, None, {})
Example #15
def traditional_stats(data):
    """
    Calculates many (mostly) traditional statistics
    that summarise the site frequency spectrum.

    Arguments
    ---------
    data: Named tuple of results (made by collate_results function)

    Returns
    ---------
    Nested dictionary of statistics
    """
    pop_names = ["domestic", "wild", "captive", "all_pops"]

    stats = {
        "sfs_mean": {},
        "diversity": {},
        "wattersons_theta": {},
        "tajimas_d": {},
        "observed_heterozygosity": {},
        "expected_heterozygosity": {},
        "segregating_sites": {},
        "monomorphic_sites": {},
        "roh_mean": {},
        "roh_iqr": {},
        "r2": {},
        "f3": {},
        "divergence": {},
        "fst": {},
        "f2": {},
    }

    for pop in pop_names:
        # One way statistics
        stats["sfs_mean"][pop] = binned_sfs_mean(data.allele_counts[pop])
        stats["diversity"][pop] = allel.sequence_diversity(
            data.positions, data.allele_counts[pop])
        stats["wattersons_theta"][pop] = allel.watterson_theta(
            data.positions, data.allele_counts[pop])
        stats["tajimas_d"][pop] = allel.tajima_d(data.allele_counts[pop],
                                                 data.positions)
        stats["observed_heterozygosity"][pop] = allel.heterozygosity_observed(
            data.genotypes[pop]).mean()
        stats["expected_heterozygosity"][pop] = allel.heterozygosity_expected(
            data.allele_counts[pop].to_frequencies(), ploidy=2).mean()
        stats["segregating_sites"] = data.allele_counts[pop].count_segregating(
        )

        if pop != "all_pops":  # all_pops has no monomorphic sites
            stats["monomorphic_sites"][pop] = data.allele_counts[
                pop].count_non_segregating()

            # Three way statistics
            other_pops = [
                pop_name for pop_name in pop_names
                if pop_name not in ["all_pops", pop]
            ]
            t, b = allel.patterson_f3(data.allele_counts[pop],
                                      data.allele_counts[other_pops[0]],
                                      data.allele_counts[other_pops[1]])
            stats["f3"][pop] = np.sum(t) / np.sum(b)

    # Two way statistics
    for comparison in ["domestic_wild", "domestic_captive", "wild_captive"]:
        p = comparison.split("_")
        stats["divergence"][comparison] = allel.sequence_divergence(
            data.positions, data.allele_counts[p[0]], data.allele_counts[p[1]])

        num, den = allel.hudson_fst(data.allele_counts[p[0]],
                                    data.allele_counts[p[1]])
        stats["fst"][comparison] = np.sum(num) / np.sum(den)
        stats["f2"][comparison] = allel.patterson_f2(
            data.allele_counts[p[0]], data.allele_counts[p[1]]).mean()

    return stats