Example #1
    def test_mean_pairwise_diversity(self):

        # start with simplest case, two haplotypes, one pairwise comparison
        h = HaplotypeArray([[0, 0],
                            [1, 1],
                            [0, 1],
                            [1, 2],
                            [0, -1],
                            [-1, -1]])
        ac = h.count_alleles()
        expect = [0, 0, 1, 1, -1, -1]
        actual = allel.mean_pairwise_difference(ac, fill=-1)
        aeq(expect, actual)

        # four haplotypes, six pairwise comparisons
        h = HaplotypeArray([[0, 0, 0, 0],
                            [0, 0, 0, 1],
                            [0, 0, 1, 1],
                            [0, 1, 1, 1],
                            [1, 1, 1, 1],
                            [0, 0, 1, 2],
                            [0, 1, 1, 2],
                            [0, 1, -1, -1],
                            [-1, -1, -1, -1]])
        ac = h.count_alleles()
        expect = [0, 3/6, 4/6, 3/6, 0, 5/6, 5/6, 1, -1]
        actual = allel.mean_pairwise_difference(ac, fill=-1)
        assert_array_almost_equal(expect, actual)
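
For reference, the expected values above follow from the definition of the statistic: per variant, mean_pairwise_difference is the fraction of haplotype pairs carrying different alleles, with variants that have fewer than two called alleles set to the fill value. A minimal NumPy sketch (a hypothetical re-implementation for illustration, not scikit-allel's code), assuming a 2-D variants-by-alleles counts array like the one count_alleles() returns:

import numpy as np

def mpd_from_allele_counts(ac, fill=-1):
    # hypothetical re-implementation, for illustration only
    ac = np.asarray(ac, dtype=float)
    an = ac.sum(axis=1)                        # called alleles per variant
    n_pairs = an * (an - 1) / 2                # total haplotype pairs
    n_same = (ac * (ac - 1) / 2).sum(axis=1)   # pairs sharing an allele
    with np.errstate(invalid='ignore', divide='ignore'):
        mpd = (n_pairs - n_same) / n_pairs
    mpd[n_pairs == 0] = fill                   # fewer than two alleles -> fill
    return mpd

Applied to the second HaplotypeArray above, this reproduces [0, 3/6, 4/6, 3/6, 0, 5/6, 5/6, 1, -1].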
Example #2
def test_diversity__windowed(sample_size):
    ts = simulate_ts(sample_size, length=200)
    ds = ts_to_dataset(ts)  # type: ignore[no-untyped-call]
    ds, subsets = add_cohorts(
        ds, ts, cohort_key_names=["cohorts"])  # type: ignore[no-untyped-call]
    ds = window(ds, size=25)
    ds = diversity(ds)
    div = ds["stat_diversity"].sel(cohorts="co_0").compute()

    # Calculate diversity using tskit windows
    # Find the variant positions so we can have windows with a fixed number of variants
    positions = ts.tables.sites.position
    windows = np.concatenate(([0], positions[::25][1:], [ts.sequence_length]))
    ts_div = ts.diversity(windows=windows, span_normalise=False)
    np.testing.assert_allclose(div, ts_div)

    # Calculate diversity using scikit-allel moving_statistic
    # (Don't use windowed_diversity, since it treats the last window differently)
    ds = count_variant_alleles(
        ts_to_dataset(ts))  # type: ignore[no-untyped-call]
    ac = ds["variant_allele_count"].values
    mpd = allel.mean_pairwise_difference(ac, fill=0)
    ska_div = allel.moving_statistic(mpd, np.sum, size=25)
    np.testing.assert_allclose(
        div[:-1], ska_div)  # scikit-allel has final window missing
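
The parenthetical about the last window is the key subtlety here: allel.moving_statistic emits only complete windows and silently drops trailing values, which is why the final sgkit window is excluded from the comparison. A tiny sketch of that behaviour:

import numpy as np
import allel

values = np.arange(10.0)
# floor(10 / 4) = 2 complete windows; the last two values are dropped
print(allel.moving_statistic(values, np.sum, size=4))  # [ 6. 22.]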
Example #3
    def print_pi(self, tree_sequence, indices, populations):
        if not self.pi_needed():
            return

        writer = self.writers['pi']
        # use a fixed population order so the output columns stay
        # consistent instead of relying on dictionary key order

        pops = 'AF EU AS'.split()
        indices = np.array(indices)

        writer.write('\t'.join(pops) + '\t')
        writer.write('AF-EU\tAF-AS\tEU-AS\n')

        length = tree_sequence.get_sequence_length()
        haplotypes = tree_sequence.genotype_matrix()
        # per-population nucleotide diversity: summed per-site mean
        # pairwise differences, normalised by sequence length
        for pop in pops:
            mpd = allel.mean_pairwise_difference(
                allel.HaplotypeArray(
                    haplotypes[:,
                               indices == populations[pop]]).count_alleles())
            writer.write(f'{mpd.sum()/length:.5}\t')

        # pairwise Hudson Fst as a ratio of sums across variants
        for pairs in (('AF', 'EU'), ('AF', 'AS'), ('EU', 'AS')):
            count1 = allel.HaplotypeArray(
                haplotypes[:,
                           indices == populations[pairs[0]]]).count_alleles()
            count2 = allel.HaplotypeArray(
                haplotypes[:,
                           indices == populations[pairs[1]]]).count_alleles()
            num, den = allel.hudson_fst(count1, count2)
            writer.write(f'{num.sum() / den.sum():.5}\t')
        writer.write('\n')
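
The Fst loop writes Hudson's estimator as a ratio of sums: allel.hudson_fst returns per-variant numerator and denominator components, which are summed across variants before dividing. A self-contained sketch with invented allele counts:

import numpy as np
import allel

ac1 = allel.AlleleCountsArray([[6, 0], [3, 3], [1, 5]])  # pop 1, variants x alleles
ac2 = allel.AlleleCountsArray([[0, 6], [4, 2], [1, 5]])  # pop 2
num, den = allel.hudson_fst(ac1, ac2)
fst = np.sum(num) / np.sum(den)  # ratio of sums, as in print_pi above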
Example #4
def removeMissingStats(region, region_length):
    """ Calculates haplotype frequencies, haplotypic diversity, and nucleotide
    diversity for a given region of a vcf. Subjects with missing data for
    1+ variant will be removed before calculation.

    Args:
      region: a HaplotypeArray covering coordinates of interest.
      region_length: length of the region in bp; denominator for
        nucleotide diversity.

    Returns:
      List containing:
        - Number of haplotypes with no missing variant data in this region;
          only these haplotypes' data is used in further calculations.
        - Haplotypic diversity for region. Returns 0 if only 1 subject present.
        - Nucleotide diversity.
        - List of haplotype frequencies.
    """
    # remove subjects with missing data anywhere in the region
    keep_subject = np.ones(region.shape[1], dtype=bool)
    for i in range(region.shape[1]):
        if -1 in region[:, i]:
            keep_subject[i] = False
    region_complete = region.compress(condition=keep_subject, axis=1)
    # calculate haplotype frequencies
    freqs = region_complete.distinct_frequencies()
    nh = region_complete.n_haplotypes
    if nh == 1:
        hap_div = 0
    else:
        hap_div = allel.haplotype_diversity(region_complete)
    # calculate nucleotide diversity specifically on nonmissing region
    ac = region_complete.count_alleles()
    diffs = allel.mean_pairwise_difference(ac, fill=0)
    pi = np.sum(diffs) / region_length
    return [nh, hap_div, pi, freqs]
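
The column loop above can be replaced by a single vectorized mask; a sketch that should behave identically for a HaplotypeArray region:

import numpy as np

# keep only haplotype columns with no missing (-1) calls
keep_subject = ~np.any(region == -1, axis=0)
region_complete = region.compress(condition=keep_subject, axis=1)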
Example #5
def main(args):

    ## Step 0: get null model for SNP calling
    null_loc = os.path.dirname(
        __file__) + '/helper_files/combined_null1000000.txt'
    null_model = generate_snp_model(null_loc)
    P2C = {'A': 0, 'C': 1, 'T': 2, 'G': 3}
    C2P = {0: 'A', 1: 'C', 2: 'T', 3: 'G'}

    ## Step 1: build new counts table from all objects
    s_final = SNPprofile()
    s_final.filename = args.output
    i = 0
    counts_per_block = {}
    s1 = SNPprofile()
    print("loading " + args.input[0])
    s1.load(args.input[0])

    s_final.scaffold_list = s1.scaffold_list
    s_final.counts_table = copy.deepcopy(s1.counts_table)

    s2 = SNPprofile()
    print("loading " + args.input[1])
    s2.load(args.input[1])

    for scaf in s2.scaffold_list:
        if scaf not in s_final.scaffold_list:
            sys.exit(
                "Error: scaffold " + scaf + " in " + args.input[1] +
                " not found in initial file. Your inStrain objects were probably not run on the same FASTA."
            )

    scaf_counter = 0
    for scaf in s2.counts_table:
        s_final.counts_table[scaf_counter] += scaf
        scaf_counter += 1
    i += 1

    # Step 2: call all SNPs for new object
    allele_counts_total = {}
    allele_counts1 = {}
    allele_counts2 = {}
    snp_table = defaultdict(list)
    scaf_counter = 0

    for scaf in tqdm(s_final.counts_table, desc='Calling new SNVs...'):
        pos_counter = 0
        for counts in scaf:
            snp = call_snv_site(counts,
                                min_cov=5,
                                min_freq=0.05,
                                model=null_model)

            if snp:  # means that there was coverage at this position
                if snp != -1:  # means this is a SNP
                    # calculate varBase
                    snp, varbase = major_minor_allele(counts)

                    snp_table['scaffold'].append(
                        s_final.scaffold_list[scaf_counter])
                    snp_table['position'].append(pos_counter)
                    snp_table['varBase'].append(snp)
                    snp_table['conBase'].append(varbase)
                    allele_counts_total[s_final.scaffold_list[scaf_counter] +
                                        ":" + str(pos_counter)] = (
                                            s_final.counts_table[scaf_counter]
                                            [pos_counter])
                    allele_counts1[s_final.scaffold_list[scaf_counter] + ":" +
                                   str(pos_counter)] = (
                                       s1.counts_table[scaf_counter]
                                       [pos_counter])
                    allele_counts2[s_final.scaffold_list[scaf_counter] + ":" +
                                   str(pos_counter)] = (
                                       s2.counts_table[scaf_counter]
                                       [pos_counter])
            pos_counter += 1  # 0 based positions!!
        scaf_counter += 1

    # Step 3: Save new FST_SNP table to disk.
    SNPTable = pd.DataFrame(snp_table)

    FstTable = defaultdict(list)
    for gene in tqdm(create_gene_index(args.gene_file),
                     desc="calculating fst"):
        snps = SNPTable[(SNPTable.scaffold == gene['scaf'])
                        & (SNPTable.position >= gene['start']) &
                        (SNPTable.position <= gene['end'])]
        snp_list = []
        for index, row in snps.iterrows():
            snp_list.append(row['scaffold'] + ":" + str(row['position']))

        # only continue if there are at least 3 snps in this gene
        if len(snp_list) >= 3:
            allele_counts_1 = []
            allele_counts_2 = []
            for snp in snp_list:
                allele_counts_1.append(allele_counts1[snp])
                allele_counts_2.append(allele_counts2[snp])

            allel1 = allel.AlleleCountsArray(allele_counts_1)
            allel2 = allel.AlleleCountsArray(allele_counts_2)
            # one window spanning all SNPs in the gene yields a single Fst
            fst_h = allel.moving_hudson_fst(allel1, allel2,
                                            size=len(snp_list))[0]
            # nucleotide diversity per population, normalised by gene length
            nd_1 = np.sum(allel.mean_pairwise_difference(allel1)) / (
                1 + gene['end'] - gene['start'])
            nd_2 = np.sum(allel.mean_pairwise_difference(allel2)) / (
                1 + gene['end'] - gene['start'])

            FstTable['gene'].append(gene['name'])
            FstTable['snp_num'].append(len(snp_list))
            FstTable['fst'].append(fst_h)
            FstTable['pi_1'].append(nd_1)
            FstTable['pi_2'].append(nd_2)
            FstTable['cov_1'].append(np.mean(np.sum(allele_counts_1, axis=1)))
            FstTable['cov_2'].append(np.mean(np.sum(allele_counts_2, axis=1)))

    FstTable = pd.DataFrame(FstTable)
    print(np.mean(FstTable['fst']))
    FstTable.to_csv(args.output + '.Fst.tsv', index=False, sep='\t')
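
The per-gene Fst call packs all of a gene's SNPs into one window, so moving_hudson_fst returns exactly one value. A toy, self-contained version of the pattern (the A/C/T/G tallies are invented, not real data):

import allel

allele_counts_1 = [[10, 2, 0, 0], [6, 6, 0, 0], [9, 0, 3, 0]]  # pop 1
allele_counts_2 = [[4, 8, 0, 0], [2, 10, 0, 0], [5, 0, 7, 0]]  # pop 2
a1 = allel.AlleleCountsArray(allele_counts_1)
a2 = allel.AlleleCountsArray(allele_counts_2)
# a single window spanning every SNP yields one per-gene Fst
fst = allel.moving_hudson_fst(a1, a2, size=a1.shape[0])[0]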
Example #6
    def print_pi(self, tree_sequence, indices, populations):
        if not self.pi_needed():
            return

        writer = self.writers['pi']
        # use a fixed population order so the output columns stay
        # consistent instead of relying on dictionary key order

        pops = 'AF EU AS'.split()
        indices = np.array(indices)

        writer.write('\t'.join(pops) + '\t')
        writer.write('AF-EU\tAF-AS\tEU-AS\n')

        length = tree_sequence.get_sequence_length()
        haplotypes = tree_sequence.genotype_matrix()

        ga_comb = allel.HaplotypeArray(
            haplotypes[:, indices == populations['AF']]).to_genotypes(
                ploidy=2).concatenate([
                    allel.HaplotypeArray(
                        haplotypes[:,
                                   indices == populations['EU']]).to_genotypes(
                                       ploidy=2),
                    allel.HaplotypeArray(
                        haplotypes[:,
                                   indices == populations['AS']]).to_genotypes(
                                       ploidy=2)
                ], 1)

        keep_alleles = ga_comb.count_alleles().is_biallelic_01(
            min_mac=int(0.05 * (ga_comb.n_samples)))


        # Calculate pi
        for pop in pops:
            ## Create genotype array from tree_sequence haplotype data for
            ## population and ploidy=2
            counts = allel.HaplotypeArray(
                haplotypes[:, indices == populations[pop]]).to_genotypes(
                    ploidy=2).count_alleles()

            ## keep variants with 5% < maf < 95%; this assumes every row has
            ## the same total allele count as the first variant
            maf = counts.values[:, 1] / sum(counts.values[0, :])
            counts = counts[np.logical_and(maf > 0.05, maf < 0.95)]

            ## Calculate mean_pairwise_difference for genotype array including
            ## variants with maf > 5%
            mpd = allel.mean_pairwise_difference(counts)

            writer.write(f'{mpd.sum()/counts.shape[0]:.5}\t')

        #Calculate Fst
        for pairs in (('AF', 'EU'), ('AF', 'AS'), ('EU', 'AS')):
            # haplotype counts halved -> diploid sample counts per population
            num1 = sum(indices == populations[pairs[0]]) // 2
            num2 = sum(indices == populations[pairs[1]]) // 2
            ## Subpop sample indices: first num1 samples, then the next num2
            subpops = [list(range(0, num1)), list(range(num1, num1 + num2))]
            ga = allel.HaplotypeArray(
                haplotypes[:,
                           np.logical_or(indices ==
                                         populations[pairs[0]], indices ==
                                         populations[pairs[1]])]).to_genotypes(
                                             ploidy=2)
            counts = ga.count_alleles()
            # same per-site maf estimate as above (row-0 total assumed)
            maf = counts.values[:, 1] / sum(counts.values[0, :])

            ## Calculate mean Fst based on combined genotype data
            a, b, c = allel.weir_cockerham_fst(
                ga[np.logical_and(maf > 0.05, maf < 0.95)], subpops)
            fst = np.mean(
                np.sum(a, axis=1) /
                (np.sum(a, axis=1) + np.sum(b, axis=1) + np.sum(c, axis=1)))

            writer.write(f'{fst:.5}\t')
        writer.write('\n')
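
The final block combines the three variance components returned by allel.weir_cockerham_fst: summing each over alleles gives per-variant Fst = a / (a + b + c), which is then averaged across variants. A minimal sketch with invented genotypes:

import numpy as np
import allel

# 3 variants x 4 diploid samples; samples 0-1 vs 2-3 as subpopulations
g = allel.GenotypeArray([[[0, 0], [0, 1], [1, 1], [1, 1]],
                         [[0, 1], [0, 1], [0, 0], [0, 1]],
                         [[0, 0], [0, 0], [1, 1], [0, 1]]])
subpops = [[0, 1], [2, 3]]
a, b, c = allel.weir_cockerham_fst(g, subpops)
# per-variant Fst, then averaged, mirroring print_pi above
fst = np.mean(np.sum(a, axis=1) /
              (np.sum(a, axis=1) + np.sum(b, axis=1) + np.sum(c, axis=1)))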
Example #7
def RecombinationRepper(pooled_args):
    # pooled_args provides (r_rate, model_function, reps, samples)
    r_rate, model_function, reps, samples = pooled_args

    mean_Fst_dists = []
    var_Fst_dists = []
    mean_SE_dists = []
    mean_SE_dists_shuf = []
    tree_counts = []
    mean_Dxy_dists = []
    mean_Tajima_dists = []
    mean_diversity_dists = []
    mean_H12_dists = []

    for t in range(reps):
        print(t)

        new_tree = migration_simulation_2patch(r_rate)
        # count the marginal trees produced by recombination
        count = 0
        for r in new_tree.trees():
            count += 1
        tree_counts.append(count)

        new_tree_dist_Fst = []
        new_tree_dist_SE = []
        new_tree_dist_SE_shuf = []
        new_tree_dist_Dxy = []
        new_tree_dist_diversity = []
        new_tree_dist_Tajima = []
        new_tree_dist_H12 = []

        for i in range(samples):  ## draw `samples` independent mutation sets
            # Add mutations to the tree
            muts = 0
            while muts == 0:
                mutated_tree = msprime.mutate(new_tree, 1.25e-7)
                muts = len([v for v in mutated_tree.variants()])
            # Get the genotype matrix, ready for using sci-kit.allel
            msprime_genotype_matrix = mutated_tree.genotype_matrix()
            # Convert msprime's haplotype matrix into genotypes by randomly merging chromosomes
            haplotype_array = allel.HaplotypeArray(msprime_genotype_matrix)

            genotype_array = haplotype_array.to_genotypes(ploidy=2)

            shuffled_genotypes = shuffle(genotype_array, random_state=0)

            ac1 = haplotype_array.count_alleles(
                subpop=[s for s in range(0, 100)])
            ac2 = haplotype_array.count_alleles(
                subpop=[s for s in range(100, 200)])

            ## Calculate Tajima's D
            Tajimas_D = allel.tajima_d(ac1)

            ## Calculate Dxy, normalised by the simulated sequence
            ## length (assumed here to be 10 kb)
            dxy = sum(allel.mean_pairwise_difference_between(ac1,
                                                             ac2)) / 10000.

            ## Calculate Garud's H statistics for the population,
            ## using the haplotypes at the first 400 SNPs from deme 1
            hapslice = haplotype_array[:400, 0:100]
            H_vector = allel.garud_h(hapslice)

            ## Calculate diversity (pi), normalised the same way
            pi = sum(allel.mean_pairwise_difference(ac1)) / 10000.

            subpopulations = [[p for p in range(0, 50)],
                              [z for z in range(50, 100)]]
            mean_fst = allel.average_weir_cockerham_fst(genotype_array,
                                                        blen=100,
                                                        subpops=subpopulations)
            mean_fst_shuf = allel.average_weir_cockerham_fst(
                shuffled_genotypes, blen=100, subpops=subpopulations)

            new_tree_dist_Fst.append(mean_fst[0])
            new_tree_dist_SE.append(mean_fst[1])
            new_tree_dist_SE_shuf.append(mean_fst_shuf[1])
            new_tree_dist_Tajima.append(Tajimas_D)
            new_tree_dist_Dxy.append(dxy)
            new_tree_dist_H12.append(H_vector[1])
            new_tree_dist_diversity.append(pi)

        mean_Fst_dists.append(np.mean(new_tree_dist_Fst))
        # note: despite the name, this stores a standard deviation
        var_Fst_dists.append(np.sqrt(np.var(new_tree_dist_Fst)))
        mean_SE_dists.append(np.mean(new_tree_dist_SE))
        mean_SE_dists_shuf.append(np.mean(new_tree_dist_SE_shuf))
        mean_Dxy_dists.append(np.mean(new_tree_dist_Dxy))
        mean_Tajima_dists.append(np.mean(new_tree_dist_Tajima))
        mean_H12_dists.append(np.mean(new_tree_dist_H12))
        mean_diversity_dists.append(np.mean(new_tree_dist_diversity))

    return [
        r_rate, mean_Fst_dists, mean_SE_dists, mean_SE_dists_shuf,
        var_Fst_dists, tree_counts, mean_Dxy_dists, mean_Tajima_dists,
        mean_diversity_dists, mean_H12_dists
    ]
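
Since RecombinationRepper takes all of its parameters as one tuple, it is presumably meant to be mapped over a process pool. A hypothetical driver along those lines (the rates, rep counts, and worker count are placeholders, not values from the original script):

from multiprocessing import Pool

arg_sets = [(r_rate, migration_simulation_2patch, 10, 100)
            for r_rate in (1e-8, 1e-7, 1e-6)]
with Pool(processes=3) as pool:
    results = pool.map(RecombinationRepper, arg_sets)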