Example 1
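# A test-class method (note the `self` argument). Assumed context:
#     import numpy as np
#     import allel
# and `aeq` is an array-equality assertion such as
# numpy.testing.assert_array_equal.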
    def test_per_base(self):
        pos = [1, 12, 15, 27]

        # boolean array, all true
        b = [True, True, True, True]
        # N.B., final bin includes right edge
        expected_nnz = [1, 2, 1]
        expected_windows = [[1, 10], [11, 20], [21, 27]]
        expected_counts = [1, 2, 1]
        expected_densities = [1 / 10, 2 / 10, 1 / 7]
        expected_n_bases = [10, 10, 7]
        nnz, windows, counts = allel.windowed_statistic(
            pos, b, statistic=np.count_nonzero, size=10, start=1)
        densities, n_bases = allel.per_base(nnz, windows)
        aeq(expected_nnz, nnz)
        aeq(expected_windows, windows)
        aeq(expected_counts, counts)
        aeq(expected_densities, densities)
        aeq(expected_n_bases, n_bases)

        # boolean array, not all true
        b = [False, True, False, True]
        expected_densities = [0 / 10, 1 / 10, 1 / 7]
        expected_n_bases = [10, 10, 7]
        nnz, windows, counts = allel.windowed_statistic(
            pos, b, statistic=np.count_nonzero, size=10, start=1)
        densities, n_bases = allel.per_base(nnz, windows)
        aeq(expected_densities, densities)
        aeq(expected_n_bases, n_bases)

        # 2D, 4 variants, 2 samples
        b = [[True, False], [True, True], [True, False], [True, True]]
        expected_densities = [[1 / 10, 0 / 10], [2 / 10, 1 / 10],
                              [1 / 7, 1 / 7]]
        expected_n_bases = [10, 10, 7]
        nnz, windows, counts = allel.windowed_statistic(
            pos, b, statistic=lambda x: np.sum(x, axis=0), size=10, start=1)
        densities, n_bases = allel.per_base(nnz, windows)
        aeq(expected_densities, densities)
        aeq(expected_n_bases, n_bases)

        # include is_accessible array option
        is_accessible = np.array(
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
             1, 1, 1, 1, 0, 0, 1, 1, 0, 0,
             1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
            dtype=bool)
        b = [False, True, False, True]
        expected_densities = [-1, 1 / 6, 1 / 7]
        expected_n_bases = [0, 6, 7]
        nnz, windows, counts = allel.windowed_statistic(
            pos, b, statistic=np.count_nonzero, size=10, start=1)
        densities, n_bases = allel.per_base(nnz,
                                            windows,
                                            is_accessible=is_accessible,
                                            fill=-1)
        aeq(expected_densities, densities)
        aeq(expected_n_bases, n_bases)
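A minimal standalone sketch of the same pattern, reusing the first case from the test above (the printed values match the expected results):

import numpy as np
import allel

pos = [1, 12, 15, 27]            # variant positions (1-based)
b = [True, True, True, True]     # one boolean per variant
# count True values in 10 bp windows starting at position 1;
# the final window is truncated at the last position
nnz, windows, counts = allel.windowed_statistic(
    pos, b, statistic=np.count_nonzero, size=10, start=1)
# convert per-window counts into per-base densities
densities, n_bases = allel.per_base(nnz, windows)
print(windows.tolist())    # [[1, 10], [11, 20], [21, 27]]
print(densities.tolist())  # [0.1, 0.2, 0.14285714285714285]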
Example 2
import numpy as np
import pandas as pd
import allel


def mutrate(ps, polymorphic_loci_outgroup, chrom):
    '''
    Compute a normalised per-window mutation rate for one chromosome.

    Input:
        - ps: array of variant positions
        - polymorphic_loci_outgroup: boolean array marking sites that are
          polymorphic in the outgroup
        - chrom: chromosome name
    Output:
        - None; writes a tab-separated file with one row per 1 kb window
          (chromosome, window start, normalised mutation rate)
    '''
    # Counts of polymorphic sites in 1 kb windows
    snp_count, windows, _ = allel.windowed_statistic(
        pos=ps,
        values=polymorphic_loci_outgroup,
        statistic=np.sum,
        windows=get_windowed(chrom),
        fill=0)

    # DataFrame precursor of the final mutation-rate table. The columns are:
    #   1. Chromosome
    #   2. Start coordinate of the window (0-based, inclusive)
    #   3. Number of polymorphic sites
    #   4. Percentage of callable bases in that window
    mut_rate = pd.DataFrame({
        "chrom": [chrom] * windows.shape[0],
        "start": (windows[:, 0] - 1).astype("int32"),
        "segregating": snp_count,
        "call": np.loadtxt(
            "/home/moicoll/GenerationInterval/people/moi/tmp/weigths/chr{}_weigths.txt".format(chrom),
            usecols=[2]),
    })

    # average genomic mutation rate for the input chromosome
    genomic_mut_rate = np.sum(mut_rate["segregating"]) / np.sum(mut_rate["call"])

    # Mutation rate per 1 Mb window, normalised by the genomic average;
    # every 1 kb row receives the value of the 1 Mb window it falls in.
    # The (start + 1) / (start + 1) term is a series of ones used to
    # broadcast the scalar per-window ratio to one value per 1 kb row;
    # fillna(0) handles windows where no bases are callable.
    mut_rate["mut_rate"] = (
        mut_rate
        .assign(start_big_window=(mut_rate["start"] / 1000000).astype(int))
        .groupby("start_big_window", group_keys=False)
        .apply(lambda x: ((x["start"] + 1) / (x["start"] + 1)) *
               np.sum(x["segregating"]) / np.sum(x["call"]) / genomic_mut_rate)
        .fillna(0))

    # Save the dataframe, dropping the "segregating" and "call" columns
    mut_rate.drop(["segregating", "call"], axis=1).to_csv(
        "/home/moicoll/GenerationInterval/people/moi/tmp/mutrate/chr{}.tmp".format(chrom),
        header=False, index=False, sep='\t')
Example 3
import pandas as pd
import allel

# Assumed context: callset (e.g. an opened zarr group), meta_data_samples
# (a pandas DataFrame of sample metadata), samples_callset_index, chrom
# and window_size are defined earlier in the original script.
meta_data_samples['callset_index'] = samples_callset_index


def het_counting(gt):
    return gt.count_het()


gt_zarr = callset["{}/calldata/GT".format(chrom)]
pos = callset["{}/variants/POS".format(chrom)]
gt = allel.GenotypeDaskArray(gt_zarr)
df_list = []
for i, row in meta_data_samples.iterrows():
    df = pd.DataFrame()
    individual = gt.take([row.callset_index], axis=1)
    nnz, windows, counts = allel.windowed_statistic(pos,
                                                    individual,
                                                    statistic=het_counting,
                                                    size=window_size)
    df["het"] = nnz
    if i % 10 == 0:
        print(i)
    window_numbering = []
    df.insert(0, column="chr", value=chrom)
    window_numbering.extend(range(len(nnz)))
    df.insert(1, column="window", value=window_numbering)
    df.insert(2, column="PGDP_ID", value=row.PGDP_ID)
    df_list.append(df)
chr_df = pd.concat(df_list, axis=0)
chr_df.to_csv("../steps/het_counts_windows_{}.txt".format(chrom),
              sep=" ",
              index=False)
print("Finished with {}".format(chrom))
Example 4
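# Assumed context: this excerpt apparently sits inside a loop over
# chromosomes; eqa (an (n, 2) array of window bounds), pos, ac,
# ac_species, subpops, species, window_size and compute_divergence
# are defined earlier in the original script.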
	# Use the middle of the window as the index
	window_middle = np.sum(eqa, axis=1) / 2
	vref_dxy_by_window[chrom] = pd.DataFrame(
		index=window_middle.astype(int),
		columns=list(subpops.keys()) + list(species.keys()))
	xpop_dxy_by_window[chrom] = pd.DataFrame(index=window_middle.astype(int))

	# Calculate distance from the reference in each sub-population
	for pop in subpops.keys():
		print('processing', pop)

		# Faster if we drop non variant loci first, and load into mem
		loc = ac[pop].is_variant().compute()
		pop_ac = ac[pop].compress(loc, axis=0).compute()
		pop_pos = pos.compress(loc, axis=0)
		print('computing divergence...', loc.sum())

		vals, windows, counts = allel.windowed_statistic(
			pop_pos, pop_ac.to_frequencies(), compute_divergence, windows=eqa)
		vref_dxy_by_window[chrom][pop] = vals / window_size

	# Calculate distance from the reference in each species
	for pop in species.keys():
		print('processing', pop)

		# Faster if we drop non variant loci first, and load into mem
		loc = ac_species[pop].is_variant().compute()
		pop_ac = ac_species[pop].compress(loc, axis=0).compute()
		pop_pos = pos.compress(loc, axis=0)
		print('computing divergence...', loc.sum())

		vals, windows, counts = allel.windowed_statistic(
			pop_pos, pop_ac.to_frequencies(), compute_divergence, windows=eqa)
		vref_dxy_by_window[chrom][pop] = vals / window_size
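Example 5

# This snippet begins mid-file: the start of removeMissingStats() is not
# shown in the source, so nh, freqs and region_complete are computed in
# the omitted part. Assumed context: import os, numpy as np, pandas as pd,
# allel; vcf_path and prefix are defined earlier.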
        hap_div = allel.haplotype_diversity(region_complete)
    # calculate nucleotide diversity specifically on nonmissing region
    ac = region_complete.count_alleles()
    diffs = allel.mean_pairwise_difference(ac, fill=0)
    pi = np.sum(diffs) / 200  # divide by the 200 bp window length
    return [nh, hap_div, pi, freqs]


chromosomes = allel.read_vcf(vcf_path, fields=['CHROM'])
chromosomes_list = np.unique(chromosomes['variants/CHROM'])
for chrom in chromosomes_list:
    print(chrom)
    # read in that chromosome data only
    callset = allel.read_vcf(vcf_path, region=chrom, fields='*')
    gt = allel.GenotypeArray(callset["calldata/GT"])
    # remove any het calls, convert to haploid
    gt.mask = gt.is_het()
    gt_hom_only = gt.fill_masked(value=-1)
    gt_hap_array = gt_hom_only.haploidify_samples()
    # remove individuals with missing data and calculate stats
    n_list, w, n = allel.windowed_statistic(pos=callset["variants/POS"],
                                            values=gt_hap_array,
                                            statistic=removeMissingStats,
                                            size=200,
                                            step=50,
                                            start=1)
    df = pd.DataFrame(list(zip(n_list, w, n)),
                      columns=["n_list", "windows_n", "n_var_n"])
    file_name = os.getcwd() + "/" + prefix + chrom + "_hap_div.csv"
    df.to_csv(file_name, header=True)
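With size=200 and step=50, consecutive windows overlap by 150 bp. The window construction used by windowed_statistic is also exposed directly, so the windows can be inspected on their own; a small sketch (parameter names as in the scikit-allel docs, check your version for the exact signature):

import numpy as np
import allel

pos = np.array([1, 50, 120, 260, 300])
# 200 bp windows sliding by 50 bp, as in the call above; the last
# window ends at the final position when stop is left as None
windows = allel.position_windows(pos, size=200, start=1, stop=None, step=50)
print(windows)
# -> [[  1 200]
#     [ 51 250]
#     [101 300]]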