Example #1
0
    def test_rogers_huff_r(self):
        """Check allel.rogers_huff_r against small hand-built matrices."""

        def check_scalar(gn, expect):
            # Two rows in `gn`: the result is compared with `eq`.
            eq(expect, allel.rogers_huff_r(gn))

        def check_array(gn, expect):
            # More than two rows: one value per row pair, compared
            # approximately.
            assert_array_close(expect, allel.rogers_huff_r(gn))

        # Identical rows, then mirrored rows.
        check_scalar([[0, 1, 2],
                      [0, 1, 2]], 1.)
        check_scalar([[0, 1, 2],
                      [2, 1, 0]], -1.)

        # Rows with no variation are expected to give NaN.
        assert np.isnan(allel.rogers_huff_r([[0, 0, 0],
                                             [1, 1, 1]]))

        check_scalar([[0, 1, 0, 1],
                      [0, 1, 1, 0]], 0)

        # Rows containing -1 (presumably a missing-call marker); the
        # expected value is the same as for the fully observed columns.
        check_scalar([[0, 1, 2, -1],
                      [0, 1, 2, 2]], 1.)
        check_scalar([[0, 1, 2, 2],
                      [0, 1, 2, -1]], 1.)
        check_scalar([[0, 1, 2],
                      [0, 1, -1]], 1.)

        # Three rows -> three pairwise values.
        check_array([[0, 2],
                     [2, 0],
                     [0, 1]], [-1, 1, -1])

        # Four rows -> six pairwise values.
        check_array([[0, 2, 0],
                     [0, 2, 0],
                     [2, 0, 2],
                     [0, 2, -1]], [1, -1, 1, -1, 1, -1])
Example #2
0
def binned_ld(genotypes, positions, window_size, num_bins=20):
    """Average pairwise r^2 into logarithmically spaced distance bins.

    The position range is walked in consecutive windows of `window_size`.
    Within each window every pair of variants contributes its squared
    Rogers-Huff r to the bin selected by the pair's distance; NaN values
    are skipped.

    Returns a tuple ``(sizes, binned_r_squared)``: the upper distance of
    each bin and the mean r^2 collected in it.
    """
    first_exponent = 8
    base = np.exp(np.log(window_size) / (first_exponent + num_bins))
    log_base = np.log(base)
    collected = {b: [] for b in range(num_bins)}

    def distance_bin(pos_a, pos_b):
        # Bin number grows with log-distance; anything below the first
        # exponent is clamped into bin 0.
        separation = np.abs(pos_b - pos_a)
        raw = np.floor(np.log(separation) / log_base - first_exponent)
        return int(max(raw, 0))

    for left_edge in range(positions[0], positions[-1], window_size):
        in_window = np.logical_and(positions >= left_edge,
                                   positions < left_edge + window_size)
        pos_in_window = positions[in_window]
        gn_in_window = genotypes[in_window]

        if len(pos_in_window) == 0:
            continue

        r_values = allel.rogers_huff_r(gn_in_window)
        r2_square = squareform(r_values ** 2)

        n_here = len(pos_in_window)
        for a in range(n_here):
            for b in range(a + 1, n_here):
                value = r2_square[a, b]
                if np.isnan(value):
                    continue
                target = distance_bin(pos_in_window[a], pos_in_window[b])
                collected[target].append(value)

    sizes = [base ** e
             for e in range(first_exponent + 1, first_exponent + num_bins + 1)]
    binned_r_squared = [np.mean(collected[b]) for b in range(num_bins)]
    return sizes, binned_r_squared
Example #3
0
def calcAndAppendStatValDiplo(alleleCounts, snpLocs, statName, subWinStart, subWinEnd, statVals, instanceIndex, subWinIndex, genosInSubWin, unmasked):
    """Compute one statistic for a sub-window and append it to `statVals`.

    Dispatches on `statName` and appends the result to
    ``statVals[<stat>][instanceIndex]``. Some branches fill several
    statistics at once ("nDiplos" also fills the diplo_H* entries that are
    present in `statVals`; "diplo_ZnS" also fills "diplo_Omega"; "distVar"
    also fills "distSkew" and "distKurt"). For those side-filled names the
    final branch only checks that the list already has ``subWinIndex + 1``
    entries. A `statName` matching no branch does nothing. Returns None.
    """
    # Computed on every call, even for branches that do not use it.
    genosNAlt = genosInSubWin.to_n_alt()
    if statName == "tajD":
        statVals[statName][instanceIndex].append(allel.stats.diversity.tajima_d(
            alleleCounts, pos=snpLocs, start=subWinStart, stop=subWinEnd))
    elif statName == "pi":
        statVals[statName][instanceIndex].append(allel.stats.diversity.sequence_diversity(
            snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked))
    elif statName == "thetaW":
        statVals[statName][instanceIndex].append(allel.stats.diversity.watterson_theta(
            snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked))
    elif statName == "thetaH":
        statVals[statName][instanceIndex].append(thetah(
            snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked))
    elif statName == "fayWuH":
        # Derived from values already stored for this sub-window, so
        # "thetaH" and "pi" must have been appended before this call.
        statVals[statName][instanceIndex].append(
            statVals["thetaH"][instanceIndex][subWinIndex]-statVals["pi"][instanceIndex][subWinIndex])
    elif statName == "HapCount":
        # NOTE(review): `hapsInSubWin` is neither a parameter nor a local of
        # this function; unless it exists at module scope this branch raises
        # NameError. Confirm which array was meant here.
        statVals[statName][instanceIndex].append(len(hapsInSubWin.distinct()))
    elif statName == "nDiplos":
        diplotypeCounts = shicstats.getHaplotypeFreqSpec(genosNAlt)
        # The entry at index genosNAlt.shape[1] is stored as nDiplos and is
        # the last element dropped before the H statistics are computed.
        nDiplos = diplotypeCounts[genosNAlt.shape[1]]
        statVals["nDiplos"][instanceIndex].append(nDiplos)
        diplotypeCounts = diplotypeCounts[:-1]
        dh1 = garudH1(diplotypeCounts)
        dh2 = garudH2(diplotypeCounts)
        dh12 = garudH12(diplotypeCounts)
        if "diplo_H1" in statVals:
            statVals["diplo_H1"][instanceIndex].append(dh1)
        if "diplo_H12" in statVals:
            statVals["diplo_H12"][instanceIndex].append(dh12)
        if "diplo_H2/H1" in statVals:
            # NOTE(review): no guard against dh1 == 0 — confirm garudH1 can
            # never return zero here.
            statVals["diplo_H2/H1"][instanceIndex].append(dh2/dh1)
    elif statName == "diplo_ZnS":
        if genosNAlt.shape[0] == 1:
            # A single row leaves no pair to correlate; both statistics are
            # recorded as 0.0.
            statVals["diplo_ZnS"][instanceIndex].append(0.0)
            statVals["diplo_Omega"][instanceIndex].append(0.0)
        else:
            r2Matrix = allel.rogers_huff_r(genosNAlt, fill=0.0)
            # NOTE(review): despite its name, `r2Matrix` holds r (not r^2)
            # at this point, so the value stored as diplo_ZnS is the mean
            # of r. Squaring only happens for the omega input below.
            statVals["diplo_ZnS"][instanceIndex].append(np.nanmean(r2Matrix))
            r2Matrix2 = squareform(r2Matrix ** 2)
            statVals["diplo_Omega"][instanceIndex].append(
                shicstats.omega(r2Matrix2)[0])
    elif statName == "distVar":
        # Pairwise differences scaled by the number of True entries in
        # unmasked[subWinStart-1:subWinEnd].
        dists = shicstats.pairwiseDiffsDiplo(
            genosNAlt)/float(unmasked[subWinStart-1:subWinEnd].count(True))
        statVals["distVar"][instanceIndex].append(np.var(dists, ddof=1))
        statVals["distSkew"][instanceIndex].append(scipy.stats.skew(dists))
        statVals["distKurt"][instanceIndex].append(scipy.stats.kurtosis(dists))
    elif statName in ["diplo_H12", "diplo_H123", "diplo_H2/H1", "distVar", "distSkew", "distKurt", "diplo_Omega"]:
        # Statistics filled as a side effect of another branch: verify that
        # exactly subWinIndex+1 values exist, otherwise dump state and exit.
        # ("distVar" in this list is unreachable; the branch above catches it.)
        if not len(statVals[statName][instanceIndex]) == subWinIndex+1:
            print(statName, instanceIndex, subWinIndex+1)
            print(statVals["diplo_H1"][instanceIndex],
                  statVals["diplo_H12"][instanceIndex])
            sys.exit()
Example #4
0
def linkage_disequilibrium(ts,
                           span=40000,
                           bins=20,
                           min_obs_per_bin=8,
                           max_sequence_length=1e6):
    """
    R^2 as a function of site-separation distance, for `bins` bins up to a
    site-separation distance of `span` bp.

    Parameters
    ----------
    ts : tree sequence
        Object providing ``sequence_length``, ``keep_intervals``, ``sites``,
        ``num_sites`` and ``genotype_matrix``.
    span : number
        Largest site separation (bp) that is binned; pairs at or beyond it
        are ignored.
    bins : int
        Number of equal-width distance bins covering ``[0, span)``.
    min_obs_per_bin : int
        Minimum number of sites required to compute anything, and minimum
        number of non-NaN pairs a bin needs to report a mean.
    max_sequence_length : number
        Sequences longer than this are truncated to ``[0, max_sequence_length)``.

    Returns
    -------
    dict
        One entry per bin, keyed by a LaTeX label of the bin's distance
        range in kb, holding the mean R^2 or NaN when the bin has too few
        observations.
    """
    if ts.sequence_length > max_sequence_length:
        ts = ts.keep_intervals([(0, max_sequence_length)],
                               record_provenance=False)
    position = [site.position for site in ts.sites()]
    num_sites = len(position)
    assert num_sites == int(ts.num_sites)

    nans = np.full(bins, np.nan)
    if num_sites >= min_obs_per_bin:
        gts = np.expand_dims(ts.genotype_matrix(), axis=-1)
        gn = allel.GenotypeArray(gts, dtype="i1").to_n_alt()
        ld = allel.rogers_huff_r(gn)**2
        assert len(ld) == num_sites * (num_sites - 1) // 2

        # Bin the pairwise site R^2 in `ld` by site separation distance.
        # `ld` is in condensed form: pairs (0,1), (0,2), ..., (1,2), ...
        # and `i` must always be the condensed index of pair (j, k).
        r2 = np.zeros(bins)
        n = np.zeros(bins)
        i = 0
        for j in range(num_sites):
            for k in range(j + 1, num_sites):
                distance = position[k] - position[j]
                if distance >= span:
                    # Sites are ordered by position, so every remaining pair
                    # of row j is also too far apart. Skip their entries in
                    # `ld` so the next row starts at the right index.
                    i += num_sites - k
                    break
                index = int(distance * bins / span)
                if not np.isnan(ld[i]):
                    r2[index] += ld[i]
                    n[index] += 1
                i += 1
        # Divide `r2` by `n`, but return NaN where n has insufficient observations.
        r2 = np.divide(r2, n, out=nans, where=n >= min_obs_per_bin)
    else:
        # Too few segregating sites to do anything meaningful.
        # LD plots may be blank.
        r2 = nans

    # Raw f-strings keep the LaTeX backslashes literal without relying on
    # invalid escape sequences; the resulting keys are unchanged.
    return {
        rf"$\Delta$bp$\in[{span*b/bins/1000:.0f}\,$k$,"
        rf"{span*(b+1)/bins/1000:.0f}\,$k$)$": r2[b]
        for b in range(bins)
    }
Example #5
0
def ld(synthetic_population_code, synthetic_genotypes, reference_genotypes, synthetic_positions, reference_positions, reference_samples, classification_map, window_size=2e5):
    """Plot binned and pairwise LD for synthetic vs. reference genotypes.

    Reference samples are restricted to those whose 'population' entry in
    `classification_map` equals `synthetic_population_code`. Two figures
    are written under FIGURES_DIR: '<code>.binned_ld.png' and
    '<code>.pairwise_ld.png'. The global NumPy RNG is re-seeded with SEED
    before the window for the pairwise plot is drawn.
    """
    window_size = int(window_size)
    sample_populations = np.array(
        [classification_map.loc[sample]['population'] for sample in reference_samples])
    in_population = sample_populations == synthetic_population_code
    original_reference_genotypes = reference_genotypes[:, in_population]

    # Work on copies, converted to alt-allele counts, with fixed sites removed.
    synthetic_genotypes, synthetic_positions = remove_fixed_sites(
        allel.GenotypeArray(np.copy(synthetic_genotypes)).to_n_alt(),
        np.copy(synthetic_positions))
    reference_genotypes, reference_positions = remove_fixed_sites(
        allel.GenotypeArray(np.copy(original_reference_genotypes)).to_n_alt(),
        np.copy(reference_positions))

    # --- binned LD curves -------------------------------------------------
    plt.title('Binned Linkage Disequilibrium')
    curves = (
        (synthetic_genotypes, synthetic_positions,
         'Synthetic {}'.format(synthetic_population_code)),
        (reference_genotypes, reference_positions,
         '{}'.format(synthetic_population_code)),
    )
    for curve_gn, curve_pos, curve_label in curves:
        sizes, binned_r_squared = binned_ld(curve_gn, curve_pos, window_size)
        plt.plot(sizes, binned_r_squared, label=curve_label)
    plt.xlabel('Distance (bp)')
    plt.ylabel('LD (r squared)')
    plt.xscale('log')
    plt.legend()
    binned_path = os.path.join(
        FIGURES_DIR, '{}.binned_ld.png'.format(synthetic_population_code))
    plt.savefig(binned_path)
    plt.close(plt.gcf())

    # --- pairwise LD in one random window ----------------------------------
    np.random.seed(SEED)
    window_start = np.random.randint(synthetic_positions[0], synthetic_positions[-1] - window_size)
    window_stop = window_start + window_size

    def shared_window_mask(own_positions, other_positions):
        # Sites inside the window that are also present in the other set.
        inside = np.logical_and(own_positions >= window_start,
                                own_positions < window_stop)
        return np.logical_and(inside, np.isin(own_positions, other_positions))

    synthetic_window_gn = synthetic_genotypes[
        shared_window_mask(synthetic_positions, reference_positions)]
    reference_window_gn = reference_genotypes[
        shared_window_mask(reference_positions, synthetic_positions)]

    synthetic_r = allel.rogers_huff_r(synthetic_window_gn)
    reference_r = allel.rogers_huff_r(reference_window_gn)
    synthetic_r_squared_matrix = squareform(synthetic_r ** 2)
    reference_r_squared_matrix = squareform(reference_r ** 2)
    plot_pairwise_ld(synthetic_r_squared_matrix, reference_r_squared_matrix,
                     colorbar=True, imshow_kwargs={'cmap': 'cividis'})
    plt.title('SNP Correlation in {}kb Window'.format(window_size // 1000))
    pairwise_path = os.path.join(
        FIGURES_DIR, '{}.pairwise_ld.png'.format(synthetic_population_code))
    plt.savefig(pairwise_path)
    plt.close(plt.gcf())
Example #6
0
def linkage_disequilibrium(ts, span=2 * 10**5, bins=50, min_obs_per_bin=8):
    """
    Mean R^2 by site separation over the first `span` bases of `ts`.

    The sequence is cut to ``[0, span)`` (or its full length if shorter),
    pairwise R^2 is accumulated into `bins` equal-width distance bins, and
    four of those bins are reported: the first one and the ones at 1/8,
    1/4 and 1/2 of `span`. A reported value is NaN when its bin holds fewer
    than `min_obs_per_bin` non-NaN pairs or when there are fewer than
    `min_obs_per_bin` sites in total.
    """
    span = min(ts.sequence_length, span)
    ts = ts.keep_intervals([(0, span)], record_provenance=False)
    site_positions = [site.position for site in ts.sites()]
    num_sites = len(site_positions)
    assert num_sites == int(ts.num_sites)

    nans = np.full(bins, np.nan)
    if num_sites < min_obs_per_bin:
        # Too few segregating sites to do anything meaningful.
        # LD plots may be blank.
        r2 = nans
    else:
        genotypes = np.expand_dims(ts.genotype_matrix(), axis=-1)
        alt_counts = allel.GenotypeArray(genotypes, dtype='i1').to_n_alt()
        ld = allel.rogers_huff_r(alt_counts)**2
        assert len(ld) == num_sites * (num_sites - 1) // 2

        # Walk the condensed `ld` vector in step with the (left, right)
        # site pairs and accumulate per-distance-bin sums and counts.
        totals = np.zeros(bins)
        counts = np.zeros(bins)
        pair = 0
        for left in range(num_sites):
            for right in range(left + 1, num_sites):
                separation = site_positions[right] - site_positions[left]
                which = int(separation * bins / span)
                value = ld[pair]
                pair += 1
                if np.isnan(value):
                    continue
                totals[which] += value
                counts[which] += 1
        # Mean per bin; bins with too few observations keep the NaN from `nans`.
        r2 = np.divide(totals, counts, out=nans, where=counts >= min_obs_per_bin)

    def kb_label(width):
        # Whole kilobases, e.g. 4000 -> "4k".
        return f"{width//1000}k"

    first_bin = kb_label(span // bins)  # width of one bin, in kb
    eighth = kb_label(span // 8)
    quarter = kb_label(span // 4)
    half = kb_label(span // 2)

    return {
        f"$R^2$[<{first_bin}]": r2[0],
        f"$R^2$[{eighth}]": r2[bins // 8],
        f"$R^2$[{quarter}]": r2[bins // 4],
        f"$R^2$[{half}]": r2[bins // 2]
    }
Example #7
0
def plot_ld(gn, title):
    """Draw the pairwise r^2 plot for `gn` and set `title` on its axes."""
    r_squared = allel.rogers_huff_r(gn) ** 2
    allel.plot_pairwise_ld(r_squared).set_title(title)
Example #8
0
def ld_prune(gn, variants, cadd, thold):
    """
    Method used inside goPDX class (filteringSNPS)
    input:
        subset of the gn, variants and cadd associated to this subset
        (`variants` and `cadd` are indexed with integer arrays and boolean
        masks, so they are assumed to be NumPy-like arrays)
    output:
        subset of the input subset without high correlated snps and cadd above 1

    The three returned objects are row-aligned and follow the original
    variant order. (Previously `gn` came back in ascending row order while
    `variants` and `cadd` followed the order in which SNPs were selected,
    so the rows could disagree.)
    """
    import allel
    # https://en.wikipedia.org/wiki/Linkage_disequilibrium
    # Estimate the linkage disequilibrium parameter r for each pair of variants
    r = allel.rogers_huff_r(gn)
    correlations = squareform(r**2)
    del r
    # An undefined r^2 is treated as perfect linkage.
    correlations[np.isnan(correlations)] = 1

    n_variants = len(variants)
    explored = np.zeros(n_variants, dtype=bool)  # SNPs already assigned
    keep = []

    for v_ in range(n_variants):
        if explored[v_]:
            continue

        # Unexplored SNPs whose r^2 with v_ exceeds the threshold.
        linked = (correlations[:, v_] > thold) & ~explored

        # NOTE(review): squareform leaves the diagonal at 0, so v_ is not
        # counted in `linked`; `> 1` therefore needs at least two linked
        # partners before a group is formed. Confirm this is intended.
        if linked.sum() > 1:
            v_ind = np.append(np.flatnonzero(linked), v_)
            v_cadd = np.append(cadd[linked], cadd[v_])

            # keeping only the snp with highest cadd (first one on ties)
            # if all less than 1, keep none
            if v_cadd.max() > 1:
                keep.append(v_ind[np.argmax(v_cadd)])

            explored[v_ind] = True
        else:
            keep.append(v_)
            explored[v_] = True

    # Filtering final results on the subset to output
    # ADD FUNCTION TO KEEP KNOWN ELEMENTS HERE
    # Sort so that the integer index used for `variants`/`cadd` selects the
    # same rows, in the same order, as the boolean mask used for `gn`.
    keep = np.sort(np.asarray(keep, dtype=int))
    loc_unlinked = np.zeros(n_variants, dtype=bool)
    loc_unlinked[keep] = True

    gn = gn.compress(loc_unlinked, axis=0)
    variants = variants[keep]
    cadd = cadd[keep]
    return gn, variants, cadd
Example #9
0
def plot_ld(gn, title, filename):
    """Plot pairwise r^2 for `gn`, title it, and save it as `filename` under `pcafP`."""
    r_squared = al.rogers_huff_r(gn) ** 2
    axes = al.plot_pairwise_ld(r_squared)
    axes.set_title(title)
    out_path = os.path.join(pcafP, filename)
    axes.figure.savefig(out_path, bbox_inches='tight')
def _contiguous_pairs(snps, n_SNP):
    """Pair consecutive entries of `snps`, shifting pair i by i columns (mod n_SNP).

    e.g. snps=[196, 1220, 2244] becomes [(196, 1220), (1221, 2245)];
    duplicates are removed and the rows come back sorted.
    """
    return np.unique([((snps[i] + i) % n_SNP,
                       (snps[i + 1] + i) % n_SNP)
                      for i in range(len(snps) - 1)],
                     axis=0)


def LD(haplotype,
       pos_vec,
       size_chr,
       circular=True,
       distance_bins=None,
       gaps_type="short",
       min_SNP_pairs=300):
    """
    Compute LD for a subset of SNPs drawn with different gap sizes in between them.
    Gap sizes follow power 2 distribution.
    The LD is then computed and averaged over different bin (distance_bins) sizes.

    Parameters
    ----------
    haplotype : numpy 2D array or allel.haplotype
        SNP matrix where in the first dimension are the SNP (rows) and
        in the second dimension (columns) are the samples.
    pos_vec : 1D array
        array of absolute positions in [0, size_chr].
    size_chr : int
        Size of the chromosome.
    circular : bool
        Whether to consider the chromosome circular or not.
        If circular, the maximum distance between 2 SNPs is thus half the chromosome.
    distance_bins : int or list
        LD will be averaged by bins of distances
        e.g. if distance_bins = [0, 100, 1000, 10000], LD will be averaged for the groups [0,100[, [100, 1000[, and [1000, 10000[
        If distance_bins is an int, it defines the number of bins of distances for which to compute the LD
            The bins are created in a logspace
        If distance_bins is a list, they will be used instead
    gaps_type: str
        Pairs of SNP considered are separated by a given number (gap) of columns. Not all pairs are considered.
        By default (`short`), gaps are power of 2 up to the closest power of 2 of the number of SNP.
        Meaning that most of the comparisons will be done on close SNPs (short distance).
        If one wants to sample more at large distance (to test for circularity for instance), use `long` instead of `short`
        Using `long` will add gaps like: n_SNP - gaps. It will take more time to run.
    min_SNP_pairs: int
        Minimum number of pairs of SNP to consider for a given gap size.
        If the gap size is big enough such that there is less than `min_SNP_pairs` possible pairs,
        then all pairs are considered.

    Returns
    -------
    DataFrame
        Table with the distance bins (`dist_group`) as index and, for each
        bin, the columns `mean_dist` (mean SNP distance), `mean_r2` (mean
        r^2), `Count` (number of pairs) and `sem_r2` (standard error of r^2).
    """

    if distance_bins is None or isinstance(distance_bins, int):
        n_bins = distance_bins - 3 if isinstance(distance_bins, int) else 17
        # On a circular chromosome two SNPs are at most half of it apart.
        max_dist = size_chr // 2 if circular else size_chr
        distance_bins = np.logspace(2, np.log10(max_dist), n_bins)
        distance_bins = np.insert(
            distance_bins, 0,
            [0, 25, 50, 75])  # add bins at short distances

    n_SNP, _ = haplotype.shape

    # gaps are distance between SNPs in term of position in the snp matrix (not in bp)
    gaps_interval = (2**np.arange(0, np.log2(n_SNP),
                                  1)).astype(int)  # log2 scales of intervals
    if gaps_type.lower() == "long":
        gaps_interval = np.unique(
            np.concatenate([
                gaps_interval,
                np.array(
                    list(n_SNP // 2 - gaps_interval[:len(gaps_interval) //
                                                    2])[::-1]).astype(int),
                np.array(list(n_SNP - gaps_interval)[::-1])
            ])).astype(int)
    elif gaps_type.lower() != "short":
        # The original message had the f-prefix inside the quotes and
        # therefore printed the literal text "f{gaps_type}".
        logging.warning(
            "gaps should be either `short` or `long`. Using short instead of %s",
            gaps_type)

    selected_snps = []
    for gap in gaps_interval:

        max_value = n_SNP if circular else n_SNP - gap

        if max_value < min_SNP_pairs:  # min_SNP_pairs : min number of SNP pairs to consider.
            # if not many possible pairs possible, just take them all directly,
            # instead of reaching that number after many more random trials
            snps = np.arange(0, n_SNP, gap)
            snp_pairs = _contiguous_pairs(snps, n_SNP)
            snp_pairs = np.concatenate([(snp_pairs + i) % n_SNP
                                        for i in range(max_value)],
                                       axis=0)
        else:
            if not circular:
                snps = np.arange(0, n_SNP, gap) + np.random.randint(
                    0, (n_SNP - 1) % gap + 1
                )  # adding a random start (+1, bc 2nd bound in randint is exclusive)
                # non overlapping contiguous pairs
                snp_pairs = _contiguous_pairs(snps, n_SNP)
                # remove pairs that are over the edges
                snp_pairs = snp_pairs[snp_pairs[:, 0] < snp_pairs[:, 1]]
            else:
                snps = np.arange(0, n_SNP, gap) + np.random.randint(
                    0, (n_SNP - 1))  # adding a random start
                # non overlapping contiguous pairs
                snp_pairs = _contiguous_pairs(snps, n_SNP)

            last_pair = snp_pairs[-1]

            # If we don't have enough pairs (typically when gap is large),
            # keep adding a randomly rotated copy of the last pair until
            # there are at least min(min_SNP_pairs, max_value) of them.
            while len(snp_pairs) < min(min_SNP_pairs, max_value):
                shift = np.random.randint(1, n_SNP) % n_SNP
                new_pair = (last_pair + shift) % n_SNP
                snp_pairs = np.unique(np.concatenate(
                    [snp_pairs, new_pair.reshape(1, 2)]),
                                      axis=0)
                last_pair = new_pair

                if not circular:
                    snp_pairs = snp_pairs[snp_pairs[:, 0] < snp_pairs[:, 1]]

        selected_snps.append(snp_pairs)

    ld = pd.DataFrame()
    for i, snps_pos in enumerate(selected_snps):

        if circular:
            pos_i = pos_vec[snps_pos]
            # Shortest way around the circle between the two SNPs of a pair.
            # NOTE(review): `% size_chr / 2` parses as `(x % size_chr) / 2`,
            # i.e. it halves every distance. The original comment ("max
            # distance btw 2 SNP is size_chr/2") suggests a modulo by
            # size_chr/2 may have been intended. Left unchanged; confirm.
            min_dist = np.array([
                min(np.diff(pi) % size_chr,
                    np.diff(pi[::-1]) % size_chr) for pi in pos_i
            ]) % size_chr / 2
            sd = pd.DataFrame(min_dist, columns=["snp_dist"])
        else:
            sd = pd.DataFrame((np.diff(pos_vec[snps_pos])),
                              columns=["snp_dist"])

        sd["dist_group"] = pd.cut(sd.snp_dist, bins=distance_bins)
        # One rogers_huff_r call per SNP pair (two rows of `haplotype`).
        sr = [allel.rogers_huff_r(snps)**2 for snps in haplotype[snps_pos]]
        sd["r2"] = sr
        sd["gap_id"] = i
        ld = pd.concat([ld, sd])

    ld2 = ld.dropna().groupby("dist_group").agg(mean_dist=('snp_dist', 'mean'),
                                                mean_r2=('r2', 'mean'),
                                                Count=('r2', 'count'),
                                                sem_r2=('r2', 'sem'))

    return ld2
# Heatmap of per-population allele frequencies for the reported SNPs,
# written to a one-page PDF.
fig = plt.figure(figsize=(2,16))
with PdfPages("%s/%s_%s.allele_fq.pdf" % (outdir,outcode,l_nom)) as pdf:
    # plot
    ax=sns.heatmap(fq_minor[is_report],vmin=0,vmax=0.5,cmap=sns.light_palette("darkslategray",n_colors=31),
                   yticklabels=oc_snpname_seg[is_report],linewidths=0.8,linecolor="white",annot=True)
    ax.set_title("ALT fq per pop %s" % l_nom)

    pdf.savefig(fig,bbox_inches='tight')


# Linkage disequilibrium
# linkage disequilibrium Rogers and Huff
print("LD Rogers & Huff...")
ld_rhr = allel.rogers_huff_r(oc_haploty_seg.compress(is_report).to_n_alt(fill=-1))
# Expand the condensed pairwise vector into a square matrix and blank the
# diagonal so self-comparisons are not shown.
ld_rhr = squareform(ld_rhr)
np.fill_diagonal(ld_rhr,np.nan)
# plot
with PdfPages("%s/%s_%s.allele_ld_rhr.pdf" % (outdir,outcode,l_nom)) as pdf:
    fig = plt.figure(figsize=(16,14))
    ax=sns.heatmap(ld_rhr,vmin=-1,vmax=1,cmap=sns.diverging_palette(20,255,s=99,sep=15,l=45,n=31),
                   xticklabels=oc_snpname_seg[is_report],yticklabels=oc_snpname_seg[is_report],linewidths=0.2,linecolor="white",annot=True)
    ax.set_title("Rogers & Huff $r$ %s" % l_nom)
    pdf.savefig(fig,bbox_inches='tight')

# print table
ld_rhr_df = pd.DataFrame(ld_rhr)
ld_rhr_df.columns = oc_snpname_seg[is_report]
# Row labels live in `.index`; assigning to `.rows` only created a stray
# attribute and left the rows numbered 0..n-1.
ld_rhr_df.index   = oc_snpname_seg[is_report]
Example #12
0
################### LD decay ####################
# Histogram of SNP positions, then the genomic interval analysed below.
plt.hist(pos, bins=100)[2]
maskstart, maskstop = 4.4e7, 4.45e7

# Restrict both data sets to SNPs inside the interval and to YRI samples.
# NOTE(review): the RNG is seeded here, but no random subsampling is
# visible in this section — confirm whether a subset step is missing.
np.random.seed(12345)
mask = np.logical_and(pos > maskstart, pos < maskstop)
is_yri = pred['pop'] == "YRI"
dc2 = dc[:, mask][is_yri, :]
bingen2 = bingen[:, mask][is_yri, :]
pos2 = pos[mask]

# Pairwise LD (Rogers-Huff r) as square matrices; rogers_huff_r is given
# the transposed arrays.
LDr = spatial.distance.squareform(allel.rogers_huff_r(np.transpose(dc2)))
LDg = spatial.distance.squareform(allel.rogers_huff_r(np.transpose(bingen2)))

# Signed bp distance for every ordered pair of SNPs, in the same row-major
# order as the flattened LD matrices below.
dists = [x - y for x in pos2 for y in pos2]

# Flatten each matrix row by row and square to get r^2 as float64.
LDr2 = np.array(np.concatenate(LDr), dtype="float64") ** 2
LDg2 = np.array(np.concatenate(LDg), dtype="float64") ** 2
#simulation LD
Example #13
0
def calculate_r_2( genotypes ):
	"""Return the square matrix of pairwise Rogers-Huff r^2 for `genotypes`."""
	n_snps = len(genotypes)
	cerr('[I - calculating Rogers-Huff r^2 for {} SNPs ]'.format(n_snps))
	r_squared = allel.rogers_huff_r(genotypes) ** 2
	return scipy.spatial.distance.squareform(r_squared)