Example #1
0
File: pca.py Project: tle003/ipyrad
    def plot_pairwise_dist(self, labels=None, ax=None, cmap=None, cdict=None, metric="euclidean"):
        """
        Plot pairwise distances between all samples

        labels: bool or list
                by default labels aren't included. If labels == True, then labels are read in
                from self.samples_vcforder. Alternatively, labels can be passed in as a list,
                should be same length as the number of samples.
        ax: matplotlib axes or None
                axes to draw on. If None, a new 5x5 figure with a single subplot is created.
        cmap, cdict:
                accepted by the signature but not used by this method.
        metric: str
                distance metric forwarded to allel.pairwise_distance.

        Raises IPyradError if a labels sequence is passed whose length differs from
        the number of samples.
        """
        allele_counts = self.genotypes.to_n_alt()
        dist = allel.pairwise_distance(allele_counts, metric=metric)

        ## Compare against None explicitly instead of relying on the truth value
        ## of whatever axes object the caller handed in.
        if ax is None:
            fig = plt.figure(figsize=(5, 5))
            ax = fig.add_subplot(1, 1, 1)

        if isinstance(labels, bool):
            ## True -> use the sample names; False is passed through unchanged
            if labels:
                labels = list(self.samples_vcforder)
        elif labels is not None:
            ## If not bool or None (default), then check to make sure the list passed in
            ## is the right length
            if len(labels) != len(self.samples_vcforder):
                raise IPyradError(LABELS_LENGTH_ERROR.format(len(labels), len(self.samples_vcforder)))
            ## Normalise array-likes (e.g. numpy arrays) to a plain list so that any
            ## downstream truthiness test on labels is well defined.
            labels = list(labels)

        allel.plot.pairwise_distance(dist, labels=labels, ax=ax, colorbar=False)
Example #2
0
    def test_pairwise_distance_multidim(self):
        """Check pairwise_distance with a callable metric on allele counts.

        The expected value is computed directly for the single pair of
        columns (0, 1) and compared against what pairwise_distance returns
        when given the same computation as a custom metric.
        """
        calls = [
            [[0, 0], [0, 0]],
            [[1, 1], [1, 1]],
            [[1, 1], [2, 2]],
            [[0, 0], [0, 1]],
            [[0, 0], [0, 2]],
            [[1, 1], [1, 2]],
            [[0, 1], [0, 1]],
            [[0, 1], [1, 2]],
            [[0, 0], [-1, -1]],
            [[0, 1], [-1, -1]],
            [[-1, -1], [-1, -1]],
        ]
        genotypes = GenotypeArray(calls, dtype='i1')
        allele_counts = genotypes.to_allele_counts()

        def summed_mpd(ac_a, ac_b):
            # Sum of the mean pairwise differences, with fill=0 passed through
            per_variant = allel.mean_pairwise_difference_between(ac_a, ac_b, fill=0)
            return per_variant.sum()

        # Computed independently of the metric callable above
        first_col = allele_counts[:, 0]
        second_col = allele_counts[:, 1]
        direct = allel.mean_pairwise_difference_between(first_col, second_col, fill=0)
        expected = [direct.sum()]

        observed = allel.pairwise_distance(allele_counts, summed_mpd)
        aeq(expected, observed)
Example #3
0
def get_pairwise_dist(g):
    """Report and plot the pairwise city-block distances for ``g``.

    Computes ``allel.pairwise_distance`` on ``g.to_n_alt()`` with the
    'cityblock' metric, prints the mean distance rounded to two decimals,
    and draws the distance matrix with one 'Individual<k>' label per entry
    along axis 1 of ``g``. Returns None.
    """
    alt_counts = g.to_n_alt()
    d = allel.pairwise_distance(alt_counts, metric='cityblock')
    print(f'The mean pairwise distance within the population is {round(np.mean(d),2)}')

    # One label per entry along the second axis of g
    n_individuals = g.shape[1]
    names = ['Individual' + str(idx) for idx in range(n_individuals)]
    allel.plot_pairwise_distance(d, labels=names)

    plt.title('Pairwise Distance Matrix')
    return None
Example #4
0
        popseg[pop] = ac_pops_vars[pop].count_segregating()
    pd.Series(popseg, index=popseg.keys()).to_csv(os.path.join(statsP, 'nSegAlleles.pop.txt'), header= False, index=True, sep= '\t')


    # Count segregating entries for every key of ac_nests_vars and write the
    # per-nest counts as a headerless, tab-separated two-column file
    # (nest key, count) named 'nSegAlleles.nest.txt' under statsP.
    nestseg = dict()
    for nest in ac_nests_vars.keys():
        nestseg[nest] = ac_nests_vars[nest].count_segregating()
    pd.Series(nestseg, index=nestseg.keys()).to_csv(os.path.join(statsP, 'nSegAlleles.nest.txt'), header= False, index=True, sep= '\t')




    #############   pairwise distance matrix   #############
    print("-----------  Calculating pairwise distance matrix  -----------")

    # City-block distance between samples, computed on the output of
    # gtvars.to_n_alt().
    dvar = al.pairwise_distance(gtvars.to_n_alt(), metric= 'cityblock')


    # heatmap with dendrogram

    # NOTE(review): squareform() converts condensed <-> square. If dvar is the
    # condensed vector (presumably, as returned by pairwise_distance), the result
    # here is the SQUARE matrix despite the "condensed" name -- confirm.
    condensedDvar = scipy.spatial.distance.squareform(dvar)

    # Assign one colour per distinct value of ids['nest']. zip() stops at the
    # shorter input, so nests beyond the palette length get no entry in n2col.
    # NOTE(review): confirm the number of nests fits the default palette.
    n2col = dict(zip(ids['nest'].unique(), sns.color_palette()))
    rowCols = np.array(ids['nest'].map(n2col))

    # Rows are labelled by nest, columns by sample id; rows are colour-coded
    # with rowCols in the clustered heatmap.
    cDdf = pd.DataFrame(condensedDvar, index= ids['nest'], columns=ids['id'])
    g = sns.clustermap(cDdf, row_colors= rowCols, cmap= 'jet')
    # Re-apply the existing tick labels with a fixed font size.
    g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xmajorticklabels(), fontsize = 12)     #ha= 'right', rotation= 40
    g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_ymajorticklabels(), fontsize = 12)

    # Save the figure as 'pwDist.png' under figsP.
    g.savefig(os.path.join(figsP, 'pwDist.png'), bbox_inches='tight')
Example #5
0
# Identify segregating, non-singleton positions.
# Restrict the genotype calls to the selected sample indices, then flatten
# them into a haplotype matrix.
genotype_calls = all_haplotypes[chrom]['calldata']['genotype']
hap_matrix = allel.GenotypeArray(
    genotype_calls[:, genotype_indices_list, :]
).to_haplotypes()
ac = hap_matrix.count_alleles()
# Keep rows whose smallest allele count is greater than 1.
non_singleton = ac.min(1) > 1
non_singleton_pos = pos[non_singleton]

# Get the range of SNPs that are within the region of interest, and that are segregating non-singletons.
# At this point SNP_range is the number of SNPs to take on each side of focus.
# NOTE(review): if SNP_range were 0, the [-0:] slice would select every
# upstream SNP rather than none -- confirm callers never pass 0.
pre_SNPs = non_singleton_pos[non_singleton_pos <= focus][(-SNP_range):]
post_SNPs = non_singleton_pos[non_singleton_pos > focus][:SNP_range]
# NOTE: SNP_range is rebound here from a count to the array of selected positions.
SNP_range = np.concatenate([pre_SNPs, post_SNPs])
# Vectorised membership test: indices into pos of the selected positions
# (replaces a Python-level `x in SNP_range` scan per element of pos).
SNP_pos = np.where(np.isin(pos, SNP_range))[0]

focal_hap_matrix = hap_matrix[SNP_pos, :]
dist = allel.pairwise_distance(focal_hap_matrix, metric='hamming')
# Count accessible bases between the first and last selected position.
# NOTE(review): the slice excludes the last position and uses positions
# directly as indices -- confirm the coordinate convention of is_accessible.
is_accessible = accessibility[chrom]['is_accessible'][
    SNP_range[0]:SNP_range[-1]]
n_bases = np.count_nonzero(is_accessible)
# Rescale the Hamming distance (a proportion over the selected variants) to
# differences per accessible base.
dist_dxy = dist * focal_hap_matrix.n_variants / n_bases
focal_clusters = find_clusters(dist_dxy, n=3, threshold=0.001)
# Presumably find_clusters returns clusters ordered largest first -- verify.
largest_focal_cluster = focal_clusters[0]
focal_cluster_members = focal_haplotypes[list(largest_focal_cluster)]

# Record the focal cluster as binary, with 1 showing haplotypes in the swept cluster, and 0 showing haplotypes
# that are not
focal_cluster_calls = np.zeros(focal_hap_matrix.shape[1])
focal_cluster_calls[list(largest_focal_cluster)] = 1

# Get genotype-level calls for the swept cluster and the P4 SNP
focal_cluster_genotype = [