def plot_pairwise_dist(self, labels=None, ax=None, cmap=None, cdict=None,
                       metric="euclidean"):
    """
    Plot pairwise distances between all samples.

    Parameters
    ----------
    labels: bool, list or None
        By default (None or False) no labels are drawn. If True, the
        labels are taken from ``self.samples_vcforder``. Alternatively a
        sequence of labels can be passed in; it must have the same length
        as the number of samples.
    ax: matplotlib axes or None
        Axes to draw on. When None, a new 5x5 figure holding a single
        subplot is created.
    cmap, cdict:
        Accepted for interface compatibility; the current implementation
        does not use them.
    metric: str
        Distance metric forwarded to ``allel.pairwise_distance``.

    Raises
    ------
    IPyradError
        If a label sequence is given whose length differs from the number
        of samples.
    """
    # Resolve and validate the labels first, so that a bad argument fails
    # before any distance is computed and before a figure is created
    # (otherwise an empty figure would be left behind).
    if labels is None or isinstance(labels, bool):
        labels = list(self.samples_vcforder) if labels else None
    else:
        # Normalise to a plain list so tuples and array-likes behave the
        # same as lists in the length check and in the plotting call.
        labels = list(labels)
        if len(labels) != len(self.samples_vcforder):
            raise IPyradError(
                LABELS_LENGTH_ERROR.format(len(labels),
                                           len(self.samples_vcforder)))

    allele_counts = self.genotypes.to_n_alt()
    dist = allel.pairwise_distance(allele_counts, metric=metric)

    # Explicit None check: only create a figure when the caller supplied
    # no axes at all.
    if ax is None:
        fig = plt.figure(figsize=(5, 5))
        ax = fig.add_subplot(1, 1, 1)

    allel.plot.pairwise_distance(dist, labels=labels, ax=ax, colorbar=False)
def test_pairwise_distance_multidim(self):
    """Check pairwise_distance with a callable metric on allele counts.

    The fixture's second axis has two entries, so a single distance is
    expected: the summed mean pairwise difference between columns 0 and 1.
    """
    calls = [
        [[0, 0], [0, 0]],
        [[1, 1], [1, 1]],
        [[1, 1], [2, 2]],
        [[0, 0], [0, 1]],
        [[0, 0], [0, 2]],
        [[1, 1], [1, 2]],
        [[0, 1], [0, 1]],
        [[0, 1], [1, 2]],
        [[0, 0], [-1, -1]],
        [[0, 1], [-1, -1]],
        [[-1, -1], [-1, -1]],
    ]
    allele_counts = GenotypeArray(calls, dtype='i1').to_allele_counts()

    def summed_mpd(left, right):
        # Collapse the per-variant differences into one scalar distance.
        per_variant = allel.mean_pairwise_difference_between(
            left, right, fill=0)
        return per_variant.sum()

    expected = [summed_mpd(allele_counts[:, 0], allele_counts[:, 1])]
    observed = allel.pairwise_distance(allele_counts, summed_mpd)
    aeq(expected, observed)
def get_pairwise_dist(g):
    """Print the mean pairwise distance and plot the distance matrix.

    ``g`` must provide ``to_n_alt()`` and ``shape`` (presumably a
    scikit-allel genotype array -- confirm against the caller). Distances
    use the ``'cityblock'`` metric on the alternate-allele counts. One
    label, ``Individual<k>``, is generated per entry along ``g.shape[1]``.
    Nothing is returned.
    """
    distances = allel.pairwise_distance(g.to_n_alt(), metric='cityblock')

    mean_distance = round(np.mean(distances), 2)
    print(f'The mean pairwise distance within the population is {mean_distance}')

    sample_labels = [f'Individual{idx}' for idx in range(g.shape[1])]
    allel.plot_pairwise_distance(distances, labels=sample_labels)
    plt.title('Pairwise Distance Matrix')
# NOTE(review): `pop` is not bound anywhere in the visible fragment; this
# assignment presumably belongs to a loop over ac_pops_vars whose header
# lies above this excerpt -- confirm the indentation before applying.
popseg[pop] = ac_pops_vars[pop].count_segregating()

# Write the per-population segregating counts as a headerless,
# tab-separated two-column table (key, count).
pd.Series(popseg, index=popseg.keys()).to_csv(
    os.path.join(statsP, 'nSegAlleles.pop.txt'),
    header=False, index=True, sep='\t')

# Same table, keyed by nest.
nestseg = dict()
for nest in ac_nests_vars.keys():
    nestseg[nest] = ac_nests_vars[nest].count_segregating()
pd.Series(nestseg, index=nestseg.keys()).to_csv(
    os.path.join(statsP, 'nSegAlleles.nest.txt'),
    header=False, index=True, sep='\t')

############# pairwise distance matrix #############
print("----------- Calculating pairwise distance matrix -----------")
# City-block distances computed on the alternate-allele counts.
dvar = al.pairwise_distance(gtvars.to_n_alt(), metric='cityblock')

# heatmap with dendrogram
# NOTE: despite its name, condensedDvar is the output of squareform(),
# i.e. the expanded square matrix rather than the condensed vector.
condensedDvar = scipy.spatial.distance.squareform(dvar)

# Request exactly one colour per nest. Zipping against the bare default
# palette silently truncated at the palette length, leaving any further
# nests without a colour (NaN after .map). With n_colors the palette is
# cycled when there are more nests than colours; for fewer nests the
# result is the same leading colours as before.
nestLevels = ids['nest'].unique()
n2col = dict(zip(nestLevels, sns.color_palette(n_colors=len(nestLevels))))
rowCols = np.array(ids['nest'].map(n2col))

cDdf = pd.DataFrame(condensedDvar, index=ids['nest'], columns=ids['id'])
g = sns.clustermap(cDdf, row_colors=rowCols, cmap='jet')
g.ax_heatmap.set_xticklabels(
    g.ax_heatmap.get_xmajorticklabels(), fontsize=12)  # ha= 'right', rotation= 40
g.ax_heatmap.set_yticklabels(
    g.ax_heatmap.get_ymajorticklabels(), fontsize=12)
g.savefig(os.path.join(figsP, 'pwDist.png'), bbox_inches='tight')
# Identify segregating, non-singleton positions hap_matrix = allel.GenotypeArray(all_haplotypes[chrom]['calldata']['genotype'] [:, genotype_indices_list, :]).to_haplotypes() ac = hap_matrix.count_alleles() non_singleton = ac.min(1) > 1 non_singleton_pos = pos[non_singleton] # Get the range of SNPs that are within the region of interest, and that are segregating non-singletons pre_SNPs = non_singleton_pos[non_singleton_pos <= focus][(-SNP_range):] post_SNPs = non_singleton_pos[non_singleton_pos > focus][:SNP_range] SNP_range = np.concatenate([pre_SNPs, post_SNPs]) SNP_pos = np.where([x in SNP_range for x in pos])[0] focal_hap_matrix = hap_matrix[SNP_pos, :] dist = allel.pairwise_distance(focal_hap_matrix, metric='hamming') is_accessible = accessibility[chrom]['is_accessible'][ SNP_range[0]:SNP_range[-1]] n_bases = np.count_nonzero(is_accessible) dist_dxy = dist * focal_hap_matrix.n_variants / n_bases focal_clusters = find_clusters(dist_dxy, n=3, threshold=0.001) largest_focal_cluster = focal_clusters[0] focal_cluster_members = focal_haplotypes[list(largest_focal_cluster)] # Record the focal cluster as binary, with 1 showing haplotypes in the swept cluster, and 0 showing haplotypes # that are not focal_cluster_calls = np.zeros(focal_hap_matrix.shape[1]) focal_cluster_calls[list(largest_focal_cluster)] = 1 # Get genotype-level calls for the swept cluster and the P4 SNP focal_cluster_genotype = [