def get_group_symmetry_reference_matched(self, ref_cs):
    """Return the 1-based index of the group whose cell is closest to a reference.

    For every entry of ``self.groups`` the averaged P1 cell (from
    ``self._average_p1_cell``) is Niggli-reduced, converted with
    ``xtal.v6cell`` and compared with the Niggli-reduced reference cell using
    ``NCDist``. Each distance is printed as soon as it is computed.

    :param ref_cs: reference symmetry object; only
                   ``ref_cs.niggli_cell().unit_cell()`` is used.
    :return: ``index + 1`` of the group with the smallest NCDist; on ties the
             first such group wins.
    :raises ValueError: if ``self.groups`` is empty (``min`` of an empty list).
    """
    ref_v6 = xtal.v6cell(ref_cs.niggli_cell().unit_cell())
    ncdists = []
    for i, keys in enumerate(self.groups):
        reduced_cell = uctbx.unit_cell(self._average_p1_cell(keys)).niggli_cell()
        distance = NCDist(xtal.v6cell(reduced_cell), ref_v6)
        ncdists.append(distance)
        # Parenthesised single-argument form: prints identically under
        # Python 2 and Python 3 (the bare print statement is Python 2 only).
        print("Group %d: NCDist to reference: %f" % (i + 1, distance))
    return ncdists.index(min(ncdists)) + 1
def distance_from(self, other_uc):
    """
    Compute the NCDist (Andrews and Bernstein, J. Appl. Cryst. 2014)
    separating this frame's unit cell from another unit cell.

    Both cells are first converted with ``self.make_g6``.

    :param other_uc: a 6-tuple of a, b, c, alpha, beta, gamma for some unit
                     cell
    :return: the NCDist in A^2 to other_uc
    """
    from cctbx.uctbx.determine_unit_cell import NCDist
    own_vector = self.make_g6(self.uc)
    return NCDist(own_vector, self.make_g6(other_uc))
def run_one(path):
    """Check NCDist against NCDist2017 on consecutive cells read from *path*.

    Unit cells are produced by ``generate_unit_cells_from_text(path)`` and
    converted with ``SingleFrame.make_g6``. For each pair of neighbouring G6
    vectors the function asserts that ``NCDist`` and ``NCDist2017`` return
    the same value and that ``NCDist2017`` is symmetric in its arguments.

    :param path: passed unchanged to ``generate_unit_cells_from_text``.
    :raises AssertionError: if any neighbouring pair disagrees.
    """
    cells = list(generate_unit_cells_from_text(path))
    g6 = [SingleFrame.make_g6(u) for u in cells]
    # For the purpose of this test, walk through neighbouring pairs of G6
    # vectors. zip() replaces the index loop over xrange(), which does not
    # exist on Python 3.
    for a, b in zip(g6, g6[1:]):
        old = NCDist(a, b)
        new = NCDist2017(a, b)
        com = NCDist2017(b, a)
        assert old == new
        assert new == com
def run_one(path):
    """Check NCDist against NCDist2017 on consecutive cells read from *path*.

    Unit cells are produced by ``generate_unit_cells_from_text(path)`` and
    converted with ``SingleFrame.make_g6``. For each pair of neighbouring G6
    vectors the function asserts that ``NCDist`` and ``NCDist2017`` agree and
    that ``NCDist2017`` is symmetric in its arguments. The ``NCDist2017``
    calls are made with the OpenMP thread count forced to 1.

    :param path: passed unchanged to ``generate_unit_cells_from_text``.
    :raises AssertionError: if any neighbouring pair disagrees.
    """
    import os
    import omptbx

    cells = list(generate_unit_cells_from_text(path))
    g6 = [SingleFrame.make_g6(u) for u in cells]

    # workaround allows use of non-thread-safe NCDist, even if openMP is
    # enabled elsewhere in the Python program. The value restored afterwards
    # is taken from OMP_NUM_THREADS (1 when the variable is unset).
    workaround_nt = int(os.environ.get("OMP_NUM_THREADS", 1))

    # for the purpose of this test, cycle through pairs of g6 vectors
    for ix, (a, b) in enumerate(zip(g6, g6[1:])):
        old = NCDist(a, b)
        omptbx.omp_set_num_threads(1)
        try:
            new = NCDist2017(a, b)
            com = NCDist2017(b, a)
        finally:
            # Restore the thread count even when NCDist2017 raises, so a
            # failure here does not leave the process single-threaded.
            omptbx.omp_set_num_threads(workaround_nt)
        assert old == new, "Zeldin, AB2017"
        assert new == com, "Pair %d NCDist(a,b) %f != NCDist(b,a) %f" % (
            ix, new, com)
def ab_cluster(self, threshold=10000, method='distance',
               linkage_method='single', log=False,
               ax=None, write_file_lists=True, schnell=False, doplot=True,
               labels='default'):
    """
    Hierarchical clustering using the unit cell dimensions.

    :param threshold: the threshold used for pruning the tree into clusters
        (passed to ``scipy.cluster.hierarchy.fcluster``) and as the colour
        threshold of the dendrogram.
    :param method: passed as the ``criterion`` argument of
        ``scipy.cluster.hierarchy.fcluster``.
    :param linkage_method: which linkage method from scipy to use when
        creating the linkages (see scipy.cluster.hierarchy).
    :param log: if True, use a symlog scale on the y axis of the plot.
    :param ax: if a matplotlib axes object is provided, plot to this.
        Otherwise, create a new figure, save it as
        ``<self.cname>_dendogram.pdf`` and display it on screen.
    :param write_file_lists: if True, write out the files that make up each
        cluster with more than one member (``<cluster name>.lst``).
    :param schnell: if True, use simple Euclidean distance, otherwise, use
        Andrews-Bernstein distance from Andrews & Bernstein J Appl Cryst
        47:346 (2014) on the Niggli cells.
    :param doplot: Boolean flag for if the plotting should be done at all.
        Runs faster if switched off.
    :param labels: 'default' will not display any labels for more than 100
        images, but will display image names for fewer. True always shows
        image names and False always shows blank labels. Any other value is
        used as an attribute name read from each member (blank if missing).
    :return: a 2-tuple ``(sub_clusters, ax)``. ``sub_clusters`` is the list
        of clusters sorted by ascending number of members and renamed
        ``cluster_1``, ``cluster_2``, ... in that order. ``ax`` is the axes
        the dendrogram was drawn on, or the ``ax`` argument unchanged
        (possibly None) when ``doplot`` is false.

    .. note::
      Use 'schnell' option with caution, since it can cause strange behaviour
      around symmetry boundaries.
    """
    logging.info("Hierarchical clustering of unit cells")
    import scipy.spatial.distance as dist
    import scipy.cluster.hierarchy as hcluster

    # 1. Create a numpy array of G6 cells
    g6_cells = np.array(
        [SingleFrame.make_g6(image.uc) for image in self.members])

    # 2. Do hierarchical clustering on the condensed pairwise distances.
    if schnell:
        logging.info("Using Euclidean distance")
        pair_distances = dist.pdist(g6_cells, metric='euclidean')
        logging.info("Distances have been calculated")
        this_linkage = hcluster.linkage(pair_distances,
                                        method=linkage_method,
                                        metric='euclidean')
    else:
        logging.info(
            "Using Andrews-Bernstein distance from Andrews & Bernstein "
            "J Appl Cryst 47:346 (2014)")
        pair_distances = dist.pdist(g6_cells,
                                    metric=lambda a, b: NCDist(a, b))
        logging.info("Distances have been calculated")
        this_linkage = hcluster.linkage(pair_distances,
                                        method=linkage_method,
                                        metric=lambda a, b: NCDist(a, b))
    cluster_ids = hcluster.fcluster(this_linkage,
                                    threshold,
                                    criterion=method)
    logging.debug("Clusters have been calculated")

    # 3. Create an array of sub-cluster objects from the clustering.
    # The description is the same for every sub-cluster, so build it once.
    info_string = ('Made using ab_cluster with t={},'
                   ' {} method, and {} linkage').format(
                       threshold, method, linkage_method)
    sub_clusters = []
    for cluster_id in range(1, max(cluster_ids) + 1):
        cluster_members = [
            image for image, image_id in zip(self.members, cluster_ids)
            if image_id == cluster_id
        ]
        sub_clusters.append(
            self.make_sub_cluster(cluster_members,
                                  'cluster_{}'.format(cluster_id),
                                  info_string))

    # Order by size (smallest first; the sort is stable) and rename to match.
    sub_clusters.sort(key=lambda x: len(x.members))
    for num, cluster in enumerate(sub_clusters, 1):
        cluster.cname = 'cluster_{}'.format(num)

    # 3.5 optionally write out the clusters to files.
    if write_file_lists:
        for cluster in sub_clusters:
            if len(cluster.members) > 1:
                cluster.dump_file_list(
                    out_file_name="{}.lst".format(cluster.cname))

    if doplot:
        if labels is True:
            labels = [image.name for image in self.members]
        elif labels is False:
            labels = ['' for _ in self.members]
        elif labels == 'default':
            if len(self.members) > 100:
                labels = ['' for _ in self.members]
            else:
                labels = [image.name for image in self.members]
        else:
            labels = [getattr(v, labels, '') for v in self.members]

        # 4. Plot a dendrogram to the axes if no axis is passed, otherwise
        #    just draw on (and return) the axes object that was given.
        if ax is None:
            fig = plt.figure("Distance Dendogram")
            ax = fig.gca()
            direct_visualisation = True
        else:
            direct_visualisation = False

        hcluster.dendrogram(this_linkage,
                            labels=labels,
                            leaf_font_size=8,
                            leaf_rotation=90.0,
                            color_threshold=threshold,
                            ax=ax)

        if log:
            # NOTE(review): `linthreshx` is the x-axis spelling of the symlog
            # threshold keyword and the value is a tuple; left unchanged here
            # - confirm against the Matplotlib version this code runs with.
            ax.set_yscale("symlog", linthreshx=(-1, 1))
        else:
            ax.set_ylim(-ax.get_ylim()[1] / 100, ax.get_ylim()[1])

        if direct_visualisation:
            fig.savefig("{}_dendogram.pdf".format(self.cname))
            plt.show()

    return sub_clusters, ax
def set_chunk_stats(chunk, stats, stat_choice, n_residues=None, ref_cell=None,
                    space_group=None, d_min=None, ref_data=None):
    """Append one value per statistic for *chunk* to the lists in *stats*.

    Exactly one value is appended to each of ``stats["reslimit"]``,
    ``stats["pr"]``, ``stats["ccref"]``, ``stats["ioversigma"]``,
    ``stats["resnatsnr1"]``, ``stats["abdist"]`` and ``stats["wilsonb"]``.
    Statistics not named in *stat_choice* are recorded as NaN.

    :param chunk: object providing ``res_lim``, ``profile_radius``, ``cell``
                  and ``data_array(space_group, False)``; each is only
                  accessed when the corresponding statistic is requested.
    :param stats: mapping from statistic name to a list; modified in place.
    :param stat_choice: container of requested statistic names.
    :param n_residues: forwarded to ``ml_iso_absolute_scaling`` ("wilsonb").
    :param ref_cell: reference cell passed to ``make_G6`` ("abdist").
    :param space_group: forwarded to ``chunk.data_array``.
    :param d_min: if truthy, intensities are resolution-filtered with this
                  limit after "resnatsnr1" is evaluated and before "ccref",
                  "ioversigma" and "wilsonb" are evaluated.
    :param ref_data: reference array for the "ccref" correlation.
    :return: None
    """
    if "reslimit" in stat_choice:
        stats["reslimit"].append(chunk.res_lim)
    else:
        stats["reslimit"].append(float("nan"))

    if "pr" in stat_choice:
        stats["pr"].append(chunk.profile_radius)
    else:
        stats["pr"].append(float("nan"))

    # Placeholder; replaced below only when a well-defined correlation exists.
    stats["ccref"].append(float("nan"))

    # "wilsonb" is listed here because it also needs the merged intensities;
    # without it, requesting "wilsonb" alone hit an unbound `iobs` below.
    intensity_stats = {"ioversigma", "resnatsnr1", "ccref", "wilsonb"}
    if intensity_stats.intersection(stat_choice):
        iobs = chunk.data_array(space_group, False)
        iobs = iobs.select(iobs.sigmas() > 0).merge_equivalents(
            use_internal_variance=False).array()
        binner = iobs.setup_binner(auto_binning=True)

        if "resnatsnr1" in stat_choice:
            # Second element of bin_d_range() for the first non-empty bin
            # whose mean I/sigma is <= 1; NaN when no bin qualifies.
            res = float("nan")
            for i_bin in binner.range_used():
                tmp = iobs.select(binner.selection(i_bin))
                if tmp.size() == 0:
                    continue
                sn = flex.mean(tmp.data() / tmp.sigmas())
                if sn <= 1:
                    res = binner.bin_d_range(i_bin)[1]
                    break
            stats["resnatsnr1"].append(res)
        else:
            stats["resnatsnr1"].append(float("nan"))

        if d_min:
            iobs = iobs.resolution_filter(d_min=d_min)

        if "ccref" in stat_choice:
            corr = iobs.correlation(ref_data,
                                    assert_is_similar_symmetry=False)
            if corr.is_well_defined():
                stats["ccref"][-1] = corr.coefficient()

        if "ioversigma" in stat_choice:
            stats["ioversigma"].append(
                flex.mean(iobs.data() / iobs.sigmas()))
        else:
            stats["ioversigma"].append(float("nan"))
    else:
        stats["ioversigma"].append(float("nan"))
        stats["resnatsnr1"].append(float("nan"))

    if "abdist" in stat_choice:
        from cctbx.uctbx.determine_unit_cell import NCDist
        G6a, G6b = make_G6(ref_cell), make_G6(chunk.cell)
        stats["abdist"].append(NCDist(G6a, G6b))
    else:
        stats["abdist"].append(float("nan"))

    if "wilsonb" in stat_choice:
        iso_scale_and_b = ml_iso_absolute_scaling(iobs, n_residues, 0)
        stats["wilsonb"].append(iso_scale_and_b.b_wilson)
    else:
        stats["wilsonb"].append(float("nan"))
def ab_cluster(self, threshold=10000, method='distance',
               linkage_method='single', log=False, plot=False):
    """
    Cluster the members hierarchically by Andrews-Bernstein (NCDist)
    distance between their G6 cell vectors, print a summary table, and
    optionally draw a dendrogram.

    :param threshold: threshold given to ``hcluster.fcluster`` and used as
        the dendrogram colour threshold.
    :param method: passed as the ``criterion`` argument of
        ``hcluster.fcluster``.
    :param linkage_method: passed as ``method`` to ``hcluster.linkage``.
    :param log: if True (and ``plot`` is set) use a log scale on the y axis.
    :param plot: if True, draw the dendrogram, save it as
        ``<self.cname>_dendogram.pdf`` and show it.
    :return: list of sub-clusters, in cluster-id order.
    """
    print("Hierarchical clustering of unit cells:")
    import scipy.spatial.distance as dist
    print(
        "Using Andrews-Bernstein Distance from Andrews & Bernstein J Appl Cryst 47:346 (2014)."
    )

    def make_g6(uc):
        """ Take a reduced Niggli Cell, and turn it into the G6 representation """
        # NOTE(review): math.cos expects radians - confirm that uc[3:6] are
        # not stored in degrees.
        len_a, len_b, len_c = uc[0], uc[1], uc[2]
        return [
            len_a**2,
            len_b**2,
            len_c**2,
            2 * len_b * len_c * math.cos(uc[3]),
            2 * len_a * len_c * math.cos(uc[4]),
            2 * len_a * len_b * math.cos(uc[5]),
        ]

    # Step 1: one G6 vector per member, stacked into a numpy array.
    g6_cells = np.array([make_g6(member.uc) for member in self.members])

    # Step 2: pairwise NCDist values, then the linkage tree and flat clusters.
    pair_distances = dist.pdist(g6_cells, metric=lambda a, b: NCDist(a, b))
    logging.debug("Distances have been calculated")
    this_linkage = hcluster.linkage(pair_distances,
                                    method=linkage_method,
                                    metric=lambda a, b: NCDist(a, b))
    cluster_ids = hcluster.fcluster(this_linkage,
                                    threshold,
                                    criterion=method)
    logging.debug("Clusters have been calculated")

    # Turn every flat cluster id (1..max) into a sub-cluster object.
    n_clusters = max(cluster_ids)
    info_string = ('Made using ab_cluster with t={},'
                   ' {} method, and {} linkage').format(
                       threshold, method, linkage_method)
    sub_clusters = []
    for cluster_id in range(1, n_clusters + 1):
        chosen = [
            member for member, member_id in zip(self.members, cluster_ids)
            if member_id == cluster_id
        ]
        sub_clusters.append(
            self.make_sub_cluster(chosen, 'cluster_{}'.format(cluster_id),
                                  info_string))

    # Step 3: assemble the report piece by piece and print it in one go.
    cluster_row = ("\n{:^5} {:^14} {:<5.1f}({:<4.1f}) {:<5.1f}({:<4.1f})"
                   " {:<5.1f}({:<4.1f}) {:<6.2f}({:<4.2f}) {:<6.2f}"
                   "({:<4.2f}) {:<6.2f}({:<4.2f})")
    singleton_row = ("{:<14} {:<11.1f} {:<11.1f} {:<11.1f}"
                     "{:<12.1f} {:<12.1f} {:<12.1f}")
    report = [
        "{} clusters have been identified.".format(n_clusters),
        "\n{:^5} {:^14} {:<11} {:<11} {:<11} {:<12} {:<12} {:<12}".format(
            "C_id", "Num in cluster", "Med_a", "Med_b", "Med_c",
            "Med_alpha", "Med_beta", "Med_gamma"),
    ]
    singletons = []
    for sub in sub_clusters:
        if len(sub.members) == 1:
            cell = sub.members[0].uc
            singletons.append(
                singleton_row.format(
                    list(sub.pg_composition.keys())[0],
                    *[cell[k] for k in range(6)]) + '\n')
            continue
        # Point groups listed from most to least populated.
        by_count = sorted(sub.pg_composition.items(),
                          key=lambda item: -item[1])
        point_group_string = ", ".join(
            "{} in {}".format(count, name) for name, count in by_count) + "."
        # Interleave median and standard deviation for each cell parameter.
        median_sd = []
        for k in range(6):
            median_sd.append(sub.medians[k])
            median_sd.append(sub.stdevs[k])
        report.append(
            cluster_row.format(sub.cname, len(sub.members), *median_sd))
        report.append("\n" + point_group_string)
    report.append("\nStandard deviations are in brackets.")
    report.append("\n" + str(len(singletons)) + " singletons:")
    report.append(
        "\n{:^14} {:<11} {:<11} {:<11} {:<12} {:<12} {:<12}".format(
            "Point group", "a", "b", "c", "alpha", "beta", "gamma"))
    report.extend(singletons)
    print("".join(report))

    if plot:
        import matplotlib.pyplot as plt
        fig = plt.figure("Distance Dendogram")
        leaf_names = [member.name for member in self.members]
        hcluster.dendrogram(this_linkage,
                            labels=leaf_names,
                            leaf_font_size=8,
                            color_threshold=threshold)
        ax = fig.gca()
        if log:
            ax.set_yscale("log")
        else:
            # Leave a small margin (1% of the upper limit) below zero.
            upper = ax.get_ylim()[1]
            ax.set_ylim(-upper / 100, upper)
        fig.savefig("{}_dendogram.pdf".format(self.cname))
        plt.show()

    return sub_clusters
def calc_stats(xac_file, stat_choice, n_residues=None, ref_v6cell=None,
               min_peak=None, min_peak_percentile=None, correct_peak=None):
    """Compute the requested per-file statistics for one XDS_ASCII data set.

    The file is loaded, reflections are optionally rejected or rescaled by
    their PEAK value, and the intensities are merged (sigma > 0 only) before
    the statistics are evaluated. The resulting dictionary is printed unless
    the function returns early because no reflections are left.

    :param xac_file: path of the input; names ending in ``.pkl`` are
                     unpickled, anything else is read with
                     ``xds_ascii.XDS_ASCII``.
    :param stat_choice: container of statistic names; "ioversigma",
                        "resnatsnr1", "abdist" and "wilsonb" are recognised.
    :param n_residues: forwarded to ``ml_iso_absolute_scaling`` ("wilsonb").
    :param ref_v6cell: reference cell vector for NCDist ("abdist").
    :param min_peak: if not None, reject reflections with PEAK below this.
    :param min_peak_percentile: used only when ``min_peak`` is None; reject
                                reflections whose PEAK is below this
                                percentile of all PEAK values.
    :param correct_peak: if truthy, also reject reflections with PEAK < 1 and
                         multiply intensities and sigmas by ``PEAK * 0.01``.
    :return: dict with "filename" and "cell" plus one key per computed
             statistic; only "filename" and "cell" when no reflections remain
             after merging.
    """
    # Open XDS_ASCII
    if xac_file.endswith(".pkl"):
        # NOTE: unpickling runs arbitrary code from the file - only feed
        # this trusted pickles. Binary mode + context manager closes the
        # handle and is correct for every pickle protocol.
        with open(xac_file, "rb") as pkl_file:
            xac = pickle.load(pkl_file)
    else:
        xac = xds_ascii.XDS_ASCII(xac_file)

    sel_remove = flex.bool(xac.iobs.size(), False)
    if min_peak is not None:
        sel_remove |= xac.peak < min_peak
    elif min_peak_percentile is not None:
        q = numpy.percentile(xac.peak, min_peak_percentile)
        print("percentile %.2f %s" % (q, xac))
        sel_remove |= xac.peak < q

    if correct_peak:
        sel_remove |= (xac.peak < 1)  # remove PEAK==0

    xac.remove_selection(sel_remove)

    # Use the function's own argument; the previous `params.correct_peak`
    # referred to a name that is not defined in this function.
    if correct_peak:
        xac.iobs *= xac.peak * .01
        xac.sigma_iobs *= xac.peak * .01

    iobs = xac.i_obs(anomalous_flag=False)
    iobs = iobs.select(iobs.sigmas() > 0).merge_equivalents(
        use_internal_variance=False).array()

    stats = dict(filename=xac_file, cell=iobs.unit_cell().parameters())

    if iobs.size() == 0:
        return stats

    if "ioversigma" in stat_choice or "resnatsnr1" in stat_choice:
        binner = iobs.setup_binner(auto_binning=True)

        if "ioversigma" in stat_choice:
            stats["ioversigma"] = flex.mean(iobs.data() / iobs.sigmas())

        if "resnatsnr1" in stat_choice:
            # Second element of bin_d_range() for the first non-empty bin
            # whose mean I/sigma is <= 1; NaN when no bin qualifies.
            res = float("nan")
            for i_bin in binner.range_used():
                tmp = iobs.select(binner.selection(i_bin))
                if tmp.size() == 0:
                    continue
                sn = flex.mean(tmp.data() / tmp.sigmas())
                if sn <= 1:
                    res = binner.bin_d_range(i_bin)[1]
                    break
            stats["resnatsnr1"] = res

    if "abdist" in stat_choice:
        from cctbx.uctbx.determine_unit_cell import NCDist
        G6a, G6b = ref_v6cell, v6cell(iobs.unit_cell().niggli_cell())
        stats["abdist"] = NCDist(G6a, G6b)

    if "wilsonb" in stat_choice:
        iso_scale_and_b = ml_iso_absolute_scaling(iobs, n_residues, 0)
        stats["wilsonb"] = iso_scale_and_b.b_wilson

    print(stats)
    return stats