def cluster_events(events, cutoff=10, linkage='average'):
    """Cluster events into sites by the distance between their centroids"""
    # A single event forms a single site
    if len(events) == 1:
        return SiteList([Site(events=events, id=1).apply_parentage()])
    # Hierarchically cluster the event centroids
    centroids = [e.cluster.centroid for e in events]
    cluster_ids = scipy.cluster.hierarchy.fclusterdata(
        X=centroids,
        t=cutoff,
        criterion='distance',
        metric='euclidean',
        method=linkage,
    )
    cluster_ids = list(cluster_ids)
    # Create one Site per cluster of events
    sites = []
    for s_idx, e_idxs in generate_group_idxs(cluster_ids):
        assert s_idx > 0
        new_site = Site([events[i] for i in e_idxs], id=s_idx).apply_parentage()
        sites.append(new_site)
    return SiteList(sites)
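# A minimal sketch of the `generate_group_idxs` helper assumed throughout
# these functions (its real implementation lives elsewhere in the codebase):
# it yields one (value, [indices]) pair per unique value in the input, which
# is how cluster ids from fclusterdata are mapped back to their members.
import itertools

def generate_group_idxs(group_vals):
    # Sort (index, value) pairs by value, then group runs of equal values
    pairs = sorted(enumerate(group_vals), key=lambda p: p[1])
    for val, grp in itertools.groupby(pairs, key=lambda p: p[1]):
        yield val, [i for i, v in grp]

# e.g. list(generate_group_idxs([1, 2, 1, 3])) == [(1, [0, 2]), (2, [1]), (3, [3])]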
def cluster_high_z_values(self, z_map_data, point_mask_idx):
    """Find all points in the z-map with values above `self.params.contour_level`,
    then cluster them hierarchically using a distance cutoff of
    `self.grid_clustering_cutoff` (in grid units)"""
    # Select the masked values from the map
    point_mask_idx = flex.size_t(point_mask_idx)
    point_mask_val = z_map_data.select(point_mask_idx)
    # Find values above the cutoff (optionally also large negative values)
    if self.params.negative_values:
        above_idx = (point_mask_val >= self.params.contour_level).iselection()
        below_idx = (point_mask_val <= -1.0 * self.params.contour_level).iselection()
        sel_idx = above_idx.concatenate(below_idx)
    else:
        sel_idx = (point_mask_val >= self.params.contour_level).iselection()
    # Extract values and grid points for these sites
    above_val = point_mask_val.select(sel_idx)
    above_idx = point_mask_idx.select(sel_idx)
    above_gps = flex.vec3_double(
        [idx_to_grid(i, grid_size=z_map_data.all()) for i in above_idx])
    above_len = len(above_val)
    # No points found
    if above_len == 0:
        return 0, []
    # One point found -- it forms its own cluster
    elif above_len == 1:
        return 1, [(above_gps, above_val)]
    # Can't cluster if there are too many points
    elif above_len > 10000:
        return -1, [(above_gps, above_val)]
    # Cluster the points that have been found
    else:
        self.log('> Clustering {!s} Points.'.format(above_len))
        # Cluster the extracted points
        t1 = time.time()
        cluster_ids = scipy.cluster.hierarchy.fclusterdata(
            X=above_gps,
            t=self.grid_clustering_cutoff,
            criterion='distance',
            metric='euclidean',
            method='single')
        cluster_ids = list(cluster_ids)
        t2 = time.time()
        self.log('> Clustering > Time Taken: {!s} seconds'.format(int(t2 - t1)))
        # Get the number of clusters
        num_clusters = max(cluster_ids)
        # Group the values by cluster id
        z_clusters = []
        for c_id, c_idxs in generate_group_idxs(cluster_ids):
            c_idxs = flex.size_t(c_idxs)
            c_gps = above_gps.select(c_idxs)
            c_val = above_val.select(c_idxs)
            z_clusters.append((c_gps, c_val))
        assert num_clusters == len(z_clusters)
        return num_clusters, z_clusters
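# A sketch of the `idx_to_grid` conversion assumed above. cctbx flex map
# arrays are C-ordered (last axis fastest), so a flat index unravels into a
# 3D grid point as below; the real helper may differ in its details.
def idx_to_grid(idx, grid_size):
    nx, ny, nz = grid_size
    return (idx // (ny * nz), (idx // nz) % ny, idx % nz)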
def group_clusters(cluster_ids, num_clusters, above_gps, above_val):
    # Group the values by cluster id
    z_clusters = []
    for c_id, c_idxs in generate_group_idxs(cluster_ids):
        c_idxs = flex.size_t(c_idxs)
        c_gps = above_gps.select(c_idxs)
        c_val = above_val.select(c_idxs)
        z_clusters.append((c_gps, c_val))
    assert num_clusters == len(z_clusters)
    return z_clusters
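# Usage note: this function factors out the grouping step at the end of
# cluster_high_z_values, so the inline block there could be replaced with
#   z_clusters = group_clusters(cluster_ids, num_clusters, above_gps, above_val)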
def by_unit_cell(cls, crystals, method='lcv', cutoff=0.5):
    """Cluster crystals by unit cell and return a CrystalGroup for each cluster"""
    if len(crystals) == 1:
        return [cls(crystals)]
    assert method in ['lcv'], 'method not recognised'
    # # Method 1
    # if method == 'lcv':
    #     link_func = lambda a, b: lcv_from_unit_cells(a.unit_cell, b.unit_cell)
    # hierarchy = libtbx.cluster.HierarchicalClustering(crystals, link_func)
    # clusters = hierarchy.getlevel(cutoff)
    # return [cls(c) for c in clusters]
    # Method 2
    if method == 'lcv':
        link_func = pairwise_lcv
    # Build the pairwise distance matrix and cluster hierarchically
    dist_mat = link_func(unit_cells=[c.unit_cell for c in crystals])
    link_mat = scipy.cluster.hierarchy.linkage(dist_mat, method='single', metric='euclidean')
    clusters = scipy.cluster.hierarchy.fcluster(link_mat, t=cutoff, criterion='distance')
    return [cls([crystals[idx] for idx in g]) for i_g, g in generate_group_idxs(clusters)]
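# A stand-in sketch for `pairwise_lcv`, assuming it returns a condensed
# distance matrix over the unit cells (the form scipy.cluster.hierarchy.linkage
# expects). The real LCV metric is defined elsewhere; a plain euclidean
# distance over the six cell parameters is used here purely to illustrate
# the expected input/output shapes.
import scipy.spatial.distance

def pairwise_cell_distance(unit_cells):
    cell_params = [uc.parameters() for uc in unit_cells]  # (a, b, c, alpha, beta, gamma)
    return scipy.spatial.distance.pdist(cell_params, metric='euclidean')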
def group_clusters(self, z_clusters, separation_cutoff=5):
    """Join clusters that are separated by less than `separation_cutoff` angstroms"""
    if len(z_clusters) == 1:
        return 1, z_clusters

    self.log('----------------------------------->>>')
    self.log('Grouping Nearby Clusters')

    # Minimum distance between grid points to be joined (squared)
    grid_cutoff_sq = (separation_cutoff / self.grid_spacing)**2

    # Record which clusters are to be joined
    connect_array = numpy.zeros((len(z_clusters), len(z_clusters)), dtype=int)
    for i_clust_1, (c_gps_1, c_val_1) in enumerate(z_clusters):
        for i_clust_2, (c_gps_2, c_val_2) in enumerate(z_clusters):
            # A cluster is always connected to itself
            if i_clust_1 == i_clust_2:
                connect_array[(i_clust_1, i_clust_2)] = 1
                continue
            # Extract the minimum separation of the grid points
            min_dist_sq = min([min((c_gps_2 - gp).dot()) for gp in c_gps_1])
            # Check to see if the clusters should be joined
            if min_dist_sq < grid_cutoff_sq:
                connect_array[(i_clust_1, i_clust_2)] = 1
    # Cluster the connection array
    cluster_groupings = find_connected_groups(connection_matrix=connect_array)
    # Concatenate smaller clusters into larger clusters
    grouped_clusters = []
    for g_id, g_idxs in generate_group_idxs(cluster_groupings):
        g_gps = []
        g_val = []
        for i in g_idxs:
            g_gps.extend(z_clusters[i][0])
            g_val.extend(z_clusters[i][1])
        g_gps = flex.vec3_double(g_gps)
        g_val = flex.double(g_val)
        grouped_clusters.append((g_gps, g_val))
    assert len(grouped_clusters) == max(cluster_groupings)
    self.log('Grouped {!s} Clusters together to form {!s} Clusters'.format(
        len(z_clusters), len(grouped_clusters)))
    return len(grouped_clusters), grouped_clusters
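# A minimal sketch of `find_connected_groups`, assuming it labels the
# connected components of the symmetric 0/1 connection matrix and returns
# 1-based group ids (consistent with the max(cluster_groupings) assertion
# above).
import numpy
import scipy.sparse.csgraph

def find_connected_groups(connection_matrix):
    n_groups, labels = scipy.sparse.csgraph.connected_components(
        numpy.asarray(connection_matrix), directed=False)
    # connected_components labels from 0; shift to 1-based group ids
    return list(labels + 1)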
def filter_z_clusters_3(self, z_clusters, dataset, max_contact_dist=8):
    """Find and remove symmetry-equivalent clusters"""
    if len(z_clusters) == 1:
        return 1, z_clusters

    self.log('----------------------------------->>>')
    self.log('Filtering symmetry equivalent clusters')

    # Extract the protein sites in the reference frame
    d_sites_cart = protein(dataset.model.hierarchy).atoms().extract_xyz()
    d_unit_cell = dataset.model.unit_cell
    d_sym_ops = dataset.model.crystal_contact_operators()

    # Cartesianise and fractionalise the points in each of the clusters (in the crystallographic frame)
    points_cart = [None] * len(z_clusters)
    points_frac = [None] * len(z_clusters)
    for c_idx, (c_gps, c_val) in enumerate(z_clusters):
        # Extract points in cluster
        points_cart[c_idx] = dataset.model.alignment.ref2nat(self.grid.grid2cart(c_gps))
        # Fractionalise them to the unit cell of the dataset
        points_frac[c_idx] = d_unit_cell.fractionalize(points_cart[c_idx])
    # Find the sets of clusters that are symmetry-related
    sym_equiv_groups = find_symmetry_equivalent_groups(
        points_frac=points_frac,
        sym_ops=d_sym_ops,
        unit_cell=d_unit_cell,
        cutoff_cart=1.05 * 1.7321 * self.grid_spacing)
    # max_contact_dist - a point contacts an atom if the atom is within this distance of it
    # Save time by comparing against the square of the contact distance
    max_contact_dist_sq = max_contact_dist**2
    # Iterate through the groups and choose one cluster from each to keep
    filt_z_clusters = []
    for g_id, g_idxs in generate_group_idxs(sym_equiv_groups):
        # Count the number of contacts for each cluster in the group
        c_contacts = []
        # Iterate through the clusters in the group
        for c_idx in g_idxs:
            # Initialise contact counter
            contacts = 0
            # Get the cartesian points for the cluster
            c_points_cart = points_cart[c_idx]
            # Again, use the brute-force all-vs-all method
            for rp in d_sites_cart:
                diffs_cart = c_points_cart - rp
                # Check to see if the site is closer to the cluster than the minimum
                if min(diffs_cart.dot()) < max_contact_dist_sq:
                    contacts += 1
            # Record the number of contacts (normalised by the size of the cluster)
            c_contacts.append(1.0 * contacts / len(c_points_cart))
            # if self.log.verbose:
            #     print('CLUSTER:', c_idx, ', CONTACTS PER POINT:', round(c_contacts[-1], 3))
        # Keep the cluster with the most contacts
        max_contacts = max(c_contacts)
        if max_contacts == 0:
            raise Exception('MAX CONTACTS IS 0!')
        else:
            cluster_to_keep = g_idxs[c_contacts.index(max_contacts)]
            filt_z_clusters.append(z_clusters[cluster_to_keep])
            # if self.log.verbose:
            #     print('KEEPING CLUSTER', cluster_to_keep)
    assert len(filt_z_clusters) == max(sym_equiv_groups), \
        'NUMBER OF UNIQUE GROUPS AND GROUPS TO BE RETURNED NOT THE SAME'
    self.log('Filtered {!s} Clusters to {!s} Clusters'.format(
        len(z_clusters), len(filt_z_clusters)))
    return len(filt_z_clusters), filt_z_clusters
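# Note on the cutoff_cart value above: 1.7321 is an inlined sqrt(3), so the
# symmetry-matching cutoff is 1.05 * sqrt(3) * grid_spacing -- just over the
# body diagonal of one grid cell. Clusters are therefore treated as symmetry
# copies when their mapped points land on adjacent grid points.
#   >>> import math
#   >>> math.sqrt(3)
#   1.7320508075688772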
def run(params):
    # Check inputs
    assert params.input.pdb, 'No pdb files provided'
    assert params.selection.res_names  # REMOVE THIS WHEN IMPLEMENTED
    if params.selection.res_names:
        params.selection.__inject__("res_names_list", params.selection.res_names.split(','))
    else:
        params.selection.__inject__("res_names_list", None)

    output_dir, images_dir = prepare_output_directory(params)
    scores_file = os.path.join(output_dir, 'residue_scores.csv')

    all_data = pandas.DataFrame()
    for pdb in params.input.pdb:
        mtz = pdb.replace('.pdb', '.mtz')
        # Derive a label for this structure
        if params.input.labels == 'basename':
            label = os.path.splitext(os.path.basename(pdb))[0]
        elif params.input.labels == 'folder_name':
            label = os.path.basename(os.path.dirname(os.path.abspath(pdb)))
        print bar
        print 'Scoring model {} against {}'.format(pdb, mtz)
        data_table = score_model(
            params=params,
            pdb1=pdb,
            mtz1=mtz,
            pdb2=params.input.ref_pdb,
            label_prefix=label)
        all_data = all_data.append(data_table, verify_integrity=True)
        print '...Done'
    print bar

    all_data.to_csv(scores_file)
    print 'Output written to {}'.format(scores_file)
    print bar

    ###################################################################
    # Image parameters
    ###################################################################
    columns = format_parameters_for_plot(params=params.plot.parameters)

    ###################################################################
    # Output Images - 1 image per residue per structure
    ###################################################################
    all_images = []
    print 'Generating Output Images...'
    for label, row in all_data.iterrows():
        image_path = os.path.join(images_dir, '{}.png'.format(label))
        print 'Making: {}...'.format(image_path)
        make_residue_radar_plot(
            path=image_path,
            data=row.to_frame().T,
            columns=columns,
            remove_blank_entries=params.plot.remove_blank_entries,
            print_axis_values=params.plot.print_axis_values)
        all_images.append(image_path)

    ###################################################################
    # Output Images - 1 image per residue (allowing comparisons)
    ###################################################################
    if params.radar_plot.limits == 'automatic':
        columns.pop('limits', None)
    elif params.radar_plot.limits == 'manual':
        pass
    # Group rows that refer to the same residue across structures
    for res_label, index_idxs in generate_group_idxs(
            [i.split('-')[-2:] for i in all_data.index]):
        res_label = '-'.join(res_label)
        image_path = os.path.join(images_dir, 'compare-{}.png'.format(res_label))
        print 'Making: {}...'.format(image_path)
        make_residue_radar_plot(
            path=image_path,
            data=all_data.iloc[index_idxs],
            columns=columns,
            remove_blank_entries=params.plot.remove_blank_entries,
            print_axis_values=params.plot.print_axis_values)
    print '...Done.'
    print bar
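# Example of the grouping key used above (labels are illustrative): an index
# such as
#   ['model1-A-101', 'model2-A-101', 'model1-A-102']
# yields keys [['A', '101'], ['A', '101'], ['A', '102']], so the copies of
# residue A-101 from both models end up on a single comparison radar plot.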