def filter_z_clusters_2(self, z_clusters, dataset, min_contact_dist=6):
    """Find and remove clusters more than a minimum distance from the protein"""
    # min_contact_dist - blobs are rejected if they are more than this distance from the protein
    # Extract the protein sites in the reference frame
    ref_sites_cart = dataset.model.alignment.nat2ref(
        protein(dataset.model.hierarchy).atoms().extract_xyz())
    # Save time - calculate the square of the contact distance
    min_contact_dist_sq = min_contact_dist ** 2
    # Remove any clusters that are more than min_contact_dist from the protein
    filtered_c_idxs = []
    for c_idx, (c_gps, c_val) in enumerate(z_clusters):
        # Extract points in cluster
        cluster_points_cart = self.grid.grid2cart(c_gps)
        # Calculate minimum distance to protein
        for r_site_cart in ref_sites_cart:
            diff_vecs_cart = cluster_points_cart - r_site_cart
            # Keep cluster if minimum distance is less than min_contact_dist
            if min(diff_vecs_cart.dot()) < min_contact_dist_sq:
                filtered_c_idxs.append(c_idx)
                break
    # Select filtered clusters
    filt_z_clusters = [z_clusters[i] for i in filtered_c_idxs]
    return len(filt_z_clusters), filt_z_clusters
def filter_z_clusters_2(self, z_clusters, dataset, min_contact_dist=6):
    """Find and remove clusters more than a minimum distance from the protein"""
    # min_contact_dist - blobs are rejected if they are more than this distance from the protein
    self.log('----------------------------------->>>')
    self.log('Filtering by minimum distance from protein')
    # Extract the protein sites in the reference frame
    ref_sites_cart = dataset.model.alignment.nat2ref(
        protein(dataset.model.hierarchy).atoms().extract_xyz())
    # Save time - calculate the square of the contact distance
    min_contact_dist_sq = min_contact_dist ** 2
    # Remove any clusters that are more than min_contact_dist from the protein
    filtered_c_idxs = []
    for c_idx, (c_gps, c_val) in enumerate(z_clusters):
        # Extract points in cluster
        cluster_points_cart = self.grid.grid2cart(c_gps)
        # Calculate minimum distance to protein
        for r_site_cart in ref_sites_cart:
            diff_vecs_cart = cluster_points_cart - r_site_cart
            # Keep cluster if minimum distance is less than min_contact_dist
            if min(diff_vecs_cart.dot()) < min_contact_dist_sq:
                filtered_c_idxs.append(c_idx)
                break
        # Report
        # if self.log.verbose:
        #     if filtered_c_idxs and (filtered_c_idxs[-1] == c_idx):
        #         print('KEEPING CLUSTER:', c_idx)
        #     else:
        #         print('REJECTING CLUSTER:', c_idx)
    # Select filtered clusters
    filt_z_clusters = [z_clusters[i] for i in filtered_c_idxs]
    self.log('Filtered {!s} Clusters to {!s} Clusters'.format(
        len(z_clusters), len(filt_z_clusters)))
    return len(filt_z_clusters), filt_z_clusters
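# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the same squared-distance
# contact test as filter_z_clusters_2 above, written against plain numpy arrays
# instead of the cctbx/flex types. Comparing squared distances to the squared cutoff
# avoids taking a square root for every point/atom pair. The function name and
# argument layout are hypothetical.
# ----------------------------------------------------------------------------
import numpy


def _sketch_clusters_near_protein(cluster_points_list, protein_sites_cart, min_contact_dist=6.0):
    """Return indices of clusters with at least one point within min_contact_dist of any protein atom."""
    cutoff_sq = min_contact_dist ** 2
    keep = []
    for c_idx, pts in enumerate(cluster_points_list):        # each pts is an (N, 3) array of cartesian points
        diffs = pts[:, None, :] - protein_sites_cart[None, :, :]
        dist_sq = numpy.einsum('ijk,ijk->ij', diffs, diffs)  # squared distances, shape (N_points, N_atoms)
        if (dist_sq < cutoff_sq).any():
            keep.append(c_idx)
    return keep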
def from_pdb(cls, pdb_input=None, pdb_hierarchy=None):
    """Calculate the b-factor statistics of a model"""
    assert [pdb_input, pdb_hierarchy].count(None) == 1, 'Provide pdb_input OR pdb_hierarchy'
    if pdb_input:
        pdb_hierarchy = pdb_input.construct_hierarchy()
    cache = pdb_hierarchy.atom_selection_cache()
    all_b       = non_h(hierarchy=pdb_hierarchy, cache=cache, copy=True).atoms().extract_b()
    protein_b   = protein(hierarchy=pdb_hierarchy, cache=cache, copy=True).atoms().extract_b()
    backbone_b  = backbone(hierarchy=pdb_hierarchy, cache=cache, copy=True).atoms().extract_b()
    sidechain_b = sidechains(hierarchy=pdb_hierarchy, cache=cache, copy=True).atoms().extract_b()
    return cls(all=basic_statistics(all_b),
               protein=basic_statistics(protein_b),
               backbone=basic_statistics(backbone_b),
               sidechain=basic_statistics(sidechain_b))
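# ----------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original module): from_pdb above is a
# classmethod, so it would normally be called on its container class. The class name
# BFactorStatistics and the attribute access on the result are assumptions; the
# iotbx.pdb call is standard cctbx usage.
# ----------------------------------------------------------------------------
# import iotbx.pdb
# pdb_inp = iotbx.pdb.input(file_name='model.pdb')          # 'model.pdb' is a placeholder path
# b_stats = BFactorStatistics.from_pdb(pdb_input=pdb_inp)   # hypothetical class name
# print(b_stats.protein.mean)  # assumes the constructor stores each basic_statistics object under the given keyword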
def run(self):
    """Process the dataset"""

    dataset, dataset_map, grid, map_analyser, args, verbose = self.data

    # TODO Hardcoded check - to be removed? TODO
    assert dataset_map.is_sparse()

    # ============================================================================>
    # Prepare output objects
    # ============================================================================>
    log_strs = []
    log_file = dataset.file_manager.get_file('dataset_log')
    log = Log(log_file=log_file, verbose=False, silent=True)

    # ============================================================================>
    # Build new blob search object
    # ============================================================================>
    blob_finder = PanddaZMapAnalyser(params=args.params.z_map_analysis,
                                     grid=grid,
                                     log=log)
    print('Writing log for dataset {!s} to ...{}'.format(
        dataset.tag, log_file[log_file.index('processed'):]))

    # ============================================================================>
    # Extract the global mask object from the grid
    # ============================================================================>
    dset_total_temp = grid.global_mask().total_mask_binary().copy()

    # ============================================================================>
    # Generate symmetry masks for this dataset
    # ============================================================================>
    log.bar()
    log('Masking symmetry contacts from Z-map.')
    # Generate symmetry contacts for this dataset and align to reference frame
    dataset_sym_copies = dataset.model.crystal_contacts(
        distance_cutoff=args.params.masks.outer_mask + 5,
        combine_copies=True)
    dataset_sym_copies.atoms().set_xyz(
        dataset.model.alignment.nat2ref(dataset_sym_copies.atoms().extract_xyz()))
    # Only need to write if writing reference frame maps
    if args.output.developer.write_reference_frame_maps:
        dataset_sym_copies.write_pdb_file(
            dataset.file_manager.get_file('symmetry_copies'))
    # Extract protein atoms from the symmetry copies
    dataset_sym_sites_cart = non_water(dataset_sym_copies).atoms().extract_xyz()
    # Generate symmetry contacts grid mask
    dataset_mask = GridMask(parent=grid,
                            sites_cart=dataset_sym_sites_cart,
                            max_dist=args.params.masks.outer_mask,
                            min_dist=args.params.masks.inner_mask_symmetry)
    # Combine with the total mask to generate custom mask for this dataset
    dset_total_temp.put(dataset_mask.inner_mask_indices(), 0)
    dset_total_idxs = numpy.where(dset_total_temp)[0]
    log('After masking with symmetry contacts: {} points for Z-map analysis'.format(
        len(dset_total_idxs)))
    # Write map of grid + symmetry mask
    if args.output.developer.write_reference_frame_grid_masks:
        grid.write_indices_as_map(
            indices=dset_total_idxs,
            f_name=dataset.file_manager.get_file('grid_mask'),
            origin_shift=True)

    # ============================================================================>
    # Generate custom masks for this dataset
    # ============================================================================>
    if args.params.z_map_analysis.masks.selection_string is not None:
        log.bar()
        log('Applying custom mask to the Z-map: "{}"'.format(
            args.params.z_map_analysis.masks.selection_string))
        cache = dataset.model.hierarchy.atom_selection_cache()
        custom_mask_selection = cache.selection(
            args.params.z_map_analysis.masks.selection_string)
        custom_mask_sites = dataset.model.hierarchy.select(
            custom_mask_selection).atoms().extract_xyz()
        log('Masking with {} atoms'.format(len(custom_mask_sites)))
        # Generate custom grid mask
        dataset_mask = GridMask(parent=grid,
                                sites_cart=custom_mask_sites,
                                max_dist=args.params.z_map_analysis.masks.outer_mask,
                                min_dist=args.params.z_map_analysis.masks.inner_mask)
        # Combine with the total mask to generate custom mask for this dataset
        dset_total_temp *= dataset_mask.total_mask_binary()
        dset_total_idxs = numpy.where(dset_total_temp)[0]
        log('After masking with custom mask: {} points for Z-map analysis'.format(
            len(dset_total_idxs)))
        # Write out mask
        grid.write_indices_as_map(
            indices=dset_total_idxs,
            f_name=dataset.file_manager.get_file('z_map_mask'),
            origin_shift=True)

    # ============================================================================>
    #####
    # CALCULATE Z-MAPS AND LOOK FOR LARGE BLOBS
    #####
    # ============================================================================>
    # Check that the map is loaded and that all maps are sparse
    # ============================================================================>
    assert dataset_map.data is not None, 'Something has gone wrong - this dataset has no loaded map'
    assert dataset_map.is_sparse() is map_analyser.statistical_maps.mean_map.is_sparse()
    assert dataset_map.is_sparse() is map_analyser.statistical_maps.medn_map.is_sparse()
    assert dataset_map.is_sparse() is map_analyser.statistical_maps.stds_map.is_sparse()
    assert dataset_map.is_sparse() is map_analyser.statistical_maps.sadj_map.is_sparse()

    # ============================================================================>
    # CALCULATE MEAN-DIFF MAPS
    # ============================================================================>
    mean_diff_map = map_analyser.calculate_z_map(map=dataset_map, method='none')
    # # ============================================================================>
    # # NAIVE Z-MAP - NOT USING UNCERTAINTY ESTIMATION OR ADJUSTED STDS
    # # ============================================================================>
    # z_map_naive = map_analyser.calculate_z_map(map=dataset_map, method='naive')
    # z_map_naive_normalised = z_map_naive.normalised_copy()
    # ============================================================================>
    # UNCERTAINTY Z-MAP - NOT USING ADJUSTED STDS
    # ============================================================================>
    z_map_uncty = map_analyser.calculate_z_map(
        map=dataset_map,
        uncertainty=dataset_map.meta.map_uncertainty,
        method='uncertainty')
    z_map_uncty_normalised = z_map_uncty.normalised_copy()
    # ============================================================================>
    # ADJUSTED+UNCERTAINTY Z-MAP
    # ============================================================================>
    z_map_compl = map_analyser.calculate_z_map(
        map=dataset_map,
        uncertainty=dataset_map.meta.map_uncertainty,
        method='adjusted+uncertainty')
    z_map_compl_normalised = z_map_compl.normalised_copy()

    # ============================================================================>
    # SELECT WHICH MAP TO DO THE BLOB SEARCHING ON
    # ============================================================================>
    # if args.params.statistical_maps.z_map_type == 'naive':
    #     z_map = z_map_naive_normalised
    #     z_map_stats = basic_statistics(flex.double(z_map_naive.data))
    if args.params.statistical_maps.z_map_type == 'uncertainty':
        z_map = z_map_uncty_normalised
        z_map_stats = basic_statistics(flex.double(z_map_uncty.data))
    elif args.params.statistical_maps.z_map_type == 'adjusted+uncertainty':
        z_map = z_map_compl_normalised
        z_map_stats = basic_statistics(flex.double(z_map_compl.data))
    else:
        raise Exception('Invalid Z-map type')

    # ============================================================================>
    # RECORD Z-MAP FOR STATISTICS
    # ============================================================================>
    # Calculate statistics of z-maps
    dataset_map.meta.z_mean = z_map_stats.mean
    dataset_map.meta.z_stdv = z_map_stats.bias_corrected_standard_deviation
    dataset_map.meta.z_skew = z_map_stats.skew
    dataset_map.meta.z_kurt = z_map_stats.kurtosis
    # ============================================================================>
    z_map.meta.type = 'z-map'

    # ============================================================================>
    #####
    # WRITE ALL MAP DISTRIBUTIONS (THESE DON'T USE MUCH SPACE)
    #####
    # ============================================================================>
    # Sampled Map
    analyse_graphs.map_value_distribution(
        f_name=dataset.file_manager.get_file('s_map_png'),
        plot_vals=dataset_map.get_map_data(sparse=True))
    # Mean-Difference
    analyse_graphs.map_value_distribution(
        f_name=dataset.file_manager.get_file('d_mean_map_png'),
        plot_vals=mean_diff_map.get_map_data(sparse=True))
    # # Naive Z-Map
    # analyse_graphs.map_value_distribution(
    #     f_name=dataset.file_manager.get_file('z_map_naive_png'),
    #     plot_vals=z_map_naive.get_map_data(sparse=True),
    #     plot_normal=True)
    # # Normalised Naive Z-Map
    # analyse_graphs.map_value_distribution(
    #     f_name=dataset.file_manager.get_file('z_map_naive_normalised_png'),
    #     plot_vals=z_map_naive_normalised.get_map_data(sparse=True),
    #     plot_normal=True)
    # Uncertainty Z-Map
    analyse_graphs.map_value_distribution(
        f_name=dataset.file_manager.get_file('z_map_uncertainty_png'),
        plot_vals=z_map_uncty.get_map_data(sparse=True),
        plot_normal=True)
    # Normalised Uncertainty Z-Map
    analyse_graphs.map_value_distribution(
        f_name=dataset.file_manager.get_file('z_map_uncertainty_normalised_png'),
        plot_vals=z_map_uncty_normalised.get_map_data(sparse=True),
        plot_normal=True)
    # Corrected Z-Map
    analyse_graphs.map_value_distribution(
        f_name=dataset.file_manager.get_file('z_map_corrected_png'),
        plot_vals=z_map_compl.get_map_data(sparse=True),
        plot_normal=True)
    # Normalised Corrected Z-Map
    analyse_graphs.map_value_distribution(
        f_name=dataset.file_manager.get_file('z_map_corrected_normalised_png'),
        plot_vals=z_map_compl_normalised.get_map_data(sparse=True),
        plot_normal=True)
    # Plot Q-Q Plot of Corrected Z-Map to see how normal it is
    analyse_graphs.qq_plot_against_normal(
        f_name=dataset.file_manager.get_file('z_map_qq_plot_png'),
        plot_vals=z_map_compl_normalised.get_map_data(sparse=True))

    # ============================================================================>
    #####
    # LOOK FOR CLUSTERS OF LARGE Z-SCORES
    #####
    # ============================================================================>
    # Contour the grid at a particular Z-Value
    # ============================================================================>
    num_clusters, z_clusters = blob_finder.cluster_high_z_values(
        z_map_data=z_map.get_map_data(sparse=False),
        point_mask_idx=dset_total_idxs)
    # ============================================================================>
    # Too many points to cluster -- probably a bad dataset
    # ============================================================================>
    if num_clusters == -1:
        # This dataset is too noisy to analyse - flag!
        log_strs.append('Z-Map too noisy to analyse -- not sure what has gone wrong here...')
        return dataset, dataset_map.meta, log_strs

    # ============================================================================>
    #####
    # FILTER/SELECT CLUSTERS OF Z-SCORES
    #####
    # ============================================================================>
    # Filter the clusters by size and peak height
    # ============================================================================>
    if num_clusters > 0:
        num_clusters, z_clusters = blob_finder.filter_z_clusters_1(z_clusters=z_clusters)
        blob_finder.validate_clusters(z_clusters)
        if num_clusters == 0:
            log_strs.append('===> Minimum cluster peak/size not reached.')
    # ============================================================================>
    # Filter the clusters by distance from protein
    # ============================================================================>
    if num_clusters > 0:
        num_clusters, z_clusters = blob_finder.filter_z_clusters_2(
            z_clusters=z_clusters, dataset=dataset)
        blob_finder.validate_clusters(z_clusters)
        if num_clusters == 0:
            log_strs.append('===> Clusters too far from protein.')
    # ============================================================================>
    # Group nearby clusters together
    # ============================================================================>
    if num_clusters > 0:
        num_clusters, z_clusters = blob_finder.group_clusters(z_clusters=z_clusters)
        blob_finder.validate_clusters(z_clusters)
    # ============================================================================>
    # Filter the clusters by symmetry equivalence
    # ============================================================================>
    if num_clusters > 0:
        num_clusters, z_clusters = blob_finder.filter_z_clusters_3(
            z_clusters=z_clusters, dataset=dataset)
        blob_finder.validate_clusters(z_clusters)

    # ============================================================================>
    #####
    # WRITE MAPS
    #####
    # ============================================================================>
    # Write dataset maps in the reference frame
    # ============================================================================>
    if args.output.developer.write_reference_frame_maps:
        dataset_map.to_file(
            filename=dataset.file_manager.get_file('sampled_map'),
            space_group=grid.space_group())
        mean_diff_map.to_file(
            filename=dataset.file_manager.get_file('mean_diff_map'),
            space_group=grid.space_group())
        z_map.to_file(
            filename=dataset.file_manager.get_file('z_map'),
            space_group=grid.space_group())
    # ============================================================================>
    # Write out mask of the high z-values
    # ============================================================================>
    if args.output.developer.write_reference_frame_grid_masks:
        # Write map of where the blobs are (high-Z mask)
        highz_points = []
        [highz_points.extend(list(x[0])) for x in z_clusters]
        highz_points = [map(int, v) for v in highz_points]
        highz_indices = map(grid.indexer(), list(highz_points))
        grid.write_indices_as_map(
            indices=highz_indices,
            f_name=dataset.file_manager.get_file('high_z_mask'),
            origin_shift=True)
    # ============================================================================>
    # Write different Z-Maps? (Probably only needed for testing)
    # ============================================================================>
    if args.output.developer.write_reference_frame_all_z_map_types:
        # z_map_naive.to_file(
        #     filename=dataset.file_manager.get_file('z_map_naive'),
        #     space_group=grid.space_group())
        # z_map_naive_normalised.to_file(
        #     filename=dataset.file_manager.get_file('z_map_naive_normalised'),
        #     space_group=grid.space_group())
        z_map_uncty.to_file(
            filename=dataset.file_manager.get_file('z_map_uncertainty'),
            space_group=grid.space_group())
        z_map_uncty_normalised.to_file(
            filename=dataset.file_manager.get_file('z_map_uncertainty_normalised'),
            space_group=grid.space_group())
        z_map_compl.to_file(
            filename=dataset.file_manager.get_file('z_map_corrected'),
            space_group=grid.space_group())
        z_map_compl_normalised.to_file(
            filename=dataset.file_manager.get_file('z_map_corrected_normalised'),
            space_group=grid.space_group())

    # ============================================================================>
    # Skip to next dataset if no clusters found
    # ============================================================================>
    if num_clusters > 0:
        log_strs.append('===> {!s} Cluster(s) found.'.format(num_clusters))
    else:
        log_strs.append('===> No Clusters found.')
        return (dataset, dataset_map.meta, log_strs)
    assert num_clusters > 0, 'NUMBER OF CLUSTERS AFTER FILTERING == 0!'

    # ============================================================================>
    # Extract the map data in non-sparse format
    # ============================================================================>
    dset_map_data = dataset_map.get_map_data(sparse=False)
    avrg_map_data = map_analyser.average_map().get_map_data(sparse=False)

    # ============================================================================>
    # Process the identified features
    # ============================================================================>
    for event_idx, (event_points, event_values) in enumerate(z_clusters):
        # Number events from 1
        event_num = event_idx + 1
        # Create a unique identifier for this event
        event_key = (dataset.tag, event_num)
        # ============================================================================>
        # Create a point cluster object
        # ============================================================================>
        point_cluster = PointCluster(id=event_key, points=event_points, values=event_values)
        # ============================================================================>
        # Estimate the background correction of the detected feature
        # ============================================================================>
        # Extract sites for this cluster and estimate the background correction for the event
        log_strs.append('----------------------------------->>>')
        log_strs.append('Estimating Event {!s} Background Correction'.format(event_num))
        # Generate custom grid mask for this dataset
        event_mask = GridMask(parent=grid,
                              sites_cart=grid.grid2cart(point_cluster.points, origin_shift=True),
                              max_dist=2.0,
                              min_dist=0.0)
        log_strs.append('=> Event sites ({!s} points) expanded to {!s} points'.format(
            len(point_cluster.points), len(event_mask.outer_mask_indices())))
        # Select masks to define regions for bdc calculation
        exp_event_idxs = flex.size_t(event_mask.outer_mask_indices())
        reference_idxs = flex.size_t(grid.global_mask().inner_mask_indices())
        # ============================================================================>
        # Generate BDC-estimation curve and estimate BDC
        # ============================================================================>
        event_remains, event_corrs, global_corrs = calculate_varying_bdc_correlations(
            ref_map_data=avrg_map_data,
            query_map_data=dset_map_data,
            feature_idxs=exp_event_idxs,
            reference_idxs=reference_idxs,
            min_remain=1.0 - args.params.background_correction.max_bdc,
            max_remain=1.0 - args.params.background_correction.min_bdc,
            bdc_increment=args.params.background_correction.increment,
            verbose=verbose)
        event_remain_est = calculate_maximum_series_discrepancy(
            labels=event_remains,
            series_1=global_corrs,
            series_2=event_corrs)
        analyse_graphs.write_occupancy_graph(
            f_name=dataset.file_manager.get_file('bdc_est_png').format(event_num),
            x_values=event_remains,
            global_values=global_corrs,
            local_values=event_corrs)
        log_strs.append('=> Event Background Correction estimated as {!s}'.format(
            1 - event_remain_est))
        # Reporting (log is normally silenced)
        blob_finder.log('Min-Max: {} {}'.format(
            1.0 - args.params.background_correction.max_bdc,
            1.0 - args.params.background_correction.min_bdc))
        blob_finder.log('Event number: {}'.format(event_num))
        blob_finder.log('Event Remains: {}'.format(','.join(map(str, event_remains))))
        blob_finder.log('Event Corrs: {}'.format(','.join(map(str, event_corrs))))
        blob_finder.log('Global Corrs: {}'.format(','.join(map(str, global_corrs))))
        # Apply multiplier if provided
        blob_finder.log('Applying multiplier to output 1-BDC: {}'.format(
            args.params.background_correction.output_multiplier))
        event_remain_est = min(
            event_remain_est * args.params.background_correction.output_multiplier,
            1.0 - args.params.background_correction.min_bdc)
        # ============================================================================>
        # Calculate the map correlations at the selected BDC
        # ============================================================================>
        event_map_data = calculate_bdc_subtracted_map(
            ref_map_data=avrg_map_data,
            query_map_data=dset_map_data,
            bdc=1.0 - event_remain_est)
        global_corr = numpy.corrcoef(event_map_data.select(reference_idxs),
                                     avrg_map_data.select(reference_idxs))[0, 1]
        local_corr = numpy.corrcoef(event_map_data.select(exp_event_idxs),
                                    avrg_map_data.select(exp_event_idxs))[0, 1]
        # ============================================================================>
        # Write out EVENT map (in the reference frame) and grid masks
        # ============================================================================>
        if args.output.developer.write_reference_frame_maps:
            event_map = dataset_map.new_from_template(event_map_data, sparse=False)
            event_map.to_file(
                filename=dataset.file_manager.get_file('event_map').format(
                    event_num, event_remain_est),
                space_group=grid.space_group())
        if args.output.developer.write_reference_frame_grid_masks:
            grid.write_indices_as_map(
                indices=event_mask.outer_mask_indices(),
                f_name=dataset.file_manager.get_file('grid_mask').replace('.ccp4', '')
                       + '-event-mask-{}.ccp4'.format(event_num))
        # ============================================================================>
        # Find the nearest atom to the event
        # ============================================================================>
        atm = find_nearest_atoms(
            atoms=list(protein(dataset.model.hierarchy).atoms_with_labels()),
            query=dataset.model.alignment.ref2nat(
                grid.grid2cart(sites_grid=[map(int, point_cluster.centroid)],
                               origin_shift=True)))[0]
        log_strs.append('=> Nearest Residue to event: Chain {}, Residue {} {}'.format(
            atm.chain_id, atm.resname, atm.resid()))
        # ============================================================================>
        # Create an event object
        # ============================================================================>
        event_obj = Event(id=point_cluster.id, cluster=point_cluster)
        event_obj.info.estimated_pseudo_occupancy = event_remain_est
        event_obj.info.estimated_bdc = 1.0 - event_remain_est
        event_obj.info.global_correlation = global_corr
        event_obj.info.local_correlation = local_corr
        # ============================================================================>
        # Append to dataset handler
        # ============================================================================>
        dataset.events.append(event_obj)

    # ============================================================================>
    # Write out pymol script to load all of the maps easily
    # ============================================================================>
    pml = PythonScript()
    pml.set_normalise_maps(False)
    # Load Structures
    name = pml.load_pdb(f_name=dataset.file_manager.get_file('aligned_model'))
    pml.repr_as(obj=name, style='sticks')
    name = pml.load_pdb(f_name=dataset.file_manager.get_file('symmetry_copies'))
    pml.repr_hide(obj=name)
    # Load Sampled Map
    name = pml.load_map(f_name=dataset.file_manager.get_file('sampled_map'))
    mesh = pml.make_mesh(obj=name, contour_level=1.0, colour='blue')
    # Load Z-maps
    name = pml.load_map(f_name=dataset.file_manager.get_file('z_map'))
    mesh = pml.make_mesh(obj=name, mesh_suffix='.plus', contour_level=3.0, colour='green')
    mesh = pml.make_mesh(obj=name, mesh_suffix='.mins', contour_level=-3.0, colour='red')
    # Load Event maps
    for f in sorted(glob.glob(dataset.file_manager.get_file('event_map').format('*', '*'))):
        name = pml.load_map(f_name=f)
        mesh = pml.make_mesh(obj=name, contour_level=float(f.split('_')[-2]), colour='hotpink')
    # Load Miscellaneous maps (e.g. masks)
    for f in sorted(glob.glob(os.path.join(dataset.file_manager.get_dir('root'), '*mask*.ccp4'))):
        name = pml.load_map(f_name=f)
        mesh = pml.make_mesh(obj=name, contour_level=0.0, colour='grey')
    pml.write_script(f_name=dataset.file_manager.get_file('pymol_script'), overwrite=True)

    return (dataset, dataset_map.meta, log_strs)
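# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the BDC estimation used in
# run() above, reduced to plain numpy. It assumes the usual PanDDA background
# subtraction, event_map = (dataset_map - bdc * mean_map) / (1 - bdc), and picks the
# BDC at which the event-region and global correlation curves diverge most -- the
# role played here by calculate_varying_bdc_correlations and
# calculate_maximum_series_discrepancy. Function name and default increments are
# assumptions.
# ----------------------------------------------------------------------------
import numpy


def _sketch_estimate_bdc(dset_map_data, mean_map_data, event_idxs, reference_idxs,
                         min_bdc=0.0, max_bdc=0.95, increment=0.01):
    """Scan candidate BDC values and return the one maximising the gap between the
    global and event-region correlations of the subtracted map against the mean map."""
    bdcs = numpy.arange(min_bdc, max_bdc + 1e-9, increment)
    event_corrs, global_corrs = [], []
    for bdc in bdcs:
        event_map = (dset_map_data - bdc * mean_map_data) / (1.0 - bdc)
        event_corrs.append(numpy.corrcoef(event_map[event_idxs], mean_map_data[event_idxs])[0, 1])
        global_corrs.append(numpy.corrcoef(event_map[reference_idxs], mean_map_data[reference_idxs])[0, 1])
    gap = numpy.array(global_corrs) - numpy.array(event_corrs)
    return float(bdcs[int(numpy.argmax(gap))])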
def mask_reference_grid(self, dataset, selection=None):
    """Create masks for the reference grid based on distances from atoms in the reference structure"""

    # ============================================================================>
    # Get main and neighbouring symmetry copies of the masking structure
    # ============================================================================>
    ref_h = dataset.model.hierarchy
    sym_h = dataset.model.crystal_contacts(distance_cutoff=self.outer_mask + 5.0,
                                           combine_copies=True)
    # ============================================================================>
    # Apply mask (protein=default if selection is not given)
    # ============================================================================>
    if selection:
        ref_h = ref_h.select(ref_h.atom_selection_cache().selection(selection), copy_atoms=True)
    else:
        ref_h = protein(ref_h)
    # ============================================================================>
    # Always generate symmetry mask using all non-water atoms - TODO also allow custom definitions? TODO
    # ============================================================================>
    sym_h = non_water(sym_h)
    # ============================================================================>
    # Check that these contain atoms
    # ============================================================================>
    if len(ref_h.atoms()) == 0:
        raise Sorry('Zero atoms have been selected to mask the grid')
    if len(sym_h.atoms()) == 0:
        raise Sorry('Zero atoms have been selected to mask the grid')
    # ============================================================================>
    # Extract coordinates
    # ============================================================================>
    ref_sites_cart = dataset.model.alignment.nat2ref(ref_h.atoms().extract_xyz())
    sym_sites_cart = dataset.model.alignment.nat2ref(sym_h.atoms().extract_xyz())
    # ============================================================================>
    # Global mask used for removing points in the bulk solvent regions
    # ============================================================================>
    if self.grid.global_mask() is None:
        global_mask = AtomicMask(parent=self.grid,
                                 sites_cart=ref_sites_cart,
                                 max_dist=self.outer_mask,
                                 min_dist=self.inner_mask)
        self.grid.set_global_mask(global_mask)
    # ============================================================================>
    # Symmetry mask used for removing points close to symmetry copies of the protein
    # ============================================================================>
    if self.grid.symmetry_mask() is None:
        symmetry_mask = GridMask(parent=self.grid,
                                 sites_cart=sym_sites_cart,
                                 max_dist=self.outer_mask,
                                 min_dist=self.inner_mask_symmetry)
        self.grid.set_symmetry_mask(symmetry_mask)
    # ============================================================================>
    # Write masked maps
    # ============================================================================>
    # # Write protein masked map
    # indices = self.grid.global_mask().total_mask_indices()
    # f_name = self.file_manager.get_file('reference_dataset').replace('.mtz', '.totalmask.ccp4')
    # if self.args.output.developer.write_grid_frame_masks:
    #     self.grid.write_indices_as_map(indices=indices, f_name=splice_ext(f_name, 'grid', position=-1), origin_shift=False)
    # if 1 or self.args.output.developer.write_reference_frame_common_masks_and_maps:
    #     self.grid.write_indices_as_map(indices=indices, f_name=splice_ext(f_name, 'ref', position=-1), origin_shift=True)
    #
    # # Write symmetry masked map
    # indices = self.grid.symmetry_mask().total_mask_indices()
    # f_name = self.file_manager.get_file('reference_dataset').replace('.mtz', '.symmask.ccp4')
    # if self.args.output.developer.write_grid_frame_masks:
    #     self.grid.write_indices_as_map(indices=indices, f_name=splice_ext(f_name, 'grid', position=-1), origin_shift=False)
    # if 1 or self.args.output.developer.write_reference_frame_common_masks_and_maps:
    #     self.grid.write_indices_as_map(indices=indices, f_name=splice_ext(f_name, 'ref', position=-1), origin_shift=True)

    return self.grid
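# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): my reading of what the
# max_dist/min_dist parameters of AtomicMask/GridMask express in mask_reference_grid
# above -- keep grid points within max_dist of at least one atom but not within
# min_dist of any atom. Brute-force numpy version; the real classes operate on the
# reference grid objects and are considerably more efficient.
# ----------------------------------------------------------------------------
import numpy


def _sketch_binary_atomic_mask(grid_points_cart, atom_sites_cart, max_dist, min_dist):
    """Return a boolean mask over grid_points_cart (both arguments are (N, 3) arrays)."""
    diffs = grid_points_cart[:, None, :] - atom_sites_cart[None, :, :]
    min_d = numpy.sqrt(numpy.einsum('ijk,ijk->ij', diffs, diffs)).min(axis=1)
    return (min_d <= max_dist) & (min_d >= min_dist)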
def filter_z_clusters_3(self, z_clusters, dataset, max_contact_dist=8):
    """Find and remove symmetry equivalent clusters"""
    if len(z_clusters) == 1:
        return 1, z_clusters
    else:
        self.log('----------------------------------->>>')
        self.log('Filtering symmetry equivalent clusters')

    # Extract the protein sites (in the native frame)
    d_sites_cart = protein(dataset.model.hierarchy).atoms().extract_xyz()
    d_unit_cell = dataset.model.unit_cell
    d_sym_ops = dataset.model.crystal_contact_operators()

    # Cartesianise and fractionalise the points in each of the clusters (in the crystallographic frame)
    points_cart = [None] * len(z_clusters)
    points_frac = [None] * len(z_clusters)
    for c_idx, (c_gps, c_val) in enumerate(z_clusters):
        # Extract points in cluster
        points_cart[c_idx] = dataset.model.alignment.ref2nat(self.grid.grid2cart(c_gps))
        # Fractionalise them to the unit cell of the dataset
        points_frac[c_idx] = d_unit_cell.fractionalize(points_cart[c_idx])
    # Find the sets of clusters that are symmetry related
    sym_equiv_groups = find_symmetry_equivalent_groups(
        points_frac=points_frac,
        sym_ops=d_sym_ops,
        unit_cell=d_unit_cell,
        cutoff_cart=1.05 * 1.7321 * self.grid_spacing)
    # max_contact_dist - a point contacts an atom if the atom is within this distance of it
    # Save time - calculate the square of the contact distance
    max_contact_dist_sq = max_contact_dist ** 2
    # Iterate through and choose one cluster from each group to keep
    filt_z_clusters = []
    for g_id, g_idxs in generate_group_idxs(sym_equiv_groups):
        # Count the number of contacts for each cluster in the group
        c_contacts = []
        # Iterate through clusters in the group
        for c_idx in g_idxs:
            # Initialise contact counter
            contacts = 0
            # Get the cartesian points for the cluster
            c_points_cart = points_cart[c_idx]
            # Again, use the brute-force all-vs-all method
            for rp in d_sites_cart:
                diffs_cart = c_points_cart - rp
                # Check to see if the site is closer to the cluster than the cutoff
                if min(diffs_cart.dot()) < max_contact_dist_sq:
                    contacts += 1
            # Record the number of contacts (normalised by the size of the cluster)
            c_contacts.append(1.0 * contacts / len(c_points_cart))
            # if self.log.verbose:
            #     print('CLUSTER:', c_idx, ', CONTACTS PER POINT:', round(c_contacts[-1], 3))
        # Find the cluster with the most contacts
        max_contacts = max(c_contacts)
        if max_contacts == 0:
            raise Exception('MAX CONTACTS IS 0!')
        else:
            cluster_to_keep = g_idxs[c_contacts.index(max_contacts)]
            filt_z_clusters.append(z_clusters[cluster_to_keep])
            # if self.log.verbose:
            #     print('KEEPING CLUSTER', cluster_to_keep)
    assert len(filt_z_clusters) == max(sym_equiv_groups), \
        'NUMBER OF UNIQUE GROUPS AND GROUPS TO BE RETURNED NOT THE SAME'
    self.log('Filtered {!s} Clusters to {!s} Clusters'.format(
        len(z_clusters), len(filt_z_clusters)))
    return len(filt_z_clusters), filt_z_clusters
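# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the per-group selection step
# of filter_z_clusters_3 above in plain numpy -- for each group of symmetry-equivalent
# clusters, keep the copy with the most protein contacts per cluster point. Function
# and argument names are hypothetical.
# ----------------------------------------------------------------------------
import numpy


def _sketch_pick_cluster_per_group(groups, cluster_points_cart, protein_sites_cart, max_contact_dist=8.0):
    """groups: list of lists of cluster indices; returns one kept index per group."""
    keep = []
    for g_idxs in groups:
        contacts_per_point = []
        for c_idx in g_idxs:
            pts = cluster_points_cart[c_idx]                            # (N, 3) array of cluster points
            diffs = protein_sites_cart[:, None, :] - pts[None, :, :]
            min_d = numpy.sqrt(numpy.einsum('ijk,ijk->ij', diffs, diffs)).min(axis=1)
            n_contacts = int((min_d < max_contact_dist).sum())          # protein atoms close to the cluster
            contacts_per_point.append(n_contacts / float(len(pts)))
        keep.append(g_idxs[int(numpy.argmax(contacts_per_point))])
    return keep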
def score_model(params, pdb1, mtz1, pdb2=None, mtz2=None, label_prefix='', verbose=False):
    """
    Score residues against density, and generate other model quality indicators.
    Identified residues in pdb1 are scored against mtz1 (and mtz2, if provided) using edstats.
    Identified residues in pdb1 are compared to the equivalent residues in pdb2, if provided.
    B-factor ratios of identified residues to surrounding sidechains are calculated.
    """

    if label_prefix:
        label_prefix = label_prefix + '-'

    # Extract the residues to look for
    res_names = params.selection.res_names_list

    print 'Reading input structure:', pdb1

    # Extract Structure
    h1_all = non_h(strip_pdb_to_input(pdb1, remove_ter=True, remove_end=True).hierarchy)
    # Normalise hierarchy (standardise atomic naming, etc...)
    sanitise_hierarchy(h1_all)
    h1_pro = protein(h1_all)
    h1_bck = backbone(h1_all)
    h1_sch = sidechains(h1_all)

    # Pull out residues to analyse
    if res_names:
        rg_for_analysis = [rg for rg in h1_all.residue_groups()
                           if [n for n in rg.unique_resnames() if n in res_names]]
        print 'Selecting residues named {}: {} residue(s)'.format(' or '.join(res_names), len(rg_for_analysis))
    else:
        rg_for_analysis = h1_all.residue_groups()
        print 'Analysing all residues ({} residues)'.format(len(rg_for_analysis))

    # Check residues to analyse or skip
    if not rg_for_analysis:
        raise Exception('There are no residues called {} in {}'.format(
            ' or '.join(params.selection.res_names_list), pdb1))

    # Extract PDB2
    if pdb2 is not None:
        print 'Reading input structure:', pdb2
        h2_all = non_h(strip_pdb_to_input(pdb2, remove_ter=True, remove_end=True).hierarchy)
        sanitise_hierarchy(h2_all)

    # Score MTZ1
    if mtz1 is not None:
        print 'Scoring model against mtz file'
        print 'Scoring {} >>> {}'.format(pdb1, mtz1)
        mtz1_edstats_scores = Edstats(mtz_file=mtz1, pdb_file=pdb1, f_label=params.input.f_label)
    else:
        mtz1_edstats_scores = None
    # Score MTZ2
    if mtz2 is not None:
        print 'Scoring model against mtz file'
        print 'Scoring {} >>> {}'.format(pdb1, mtz2)
        mtz2_edstats_scores = Edstats(mtz_file=mtz2, pdb_file=pdb1, f_label=params.input.f_label)
    else:
        mtz2_edstats_scores = None

    # Prepare output table
    data_table = prepare_table()

    for rg_sel in rg_for_analysis:

        # Create label for the output table
        # rg_label = (label_prefix+rg_sel.unique_resnames()[0]+'-'+rg_sel.parent().id+'-'+rg_sel.resseq+rg_sel.icode).replace(' ','')
        # rg_label = (label_prefix+rg_sel.parent().id+'-'+rg_sel.resseq+rg_sel.icode).replace(' ','')
        rg_label = ShortLabeller.format(rg_sel).replace(' ', '')
        tab_label = label_prefix + rg_label

        if len(rg_sel.unique_resnames()) != 1:
            raise Exception(tab_label + ': More than one residue name associated with residue group -- cannot process')

        # Append empty row to output table
        data_table.loc[tab_label] = None
        data_table.set_value(index=tab_label, col='PDB', value=pdb1)
        data_table.set_value(index=tab_label, col='Occupancy',
                             value=calculate_residue_group_occupancy(residue_group=rg_sel))

        data_table = calculate_residue_group_bfactor_ratio(residue_group=rg_sel,
                                                           hierarchy=h1_sch,
                                                           data_table=data_table,
                                                           rg_label=tab_label)

        if pdb2 is not None:
            data_table.set_value(index=tab_label, col='PDB-2', value=pdb2)
            # Extract the equivalent residue in pdb2
            rg_sel_2 = [rg for rg in h2_all.residue_groups()
                        if ShortLabeller.format(rg).replace(' ', '') == rg_label]
            try:
                assert rg_sel_2, 'Residue is not present in pdb file: {} not in {}'.format(rg_label, pdb2)
                assert len(rg_sel_2) == 1, 'More than one residue has been selected for {} in {}'.format(rg_label, pdb2)
            except:
                raise
            # Extract occupancy
            data_table.set_value(index=tab_label, col='Occupancy-2',
                                 value=calculate_residue_group_occupancy(residue_group=rg_sel_2[0]))
            # Calculate the RMSD between the models
            try:
                confs1, confs2, rmsds = zip(*calculate_paired_conformer_rmsds(
                    conformers_1=rg_sel.conformers(),
                    conformers_2=rg_sel_2[0].conformers()))
                data_table.set_value(index=tab_label, col='Model RMSD', value=min(rmsds))
            except:
                raise
                print 'Could not calculate RMSD between pdb_1 and pdb_2 for residue {}'.format(rg_label)
                pass

        # Extract Density Scores - MTZ 1
        if mtz1 is not None:
            data_table.set_value(index=tab_label, col='MTZ', value=mtz1)
        if mtz1_edstats_scores is not None:
            data_table = mtz1_edstats_scores.extract_residue_group_scores(residue_group=rg_sel,
                                                                          data_table=data_table,
                                                                          rg_label=tab_label)
            # Normalise the RSZO by the Occupancy of the ligand
            data_table['RSZO/OCC'] = data_table['RSZO'] / data_table['Occupancy']

        # Extract Density Scores - MTZ 2
        if mtz2 is not None:
            data_table.set_value(index=tab_label, col='MTZ-2', value=mtz2)
        if mtz2_edstats_scores is not None:
            data_table = mtz2_edstats_scores.extract_residue_group_scores(residue_group=rg_sel,
                                                                          data_table=data_table,
                                                                          rg_label=tab_label,
                                                                          column_suffix='-2')
            # Normalise the RSZO by the Occupancy of the ligand
            data_table['RSZO/OCC-2'] = data_table['RSZO-2'] / data_table['Occupancy-2']

    return data_table
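# ----------------------------------------------------------------------------
# Hypothetical call sketch (not part of the original module): score_model expects a
# phil-style params object; a SimpleNamespace stands in for it here. The attribute
# names mirror those referenced above; the file names and the f_label value are
# purely illustrative.
# ----------------------------------------------------------------------------
# from types import SimpleNamespace
# params = SimpleNamespace(
#     selection=SimpleNamespace(res_names_list=['LIG', 'UNL']),  # residue names to score
#     input=SimpleNamespace(f_label='2FOFCWT'),                  # amplitude label passed to edstats
# )
# table = score_model(params, pdb1='model.pdb', mtz1='model.mtz', label_prefix='dataset-1')
# print(table[['Occupancy', 'RSZO/OCC']])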
def __call__(self, dataset, dataset_map, ref_map, events, grid):

    # ============================================================================>
    # Extract the map data in non-sparse format
    # ============================================================================>
    dset_map_data = dataset_map.get_map_data(sparse=False)
    ref_map_data = ref_map.get_map_data(sparse=False)
    # ============================================================================>
    # Unpack clusters
    # ============================================================================>
    event_stats = OrderedDict()
    for event in events[2]:
        # ============================================================================>
        # Estimate the background correction of the detected feature
        # ============================================================================>
        # Extract sites for this cluster and estimate the background correction for the event
        # Generate custom grid mask for this dataset
        event_mask = GridMask(parent=grid,
                              sites_cart=grid.grid2cart(event.cluster.points, origin_shift=True),
                              max_dist=2.0,
                              min_dist=0.0)
        # Select masks to define regions for bdc calculation
        exp_event_idxs = flex.size_t(event_mask.outer_mask_indices())
        reference_idxs = flex.size_t(grid.global_mask().inner_mask_indices())
        # ============================================================================>
        # Generate BDC-estimation curve and estimate BDC
        # ============================================================================>
        event_remains, event_corrs, global_corrs = calculate_varying_bdc_correlations(
            ref_map_data=ref_map_data,
            query_map_data=dset_map_data,
            feature_idxs=exp_event_idxs,
            reference_idxs=reference_idxs,
            min_remain=1.0 - self.max_bdc,
            max_remain=1.0 - self.min_bdc,
            bdc_increment=self.increment,
            verbose=True)
        event_remain_est = calculate_maximum_series_discrepancy(
            labels=event_remains,
            series_1=global_corrs,
            series_2=event_corrs)
        event_remain_est = min(event_remain_est * self.output_multiplier,
                               1.0 - self.min_bdc)
        # ============================================================================>
        # Calculate the map correlations at the selected BDC
        # ============================================================================>
        event_map_data = calculate_bdc_subtracted_map(
            ref_map_data=ref_map_data,
            query_map_data=dset_map_data,
            bdc=1.0 - event_remain_est)
        global_corr = numpy.corrcoef(event_map_data.select(reference_idxs),
                                     ref_map_data.select(reference_idxs))[0, 1]
        local_corr = numpy.corrcoef(event_map_data.select(exp_event_idxs),
                                    ref_map_data.select(exp_event_idxs))[0, 1]
        # ============================================================================>
        # Update event parameters
        # ============================================================================>
        event.info.estimated_pseudo_occupancy = event_remain_est
        event.info.estimated_bdc = 1.0 - event_remain_est
        event.info.global_correlation = global_corr
        event.info.local_correlation = local_corr
        # ============================================================================>
        # Find the nearest atom to the event
        # ============================================================================>
        # TODO: restore this?
        atm = find_nearest_atoms(
            atoms=list(protein(dataset.model.hierarchy).atoms_with_labels()),
            query=dataset.model.alignment.ref2nat(
                grid.grid2cart(sites_grid=[map(int, event.cluster.centroid)],
                               origin_shift=True)))[0]

        event_stats[event.id] = OrderedDict()
        event_stats[event.id]["estimated_pseudo_occupancy"] = event_remain_est
        event_stats[event.id]["estimated_bdc"] = 1.0 - event_remain_est
        event_stats[event.id]["global_corr"] = global_corr
        event_stats[event.id]["local_corr"] = local_corr

    return event_stats
def align_structures_rigid(mov_hier, ref_hier):
    """Extract c-alpha sites from the structures and align"""
    lsq_rt, alignment_sites, reference_sites = align_chains_rigid(
        mov_chain=protein(mov_hier, copy=True).models()[0].only_chain(),
        ref_chain=protein(ref_hier, copy=True).models()[0].only_chain())
    return GlobalAlignment(alignment_mx=lsq_rt,
                           alignment_sites=alignment_sites,
                           reference_sites=reference_sites,
                           id=None)
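# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): align_chains_rigid is not
# shown here; this is a minimal numpy implementation of the least-squares (Kabsch)
# superposition it presumably performs on the extracted c-alpha sites. It returns
# the rotation matrix and translation that map the moving sites onto the reference.
# ----------------------------------------------------------------------------
import numpy


def _sketch_kabsch_fit(mov_sites, ref_sites):
    """Rigid-body least-squares fit of mov_sites onto ref_sites (both (N, 3) arrays)."""
    mov_c = mov_sites.mean(axis=0)
    ref_c = ref_sites.mean(axis=0)
    h = numpy.dot((mov_sites - mov_c).T, (ref_sites - ref_c))
    u, s, vt = numpy.linalg.svd(h)
    d = numpy.sign(numpy.linalg.det(numpy.dot(vt.T, u.T)))  # guard against improper rotations
    rot = numpy.dot(vt.T, numpy.dot(numpy.diag([1.0, 1.0, d]), u.T))
    trans = ref_c - numpy.dot(rot, mov_c)
    return rot, trans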