def get_program_name(self):
    """Return the name of the program classified as 'refinement'.

    Reads ``_software.name`` and ``_software.classification`` from
    ``self.cif_block``.  The classification is either a single string (one
    software entry) or an array with one classification per entry.

    Returns:
        ``_software.name`` itself when the single classification is
        ``'refinement'``; the name at the position of the first
        ``'refinement'`` entry when the classification is an array;
        ``None`` when there is no classification or no refinement entry.
    """
    software_name = self.cif_block.get('_software.name')
    software_classification = self.cif_block.get('_software.classification')
    if software_classification is None:
        return None
    if isinstance(software_classification, string_types):
        if software_classification == 'refinement':
            return software_name
        return None
    i = flex.first_index(software_classification, 'refinement')
    # first_index signals "no match" with None; comparing None with an int
    # raises TypeError on Python 3, so test for it explicitly.
    if i is not None and i >= 0:
        return software_name[i]
    return None
def get_program_name(self):
    """Return the name of the program classified as 'refinement'.

    Reads ``_software.name`` and ``_software.classification`` from
    ``self.cif_block``.  The classification is either a single string (one
    software entry) or an array with one classification per entry.

    Returns:
        ``_software.name`` itself when the single classification is
        ``'refinement'``; the name at the position of the first
        ``'refinement'`` entry when the classification is an array;
        ``None`` otherwise.
    """
    software_name = self.cif_block.get('_software.name')
    software_classification = self.cif_block.get('_software.classification')
    if software_classification is None:
        return None
    if isinstance(software_classification, basestring):
        # A lone classification is decided right here; it must not be handed
        # to flex.first_index, which is meant for arrays of classifications.
        if software_classification == 'refinement':
            return software_name
        return None
    i = flex.first_index(software_classification, 'refinement')
    # first_index signals "no match" with None.
    if i is not None and i >= 0:
        return software_name[i]
    return None
def deposition_date(self):
    """Return the original deposition date from the first CIF block.

    Looks at ``_database_PDB_rev.num`` and
    ``_database_PDB_rev.date_original`` of the first block in
    ``self.cif_model``.  Date format: yyyy-mm-dd.

    Returns:
        ``date_original`` as-is when there is a single revision entry, the
        element belonging to revision ``'1'`` when the revisions are looped,
        or ``None`` when the revision data, the date, or revision ``'1'`` is
        missing.
    """
    # list(...) keeps this working whether values() returns a list or a view.
    cif_block = list(self.cif_model.values())[0]
    rev_num = cif_block.get('_database_PDB_rev.num')
    if rev_num is None:
        return None
    date_original = cif_block.get('_database_PDB_rev.date_original')
    if isinstance(rev_num, basestring):
        return date_original
    i = flex.first_index(rev_num, '1')
    # first_index signals "no match" with None; indexing with None would fail.
    if date_original is None or i is None:
        return None
    return date_original[i]
def had_phase_transition(self):
    """Decide whether ``self.differences`` shows a peak-shaped transition.

    The test looks for the largest difference, the last sample before it that
    lies below ``noise_level_before`` times the peak, and the first negative
    sample from the peak onwards.  It returns True only if the rise spans at
    least 4 samples, the whole feature at least 10 samples, at least 10
    values follow it, and the last five differences stay within
    ``noise_level_after`` times the peak.
    """
    diffs = self.differences
    if len(diffs) < 5:
        return False

    peak = flex.max_index(diffs)
    below_noise = diffs < self.noise_level_before * diffs[peak]
    last_quiet = flex.last_index(below_noise[:peak], True)
    # No quiet sample before the peak: the feature starts at index 0.
    start = 0 if last_quiet is None else last_quiet + 1
    if peak - start < 4:
        return False

    is_negative = diffs < 0
    first_negative = flex.first_index(is_negative[peak:], True)
    if first_negative is None:
        return False
    end = first_negative + peak
    if end - start < 10 or len(self.values) - end < 10:
        return False

    tail = scitbx.math.basic_statistics(diffs[-5:])
    return not (tail.max_absolute > self.noise_level_after * diffs[peak])
def had_phase_transition(self):
    """Return True if ``self.differences`` contains a transition-like peak.

    Criteria, checked in order (any failure yields False):
      * at least 5 differences are available;
      * at least 4 samples lie between the end of the pre-peak noise
        (differences below ``noise_level_before`` * peak) and the peak;
      * a negative difference occurs at or after the peak;
      * start-to-end of the feature covers at least 10 samples;
      * at least 10 entries of ``self.values`` follow the feature;
      * the last 5 differences do not exceed ``noise_level_after`` * peak
        in absolute value.
    """
    differences = self.differences
    n_enough = len(differences) >= 5
    if not n_enough:
        return False
    i_peak = flex.max_index(differences)
    peak_height = differences[i_peak]

    quiet_flags = (differences < self.noise_level_before * peak_height)
    i_before = flex.last_index(quiet_flags[:i_peak], True)
    if i_before is None:
        i_before = -1
    i_before = i_before + 1
    if not (i_peak - i_before >= 4):
        return False

    sign_flags = differences < 0
    i_after = flex.first_index(sign_flags[i_peak:], True)
    if i_after is None:
        return False
    i_after = i_after + i_peak
    if not (i_after - i_before >= 10):
        return False
    if not (len(self.values) - i_after >= 10):
        return False

    stats = scitbx.math.basic_statistics(differences[-5:])
    if stats.max_absolute > self.noise_level_after * peak_height:
        return False
    return True
def run(args):
    """Tutorial walk-through of reading PDBx/mmCIF files with iotbx.

    Args:
        args: list of mmCIF file paths and/or PDB ids.  A path that exists is
            read directly (its base name must be a valid PDB id); anything
            else is treated as a PDB id and fetched from the "pdbe" mirror.
            An empty list defaults to ``["1hbb"]``.

    For every entry this prints selected CIF items, the crystal symmetry, a
    summary of the model hierarchy, the first atoms and the polymer
    sequences.  Nothing is returned.
    """
    if len(args) == 0:
        args = ["1hbb"]

    for arg in args:
        import iotbx.pdb.fetch
        if os.path.isfile(arg):
            mmcif_file = arg
            pdb_id = os.path.splitext(os.path.basename(mmcif_file))[0]
            iotbx.pdb.fetch.validate_pdb_id(pdb_id)
        else:
            # download pdbx/mmcif file from the PDB
            pdb_id = arg
            mirror = "pdbe"
            mmcif_file = iotbx.pdb.fetch.get_pdb(
                pdb_id, data_type="pdb", mirror=mirror, log=sys.stdout,
                format="cif")

        # read the cif file and get an iotbx.cif object
        import iotbx.cif
        cif_reader = iotbx.cif.reader(file_path=mmcif_file)
        cif_object = cif_reader.model()
        cif_block = cif_object[pdb_id]
        # get single items from cif_block
        print("PDB id:", cif_block["_entry.id"])
        # get a looped item from cif_block
        print("Authors:")
        for author in cif_block.get_looped_item("_citation_author.name"):
            print(author)
        print()
        print("Molecular Entities:")
        for pdbx_entity in cif_block.get_looped_item(
                "_entity.pdbx_description"):
            print(pdbx_entity)
        print()

        # extract crystal symmetry information
        import iotbx.cif.builders
        builder = iotbx.cif.builders.crystal_symmetry_builder(cif_block)
        builder.crystal_symmetry.show_summary()

        # 1) this works also for .pdb files, but re-reads the file
        import iotbx.pdb
        pdb_input = iotbx.pdb.input(file_name=mmcif_file)
        hierarchy = pdb_input.construct_hierarchy()

        # 2) This only works for mmcif files, but re-uses the cif_object from
        #    above:
        import iotbx.pdb.mmcif
        pdb_input = iotbx.pdb.mmcif.cif_input(cif_object=cif_object)
        hierarchy = pdb_input.construct_hierarchy()

        # some convenience methods of pdb_input object
        print("Software:", pdb_input.get_program_name())
        print("Experiment type:", pdb_input.get_experiment_type())
        print("Solvent content:", pdb_input.get_solvent_content())
        print("Deposition date:", pdb_input.deposition_date())
        r_rfree_sigma = pdb_input.get_r_rfree_sigma(mmcif_file)
        print("R-work/R-free: %s/%s" % (r_rfree_sigma.r_work,
                                        r_rfree_sigma.r_free))
        # can also get crystal_symmetry from pdb_input object
        crystal_symmetry = pdb_input.crystal_symmetry()

        print()
        hierarchy.overall_counts().show()
        # level_id can be "model", "chain", "residue_group", "atom_group" or
        # "atom"
        hierarchy.show(level_id="chain")
        # for a more detailed example of interacting with a pdb.hierarchy
        # object, see iotbx/examples/pdb_hierarchy.py

        # extract atom sites
        atoms = hierarchy.atoms()
        sites_cart = atoms.extract_xyz()
        print()
        # Show at most the first ten atoms; small models may have fewer.
        for i in range(min(10, len(atoms))):
            print(atoms[i].id_str(), atoms[i].xyz)
        print()

        # read some sequence information
        entity_poly_entity_id = cif_block.get_looped_item(
            "_entity_poly.entity_id")
        entity_id = cif_block.get_looped_item("_entity.id")
        entity_pdbx_description = cif_block.get_looped_item(
            "_entity.pdbx_description")
        entity_poly_one_letter_code = cif_block.get_looped_item(
            "_entity_poly.pdbx_seq_one_letter_code")

        from cctbx.array_family import flex
        for i in range(len(entity_poly_one_letter_code)):
            poly_entity_id = entity_poly_entity_id[i]
            idx = flex.first_index(entity_id, poly_entity_id)
            if idx is None:
                # The polymer refers to an entity that is not listed in the
                # _entity loop: show its id with an unknown description.
                label, description = poly_entity_id, "?"
            else:
                # The description lives in the _entity loop, so it has to be
                # looked up with the _entity row (idx), not the _entity_poly
                # row (i).
                label, description = entity_id[idx], entity_pdbx_description[idx]
            print(label, description, end=' ')
            print("".join(entity_poly_one_letter_code[i].split()))
def validate_loop(self, loop, block):
    """Validate every data item of a CIF loop against the dictionary.

    Args:
        loop: mapping of data names to their looped values.
        block: the CIF block containing ``loop``; used to resolve items that
            are referenced from the loop but stored elsewhere in the block.

    For each key that has a dictionary definition this runs the enumeration,
    dependency and relation checks and then reports, via
    ``self.report_error``:
        2501  item is not allowed in a list (DDL1),
        2502  the loop mixes several categories,
        2505  a ``_list_reference`` item is missing from the loop,
        2203  a mandatory category key is missing from the block (DDL2),
        2503  a value has no counterpart among the parent item's values,
        2504  the parent item itself is missing.
    Keys without a definition are skipped.
    """
    list_category = None
    for key, value in six.iteritems(loop):
        try:
            definition = self.get_definition(key)
        except KeyError:
            continue
        self.validate_enumeration(key, value, definition)
        self.validate_dependent(key, block, definition)
        self.validate_related(key, block, definition)

        _list = definition.get("_list")
        if self.DDL_version == 1 and _list in ('no', None):
            self.report_error(2501, key=key)  # not allowed in list

        definition_category = definition.category
        if (definition_category is not None
                and not isinstance(definition_category, string_types)):
            # The definition covers several names: pick the category that
            # sits at the same position as this key's name.
            definition_name = definition.name
            i = flex.first_index(definition_name, key)
            definition_category = definition_category[i]
        if list_category is None:
            list_category = definition_category
        elif (isinstance(list_category, string_types)
              and definition_category is not None
              and list_category != definition_category):
            self.report_error(2502, key=key)  # multiple categories in loop

        references = definition.get('_list_reference')
        if references is not None:
            if isinstance(references, string_types):
                references = [references]
            for reference in references:
                try:
                    ref_data = self.get_definition(reference)
                except KeyError:
                    # Fall back to the definition of the item itself.
                    ref_data = self.get_definition(key)
                ref_names = ref_data['_name']
                if isinstance(ref_names, string_types):
                    ref_names = [ref_names]
                for name in ref_names:
                    if name not in loop:
                        # missing _list_reference
                        self.report_error(2505, key=key, reference=name)
        elif (self.DDL_version == 2
              and isinstance(definition.category, string_types)):
            category_def = self.get_definition(definition.category)
            if category_def.category_key is not None:
                category_keys = category_def.category_key
                if isinstance(category_keys, string_types):
                    category_keys = [category_keys]
                for cat_key in category_keys:
                    cat_key_def = self.get_definition(cat_key)
                    if (cat_key_def.mandatory == 'yes'
                            and isinstance(cat_key_def.mandatory, string_types)
                            and cat_key_def.name not in block):
                        self.report_error(
                            2203, key=cat_key_def.name,
                            category=definition.category)

        link_parent = definition.get(
            '_list_link_parent', self.child_parent_relations.get(key))
        if link_parent is not None:
            # The parent item may live in this loop or elsewhere in the block.
            parent_values = loop.get(link_parent, block.get(link_parent))
            if parent_values is not None:
                for v in loop[key]:
                    if v != '.' and v not in parent_values:
                        # missing parent value
                        self.report_error(
                            2503, value=v, child=key, parent=link_parent)
            else:
                # missing parent
                self.report_error(2504, child=key, parent=link_parent)
def __init__(self, **kwargs):
    """Run density-peak clustering on a distance matrix and store the labels.

    All keyword arguments are handed to ``group_args.__init__``.  The body
    reads ``self.Dij``, ``self.d_c`` and ``self.max_percentile_rho``;
    ``self.strategy`` is optional and is set to ``'default'`` when absent.

    Cluster centres are chosen by one of three code paths selected through
    ``self.strategy``:
      * any value other than the two below -- z-score test on delta,
      * ``'one_cluster'`` -- the single point with the largest rho*delta,
      * ``'strategy_3'`` -- the top rho*delta point plus up to two more
        whose rho*delta z-score exceeds 3.0.

    Attributes set: ``rho``, ``delta``, ``cluster_id_maxima`` (centres only,
    -1 elsewhere), ``cluster_id_full`` and ``cluster_id_final`` (labels after
    ``R.cluster_assignment``; both are copies of the same array here).
    """
    group_args.__init__(self, **kwargs)
    print('finished Dij, now calculating rho_i and density')
    from xfel.clustering import Rodriguez_Laio_clustering_2014 as RL
    R = RL(distance_matrix=self.Dij, d_c=self.d_c)
    #from clustering.plot_with_dimensional_embedding import plot_with_dimensional_embedding
    #plot_with_dimensional_embedding(1-self.Dij/flex.max(self.Dij), show_plot=True)
    if hasattr(self, 'strategy') is False:
        self.strategy = 'default'
    self.rho = rho = R.get_rho()
    ave_rho = flex.mean(rho.as_double())
    NN = self.Dij.focus()[0]
    i_max = flex.max_index(rho)
    # Largest distance from the highest-rho point to any point; passed to
    # get_delta as delta_i_max.
    delta_i_max = flex.max(
        flex.double([self.Dij[i_max, j] for j in range(NN)]))
    rho_order = flex.sort_permutation(rho, reverse=True)
    rho_order_list = list(rho_order)
    self.delta = delta = R.get_delta(rho_order=rho_order,
                                     delta_i_max=delta_i_max)
    cluster_id = flex.int(NN, -1)  # -1 means no cluster
    delta_order = flex.sort_permutation(delta, reverse=True)
    MAX_PERCENTILE_RHO = self.max_percentile_rho  # cluster centers have to be in the top percentile
    n_cluster = 0
    #
    # print('Z_DELTA = ', self.Z_delta)
    pick_top_solution = False
    rho_stdev = flex.mean_and_variance(
        rho.as_double()).unweighted_sample_standard_deviation()
    delta_stdev = flex.mean_and_variance(
        delta).unweighted_sample_standard_deviation()
    if rho_stdev != 0.0 and delta_stdev != 0:
        rho_z = (rho.as_double() - flex.mean(rho.as_double())) / (rho_stdev)
        delta_z = (delta - flex.mean(delta)) / (delta_stdev)
    else:
        # z-scores are undefined when a spread is zero: fall back to a single
        # centroid taken from whichever quantity still varies.
        pick_top_solution = True
        if rho_stdev == 0.0:
            centroids = [flex.first_index(delta, flex.max(delta))]
        elif delta_stdev == 0.0:
            centroids = [flex.first_index(rho, flex.max(rho))]

    significant_delta = []
    significant_rho = []

    # Define strategy to decide cluster center here. Only one should be true
    # NOTE(review): strategy2 / strategy3 are only bound inside the two `if`
    # blocks below; the if/elif chain further down evaluates them only on
    # paths where they have been assigned.
    debug_fix_clustering = True
    if self.strategy == 'one_cluster':
        debug_fix_clustering = False
        strategy2 = True
    if self.strategy == 'strategy_3':
        debug_fix_clustering = False
        strategy3 = True
        strategy2 = False

    if debug_fix_clustering:
        if not pick_top_solution:
            delta_z_cutoff = min(1.0, max(delta_z))
            rho_z_cutoff = min(1.0, max(rho_z))
            for ic in range(NN):
                # test the density & rho
                if delta_z[ic] >= delta_z_cutoff or delta_z[
                        ic] <= -delta_z_cutoff:
                    significant_delta.append(ic)
                if rho_z[ic] >= rho_z_cutoff or rho_z[ic] <= -rho_z_cutoff:
                    significant_rho.append(ic)
            if True:
                # Use idea quoted in Rodriguez Laio 2014 paper
                # " Thus, cluster centers are recognized as points for which the value of delta is anomalously large."
                centroid_candidates = list(significant_delta)
                candidate_delta_z = flex.double()
                for ic in centroid_candidates:
                    # NOTE(review): delta_z_of_rho_order_0 is only bound if
                    # rho_order[0] is among the candidates -- confirm that
                    # this always holds.
                    if ic == rho_order[0]:
                        delta_z_of_rho_order_0 = delta_z[ic]
                    candidate_delta_z.append(delta_z[ic])
                i_sorted = flex.sort_permutation(candidate_delta_z,
                                                 reverse=True)
                # Check that once sorted the top one is not equal to the 2nd or 3rd position
                # If there is a tie, assign centroid to the first one in rho order
                centroids = []
                # rho_order[0] has to be a centroid
                centroids.append(rho_order[0])
                #centroids.append(centroid_candidates[i_sorted[0]])
                for i in range(0, len(i_sorted[:])):
                    if centroid_candidates[i_sorted[i]] == rho_order[0]:
                        continue
                    if delta_z_of_rho_order_0 - candidate_delta_z[
                            i_sorted[i]] > 1.0:
                        if i > 1:
                            if -candidate_delta_z[i_sorted[
                                    i - 1]] + candidate_delta_z[
                                        i_sorted[0]] > 1.0:
                                centroids.append(
                                    centroid_candidates[i_sorted[i]])
                        else:
                            centroids.append(
                                centroid_candidates[i_sorted[i]])
                    else:
                        break
            if False:
                # Disabled alternative: candidates must be significant in
                # both delta and rho.
                centroid_candidates = list(
                    set(significant_delta).intersection(
                        set(significant_rho)))
                # Now compare the relative orders of the max delta_z and max rho_z to make sure they are within 1 stdev
                centroids = []
                max_delta_z_candidates = -999.9
                max_rho_z_candidates = -999.9
                for ic in centroid_candidates:
                    if delta_z[ic] > max_delta_z_candidates:
                        max_delta_z_candidates = delta_z[ic]
                    if rho_z[ic] > max_rho_z_candidates:
                        max_rho_z_candidates = rho_z[ic]
                for ic in centroid_candidates:
                    if max_delta_z_candidates - delta_z[
                            ic] < 1.0 and max_rho_z_candidates - rho_z[
                                ic] < 1.0:
                        centroids.append(ic)

        #item_idxs = [delta_order[ic] for ic,centroid in enumerate(centroids)]
        item_idxs = centroids
        for item_idx in item_idxs:
            cluster_id[item_idx] = n_cluster
            print('CLUSTERING_STATS', item_idx, cluster_id[item_idx])
            n_cluster += 1
            ####
    elif strategy2:
        # Go through list of clusters, see which one has highest joint rank in both rho and delta lists
        # This will only assign one cluster center based on highest product of rho and delta ranks
        product_list_of_ranks = []
        for ic in range(NN):
            rho_tmp = self.rho[ic]
            delta_tmp = self.delta[ic]
            product_list_of_ranks.append(rho_tmp * delta_tmp)
        import numpy as np
        item_idx = np.argmax(product_list_of_ranks)
        cluster_id[item_idx] = n_cluster  # Only cluster assigned
        print('CLUSTERING_STATS', item_idx, cluster_id[item_idx])
        n_cluster += 1
    elif strategy3:
        # use product of delta and rho and pick out top candidates
        # have to use a significance z_score to filter out the very best
        product_list_of_ranks = flex.double()
        for ic in range(NN):
            rho_tmp = self.rho[ic]
            delta_tmp = self.delta[ic]
            product_list_of_ranks.append(rho_tmp * delta_tmp)
        import numpy as np
        iid_sorted = flex.sort_permutation(product_list_of_ranks,
                                           reverse=True)
        cluster_id[
            iid_sorted[0]] = n_cluster  # first point always a cluster
        n_cluster += 1
        print('CLUSTERING_STATS S3', iid_sorted[0],
              cluster_id[iid_sorted[0]])
        #product_list_of_ranks[iid_sorted[0]]=0.0 # set this to 0.0 so that the mean/stdev does not get biased by one point
        stdev = np.std(product_list_of_ranks)
        mean = np.mean(product_list_of_ranks)
        n_sorted = 3
        #if stdev == 0.0:
        #  n_sorted=1
        z_critical = 3.0  # 2 sigma significance ?
        # Only go through say 3-4 datapoints
        # basically there won't be more than 2-3 lattices on an image realistically
        # NOTE(review): stdev is not checked for zero before the division.
        for iid in iid_sorted[1:n_sorted]:
            z_score = (product_list_of_ranks[iid] - mean) / stdev
            if z_score > z_critical:
                cluster_id[iid] = n_cluster
                n_cluster += 1
                print('CLUSTERING_STATS S3', iid, cluster_id[iid])
            else:
                break  # No point going over all points once below threshold z_score
    else:
        # Walk the points by decreasing delta; the largest-delta point is
        # always considered, the others only if their delta exceeds a quarter
        # of the largest.
        for ic in range(NN):
            item_idx = delta_order[ic]
            if ic != 0:
                if delta[item_idx] <= 0.25 * delta[
                        delta_order[0]]:  # too low to be a medoid
                    continue
            item_rho_order = rho_order_list.index(item_idx)
            if (item_rho_order) / NN < MAX_PERCENTILE_RHO:
                cluster_id[item_idx] = n_cluster
                print('CLUSTERING_STATS', ic, item_idx, item_rho_order,
                      cluster_id[item_idx])
                n_cluster += 1
    ###
    #
    print('Found %d clusters' % n_cluster)
    for x in range(NN):
        if cluster_id[x] >= 0:
            print("XC", x, cluster_id[x], rho[x], delta[x])
    # Snapshot with only the cluster centres labelled.
    self.cluster_id_maxima = cluster_id.deep_copy()
    R.cluster_assignment(rho_order, cluster_id, rho)
    self.cluster_id_full = cluster_id.deep_copy()

    #halo = flex.bool(NN,False)
    #border = R.get_border( cluster_id = cluster_id )

    #for ic in range(n_cluster): #loop thru all border regions; find highest density
    #  this_border = (cluster_id == ic) & (border==True)
    #  if this_border.count(True)>0:
    #    highest_density = flex.max(rho.select(this_border))
    #    halo_selection = (rho < highest_density) & (this_border==True)
    #    if halo_selection.count(True)>0:
    #      cluster_id.set_selected(halo_selection,-1)
    #    core_selection = (cluster_id == ic) & ~halo_selection
    #    highest_density = flex.max(rho.select(core_selection))
    #    too_sparse = core_selection & (rho.as_double() < highest_density/10.) # another heuristic
    #    if too_sparse.count(True)>0:
    #      cluster_id.set_selected(too_sparse,-1)
    self.cluster_id_final = cluster_id.deep_copy()
def get_uc_consensus(experiments_list,
                     show_plot=False,
                     save_plot=False,
                     return_only_first_indexed_model=False,
                     finalize_method='reindex_with_known_crystal_models',
                     clustering_params=None):
    '''
    Uses the Rodriguez Laio 2014 method to do a hierarchical clustering of the
    crystal models and then vote for the highest consensus crystal model.
    Input needs to be a list of experiments objects.
    Clustering code taken from github.com/cctbx-xfel/cluster_regression
    Clustering is first done based on unit cell dimensions. Then for each of
    the unit cell clusters identified, a further clustering is done based on
    the orientational matrix A.

    Args:
        experiments_list: list of experiment lists; only the first crystal of
            each entry is used.
        show_plot: display the matplotlib diagnostics interactively.
        save_plot: write the unit cell scatter plot to 'uc_cluster.png'.
        return_only_first_indexed_model: skip clustering and return the first
            crystal model.
        finalize_method: accepted for interface compatibility; not read here.
        clustering_params: parameter object providing max_percentile_rho_uc,
            max_percentile_rho_ori, Z_delta and min_datapts; defaults to the
            module-level ``clustering_iota_scope``.

    Returns:
        ``(crystal_models, cluster_ids)``.  ``cluster_ids`` has one entry per
        input experiment: the index into ``crystal_models`` of its consensus
        model, or -1 if it was not assigned.  When clustering is skipped,
        fewer than 5 cells are given, or no cluster qualifies, the result is
        ``([first crystal model], None)``.
    '''
    if return_only_first_indexed_model:
        return [experiments_list[0].crystals()[0]], None
    cells = []
    from xfel.clustering.singleframe import CellOnlyFrame
    # Flag for testing Lysozyme data from NKS. Make sure the cluster_regression
    # repository is present and configured.
    # Program will exit after plots are displayed if this flag is true
    test_nks = False
    if clustering_params is None:
        clustering_params = clustering_iota_scope
    if test_nks:
        from cctbx import crystal
        import libtbx.load_env
        cluster_regression = libtbx.env.find_in_repositories(
            relative_path="cluster_regression", test=os.path.isdir)
        file_name = os.path.join(cluster_regression, 'examples',
                                 'lysozyme1341.txt')
        # Iterate the file object directly (xreadlines() is Python 2 only)
        # and let the context manager close it.
        with open(file_name, "r") as cell_file:
            for line in cell_file:
                tokens = line.strip().split()
                unit_cell = tuple(float(x) for x in tokens[0:6])
                space_group_symbol = tokens[6]
                crystal_symmetry = crystal.symmetry(
                    unit_cell=unit_cell,
                    space_group_symbol=space_group_symbol)
                cells.append(CellOnlyFrame(crystal_symmetry))
    else:
        clustered_experiments_list = flex.int()
        for experiment in experiments_list:
            if len(experiment.crystals()) > 1:
                print('IOTA:Should have only one crystal model')
            crystal_symmetry = experiment.crystals()[0].get_crystal_symmetry()
            cells.append(CellOnlyFrame(crystal_symmetry))
            # Placeholder (-1) per experiment; it is overwritten with the
            # final cluster index further down.
            clustered_experiments_list.append(-1)

    MM = [c.mm for c in cells]  # metrical matrices
    MM_double = flex.double()
    for i in range(len(MM)):
        Tup = MM[i]
        for j in range(6):
            MM_double.append(Tup[j])
    print('There are %d cells' % len(MM))
    coord_x = flex.double([c.uc[0] for c in cells])
    coord_y = flex.double([c.uc[1] for c in cells])
    if show_plot or save_plot:
        import matplotlib
        if not show_plot:
            matplotlib.use('Agg')
        import matplotlib.pyplot as plt
        plt.plot([c.uc[0] for c in cells], [c.uc[1] for c in cells],
                 "k.",
                 markersize=3.)
        plt.axes().set_aspect("equal")
        if save_plot:
            plot_name = 'uc_cluster.png'
            plt.savefig(plot_name,
                        size_inches=(10, 10),
                        dpi=300,
                        bbox_inches='tight')
        if show_plot:
            plt.show()

    print('Now constructing a Dij matrix: Starting Unit Cell clustering')
    NN = len(MM)
    from cctbx.uctbx.determine_unit_cell import NCDist_flatten
    Dij = NCDist_flatten(MM_double)
    # The cutoff distance is always estimated from the distance matrix;
    # clustering_params.d_c is not used.
    d_c = estimate_d_c(Dij)
    print('d_c = ', d_c)
    if len(cells) < 5:
        return [experiments_list[0].crystals()[0]], None
    CM = clustering_manager(
        Dij=Dij,
        d_c=d_c,
        max_percentile_rho=clustering_params.max_percentile_rho_uc,
        Z_delta=clustering_params.Z_delta,
        strategy='strategy_3')
    n_cluster = 1 + flex.max(CM.cluster_id_final)
    print(len(cells), ' datapoints have been analyzed')
    print('%d CLUSTERS' % n_cluster)
    for i in range(n_cluster):
        item = flex.first_index(CM.cluster_id_maxima, i)
        print('Cluster %d central Unit cell = %d' % (i, item))
        cells[item].crystal_symmetry.show_summary()

    # More plots for debugging
    appcolors = [
        'b', 'r', '#ff7f0e', '#2ca02c', '#9467bd', '#8c564b', '#e377c2',
        '#7f7f7f', '#bcbd22', '#17becf'
    ]
    if show_plot:
        # Decision graph
        import matplotlib.pyplot as plt
        plt.plot(CM.rho, CM.delta, "r.", markersize=3.)
        for x in range(NN):
            if CM.cluster_id_maxima[x] >= 0:
                plt.plot([CM.rho[x]], [CM.delta[x]], "ro")
        plt.show()

    if show_plot:
        import matplotlib.pyplot as plt
        colors = [appcolors[i % 10] for i in CM.cluster_id_full]
        plt.scatter(coord_x,
                    coord_y,
                    marker='o',
                    color=colors,
                    linewidth=0.4,
                    edgecolor='k')
        for i in range(n_cluster):
            item = flex.first_index(CM.cluster_id_maxima, i)
            plt.plot([cells[item].uc[0]], cells[item].uc[1], 'y.')
        plt.axes().set_aspect("equal")
        plt.show()
    if test_nks:
        exit()

    # Now look at each unit cell cluster for orientational clustering
    # idea is to cluster the orientational component in each of the unit cell
    # clusters
    do_orientational_clustering = not return_only_first_indexed_model  # temporary.
    dxtbx_crystal_models = []
    if do_orientational_clustering:
        print('IOTA: Starting orientational clustering')
        Dij_ori = {}  # dictionary to store Dij for each cluster
        uc_experiments_list = {}  # experiments grouped by unit cell cluster
        from collections import Counter
        from scitbx.matrix import sqr
        from cctbx_orientation_ext import crystal_orientation
        uc_cluster_count = Counter(list(CM.cluster_id_final))

        # Put all experiments from the same uc cluster together and remember
        # where each one came from.
        # Mapping: key> cluster_id | value> list of
        #   (index_in_experiments_list, index_in_uc_cluster)
        CM_mapping = {}
        for i in range(len(experiments_list)):
            uc_id = CM.cluster_id_full[i]
            if uc_id not in uc_experiments_list:
                uc_experiments_list[uc_id] = []
                CM_mapping[uc_id] = []
            uc_experiments_list[uc_id].append(experiments_list[i])
            CM_mapping[uc_id].append((i, len(uc_experiments_list[uc_id]) - 1))

        for cluster in uc_cluster_count:
            # Make sure there are at least a minimum number of samples in the
            # cluster
            if uc_cluster_count[cluster] < clustering_params.min_datapts:
                continue
            Dij_ori[cluster] = flex.double([[0.0] * uc_cluster_count[cluster]] *
                                           uc_cluster_count[cluster])
            # Now populate the (symmetric) Dij_ori array
            N_samples_in_cluster = len(uc_experiments_list[cluster])
            for i in range(N_samples_in_cluster - 1):
                for j in range(i + 1, N_samples_in_cluster):
                    dij_ori = get_dij_ori(
                        uc_experiments_list[cluster][i].crystals()[0],
                        uc_experiments_list[cluster][j].crystals()[0])
                    Dij_ori[cluster][N_samples_in_cluster * i + j] = dij_ori
                    Dij_ori[cluster][N_samples_in_cluster * j + i] = dij_ori

        # Now do the orientational cluster analysis
        A_matrices = []
        for cluster in Dij_ori:
            d_c_ori = estimate_d_c(Dij_ori[cluster])
            print('d_c_ori=', d_c_ori)
            CM_ori = clustering_manager(
                Dij=Dij_ori[cluster],
                d_c=d_c_ori,
                max_percentile_rho=clustering_params.max_percentile_rho_ori,
                Z_delta=clustering_params.Z_delta,
                strategy='strategy_3')
            n_cluster_ori = 1 + flex.max(CM_ori.cluster_id_final)
            for i in range(n_cluster_ori):
                if len([zz for zz in CM_ori.cluster_id_final if zz == i
                        ]) < clustering_params.min_datapts:
                    continue
                item = flex.first_index(CM_ori.cluster_id_maxima, i)
                dxtbx_crystal_model = uc_experiments_list[cluster][
                    item].crystals()[0]
                dxtbx_crystal_models.append(dxtbx_crystal_model)
                # Map the orientational clusters to the original
                # experiments_list indices.
                # This should be the final list of clusters!
                for j, ori_cluster_id in enumerate(CM_ori.cluster_id_final):
                    if ori_cluster_id == i:
                        xx, yy = CM_mapping[cluster][j]
                        clustered_experiments_list[xx] = len(
                            dxtbx_crystal_models) - 1
                # Use a separate name for the instance so that the imported
                # crystal_orientation class is not rebound.
                model_orientation = crystal_orientation(
                    dxtbx_crystal_model.get_A(), True)
                A_direct = sqr(model_orientation.reciprocal_matrix()
                               ).transpose().inverse()
                A_matrices.append(A_direct)
                print(
                    "IOTA: Direct A matrix 1st element of orientational cluster %d  = %12.6f"
                    % (i, A_direct[0]))
                print(A_direct)
            if show_plot:
                # Decision graph
                stretch_plot_factor = 1.05  # (1+fraction of limits by which xlim,ylim should be set)
                import matplotlib.pyplot as plt
                plt.plot(CM_ori.rho, CM_ori.delta, "r.", markersize=3.)
                for x in range(len(list(CM_ori.cluster_id_final))):
                    if CM_ori.cluster_id_maxima[x] >= 0:
                        plt.plot([CM_ori.rho[x]], [CM_ori.delta[x]], "ro")
                plt.xlim([-10, stretch_plot_factor * flex.max(CM_ori.rho)])
                plt.ylim([-10, stretch_plot_factor * flex.max(CM_ori.delta)])
                plt.show()

    # FIXME Still to be worked out what exactly should be returned
    # Make sure the crystal models are not too close to each other
    # FIXME should be a PHIL
    min_angle = 5.0  # taken from indexer.py
    close_models_list = []
    # Not used really; other fixes have been made to code to figure out
    # outliers. Still keeping this in case it is useful later on.
    if len(dxtbx_crystal_models) > 10000:
        from dials.algorithms.indexing.compare_orientation_matrices import difference_rotation_matrix_axis_angle
        from cctbx_orientation_ext import crystal_orientation
        from dxtbx.model import Crystal
        for i_a in range(0, len(dxtbx_crystal_models) - 1):
            for i_b in range(i_a + 1, len(dxtbx_crystal_models)):
                cryst_a = dxtbx_crystal_models[i_a]
                cryst_b = dxtbx_crystal_models[i_b]
                cryst_a_ori = crystal_orientation(cryst_a.get_A(), True)
                cryst_b_ori = crystal_orientation(cryst_b.get_A(), True)
                try:
                    best_similarity_transform = cryst_b_ori.best_similarity_transformation(
                        other=cryst_a_ori,
                        fractional_length_tolerance=20.00,
                        unimodular_generator_range=1)
                    cryst_b_ori_best = cryst_b_ori.change_basis(
                        best_similarity_transform)
                except Exception as e:
                    # Best effort: compare the untransformed orientation.
                    cryst_b_ori_best = cryst_b_ori

                # FIXME hardcoded space group for myoglobin LS49
                cryst_b_best = Crystal(cryst_b_ori_best.direct_matrix()[0:3],
                                       cryst_b_ori_best.direct_matrix()[3:6],
                                       cryst_b_ori_best.direct_matrix()[6:9],
                                       'P 1 21 1')
                R_ab, axis, angle, cb_op_ab = difference_rotation_matrix_axis_angle(
                    cryst_a, cryst_b_best)
                # FIXME
                if abs(angle) < min_angle:  # degrees
                    close_models_list.append((i_a, i_b))

    # Now prune the dxtbx_crystal_models list
    unique_experiments_list = flex.int(range(len(dxtbx_crystal_models)))
    for close_models in close_models_list:
        i_a, i_b = close_models
        if dxtbx_crystal_models[i_a] is not None and dxtbx_crystal_models[
                i_b] is not None:
            dxtbx_crystal_models[i_b] = None
            unique_experiments_list[i_b] = i_a
            clustered_experiments_list.set_selected(
                clustered_experiments_list == i_b, i_a)

    # Renumber the surviving models consecutively.
    counter = -1
    for ii, model in enumerate(dxtbx_crystal_models):
        if model is not None:
            counter += 1
            clustered_experiments_list.set_selected(
                clustered_experiments_list == unique_experiments_list[ii],
                counter)
    dxtbx_crystal_models = [x for x in dxtbx_crystal_models if x is not None]

    if len(dxtbx_crystal_models) > 0:
        return dxtbx_crystal_models, list(clustered_experiments_list)
    else:
        # If nothing works, at least return the 1st crystal model that was
        # found
        return [experiments_list[0].crystals()[0]], None
class pdb_hierarchy_builder(crystal_symmetry_builder):
    """Build a PDB hierarchy from the ``_atom_site`` loop of an mmCIF block.

    After construction the result is available as ``self.hierarchy``.

    The recommended translation for ATOM records can be found at:
      http://mmcif.rcsb.org/dictionaries/pdb-correspondence/pdb2mmcif-2010.html#ATOM
    """

    def __init__(self, cif_block):
        """Read the atom records of ``cif_block`` into ``self.hierarchy``.

        Args:
            cif_block: CIF block providing the ``_atom_site`` items and,
                optionally, ``_atom_site_anisotrop`` items.

        Raises:
            AssertionError: if a mandatory ``_atom_site`` item is missing or
                the records are inconsistent.
            CifBuilderError: if the anisotropic displacement parameters
                cannot be converted to numbers.
        """
        crystal_symmetry_builder.__init__(self, cif_block)

        self.hierarchy = hierarchy.root()

        # These items are mandatory for the _atom_site loop, all others are
        # optional
        type_symbol = cif_block.get("_atom_site.type_symbol")
        atom_labels = cif_block.get("_atom_site.auth_atom_id")
        if atom_labels is None:
            # corresponds to chem comp atom name
            atom_labels = cif_block.get("_atom_site.label_atom_id")
        alt_id = cif_block.get(
            "_atom_site.label_alt_id")  # alternate conformer id
        label_asym_id = cif_block.get("_atom_site.label_asym_id")  # chain id
        auth_asym_id = cif_block.get("_atom_site.auth_asym_id")
        if label_asym_id is None:
            label_asym_id = auth_asym_id
        if auth_asym_id is None:
            auth_asym_id = label_asym_id
        comp_id = cif_block.get("_atom_site.auth_comp_id")
        if comp_id is None:
            comp_id = cif_block.get(
                "_atom_site.label_comp_id")  # residue name
        entity_id = cif_block.get("_atom_site.label_entity_id")
        seq_id = cif_block.get("_atom_site.auth_seq_id")
        if seq_id is None:
            seq_id = cif_block.get(
                "_atom_site.label_seq_id")  # residue number
        assert [atom_labels, alt_id, auth_asym_id, comp_id, entity_id,
                seq_id].count(None) == 0
        assert type_symbol is not None

        atom_site_fp = cif_block.get('_atom_site.phenix_scat_dispersion_real')
        atom_site_fdp = cif_block.get('_atom_site.phenix_scat_dispersion_imag')

        pdb_ins_code = cif_block.get(
            "_atom_site.pdbx_PDB_ins_code")  # insertion code
        model_ids = cif_block.get("_atom_site.pdbx_PDB_model_num")
        atom_site_id = cif_block.get("_atom_site.id")
        # only permitted values are ATOM or HETATM
        group_PDB = cif_block.get("_atom_site.group_PDB")
        # TODO: read esds
        B_iso_or_equiv = flex.double(
            cif_block.get("_atom_site.B_iso_or_equiv"))
        cart_x = flex.double(cif_block.get("_atom_site.Cartn_x"))
        cart_y = flex.double(cif_block.get("_atom_site.Cartn_y"))
        cart_z = flex.double(cif_block.get("_atom_site.Cartn_z"))
        occu = flex.double(cif_block.get("_atom_site.occupancy"))
        formal_charge = cif_block.get("_atom_site.pdbx_formal_charge")

        # anisotropic b-factors
        # TODO: read esds
        anisotrop_id = cif_block.get("_atom_site_anisotrop.id")
        adps = None
        if anisotrop_id is not None:
            u_ij = [
                cif_block.get("_atom_site_anisotrop.U[%s][%s]" %
                              (ij[0], ij[1]))
                for ij in ("11", "22", "33", "12", "13", "23")
            ]
            assert u_ij.count(None) in (0, 6)
            if u_ij.count(None) == 0:
                adps = u_ij
            else:
                # No U values at all: the tensor must be given as B values.
                assert u_ij.count(None) == 6
                b_ij = [
                    cif_block.get("_atom_site_anisotrop.B[%s][%s]" %
                                  (ij[0], ij[1]))
                    for ij in ("11", "22", "33", "12", "13", "23")
                ]
                assert b_ij.count(None) in (0, 6)
                if b_ij.count(None) == 0:
                    adps = adptbx.b_as_u(b_ij)
                assert not (u_ij.count(None) and b_ij.count(None)
                            )  # illegal for both to be present
            if adps is not None:
                try:
                    adps = [flex.double(adp) for adp in adps]
                except ValueError as e:
                    raise CifBuilderError("Error interpreting ADPs: " +
                                          str(e))
                adps = flex.sym_mat3_double(*adps)

        current_model_id = None
        current_label_asym_id = None
        current_auth_asym_id = None
        current_residue_id = None
        current_ins_code = None

        for i_atom in range(atom_labels.size()):
            # model(s)
            last_model_id = current_model_id
            current_model_id = model_ids[i_atom]
            assert current_model_id is not None
            if current_model_id != last_model_id:
                model = hierarchy.model(id=current_model_id)
                self.hierarchy.append_model(model)

            # chain(s)
            # A new chain starts when label_asym_id changes; the chain is
            # named after auth_asym_id.
            last_label_asym_id = current_label_asym_id
            current_label_asym_id = label_asym_id[i_atom]
            assert current_label_asym_id is not None
            last_auth_asym_id = current_auth_asym_id
            current_auth_asym_id = auth_asym_id[i_atom]
            if current_auth_asym_id == ".":
                current_auth_asym_id = " "
            assert current_label_asym_id is not None
            if current_label_asym_id != last_label_asym_id:
                chain = hierarchy.chain(id=current_auth_asym_id)
                model.append_chain(chain)
            else:
                assert current_auth_asym_id == last_auth_asym_id

            # residue_group(s)
            # defined by residue id and insertion code
            last_residue_id = current_residue_id
            current_residue_id = seq_id[i_atom]
            assert current_residue_id is not None
            last_ins_code = current_ins_code
            if pdb_ins_code is not None:
                current_ins_code = pdb_ins_code[i_atom]
                if current_ins_code in ("?", ".", None):
                    current_ins_code = " "
            if (current_residue_id != last_residue_id
                    or current_ins_code != last_ins_code
                    or current_label_asym_id != last_label_asym_id):
                try:
                    resseq = hy36encode(width=4,
                                        value=int(current_residue_id))
                except ValueError:
                    # Not an integer: use the raw id, which then has to fit
                    # the fixed width already.
                    resseq = current_residue_id
                    assert len(resseq) == 4
                residue_group = hierarchy.residue_group(
                    resseq=resseq, icode=current_ins_code)
                chain.append_residue_group(residue_group)
                atom_groups = OrderedDict()  # reset atom_groups cache

            # atom_group(s)
            # defined by resname and altloc id
            current_altloc = alt_id[i_atom]
            if current_altloc == ".":
                current_altloc = ""  # Main chain atoms
            current_resname = comp_id[i_atom]
            if (current_altloc, current_resname) not in atom_groups:
                atom_group = hierarchy.atom_group(altloc=current_altloc,
                                                  resname=current_resname)
                atom_groups[(current_altloc, current_resname)] = atom_group
                if current_altloc == "":
                    # The blank altloc group goes first in its residue group.
                    residue_group.insert_atom_group(0, atom_group)
                else:
                    residue_group.append_atom_group(atom_group)
            else:
                atom_group = atom_groups[(current_altloc, current_resname)]

            # atom(s)
            atom = hierarchy.atom()
            atom_group.append_atom(atom)
            atom.set_element(type_symbol[i_atom])
            atom.set_name(
                format_pdb_atom_name(atom_labels[i_atom],
                                     type_symbol[i_atom]))
            atom.set_xyz(new_xyz=(cart_x[i_atom], cart_y[i_atom],
                                  cart_z[i_atom]))
            atom.set_b(B_iso_or_equiv[i_atom])
            atom.set_occ(occu[i_atom])
            # hy36encode should go once the pdb.hierarchy has been
            # modified to no longer store fixed-width strings
            atom.set_serial(
                hy36encode(width=5, value=int(atom_site_id[i_atom])))
            # some code relies on an empty segid being 4 spaces
            atom.set_segid("    ")
            if group_PDB is not None and group_PDB[i_atom] == "HETATM":
                atom.hetero = True
            if formal_charge is not None:
                charge = formal_charge[i_atom]
                if charge not in ("?", "."):
                    # Accept the sign on either side of the number and emit
                    # it as "<magnitude><sign>".
                    if charge.endswith("-") or charge.startswith("-"):
                        sign = "-"
                    else:
                        sign = "+"
                    charge = charge.strip(" -+")
                    charge = int(charge)
                    if charge == 0:
                        sign = ""
                    atom.set_charge("%i%s" % (charge, sign))
            if atom_site_fp is not None:
                fp = atom_site_fp[i_atom]
                if fp not in ("?", "."):
                    atom.set_fp(new_fp=float(fp))
            if atom_site_fdp is not None:
                fdp = atom_site_fdp[i_atom]
                if fdp not in ("?", "."):
                    atom.set_fdp(new_fdp=float(fdp))
            if anisotrop_id is not None and adps is not None:
                # Match the anisotropic record through the atom serial;
                # atoms without a record keep their isotropic B only.
                u_ij_index = flex.first_index(anisotrop_id,
                                              atom.serial.strip())
                if u_ij_index is not None:
                    u_ij = adps[u_ij_index]
                    atom.set_uij(u_ij)
def __init__(self, **kwargs):
    """Cluster the items of a distance matrix and pick cluster centres.

    The clustering itself is delegated to
    ``xfel.clustering.Rodriguez_Laio_clustering_2014``; this constructor
    chooses which items become cluster centres and records the results.

    The body reads ``self.Dij``, ``self.d_c`` and
    ``self.max_percentile_rho`` right after calling
    ``group_args.__init__`` -- presumably that call stores ``kwargs`` as
    attributes, so callers have to pass all three (TODO confirm against
    ``group_args``).

    Attributes written:
      rho               -- result of ``R.get_rho()``
      delta             -- result of ``R.get_delta(...)``
      cluster_id_maxima -- copy of ``cluster_id`` taken when only the
                           chosen centres carry a cluster number
                           (everything else is -1)
      cluster_id_full   -- copy taken after ``R.cluster_assignment``
      cluster_id_final  -- copy taken at the very end; the halo trimming
                           between the two copies is commented out, so no
                           statement in between modifies ``cluster_id``
    """
    group_args.__init__(self, **kwargs)
    print('finished Dij, now calculating rho_i and density')
    from xfel.clustering import Rodriguez_Laio_clustering_2014 as RL
    R = RL(distance_matrix=self.Dij, d_c=self.d_c)
    #from IPython import embed; embed(); exit()
    #from clustering.plot_with_dimensional_embedding import plot_with_dimensional_embedding
    #plot_with_dimensional_embedding(1-self.Dij/flex.max(self.Dij), show_plot=True)
    self.rho = rho = R.get_rho()
    # NOTE(review): ave_rho is computed but never read in this method.
    ave_rho = flex.mean(rho.as_double())
    # Number of items = first dimension of the distance matrix.
    NN = self.Dij.focus()[0]
    i_max = flex.max_index(rho)
    # Largest distance from the item with the highest rho to any item;
    # handed to get_delta() as delta_i_max.
    delta_i_max = flex.max(
        flex.double([self.Dij[i_max, j] for j in range(NN)]))
    rho_order = flex.sort_permutation(rho, reverse=True)
    rho_order_list = list(rho_order)
    self.delta = delta = R.get_delta(rho_order=rho_order,
                                     delta_i_max=delta_i_max)
    cluster_id = flex.int(NN, -1)  # -1 means no cluster
    delta_order = flex.sort_permutation(delta, reverse=True)
    MAX_PERCENTILE_RHO = self.max_percentile_rho  # cluster centers have to be in the top percentile
    n_cluster = 0
    #
    pick_top_solution = False
    rho_stdev = flex.mean_and_variance(
        rho.as_double()).unweighted_sample_standard_deviation()
    delta_stdev = flex.mean_and_variance(
        delta).unweighted_sample_standard_deviation()
    if rho_stdev != 0.0 and delta_stdev != 0:
        # Standardise rho and delta so both can be compared against the
        # same cutoff below.
        rho_z = (rho.as_double() - flex.mean(rho.as_double())) / (rho_stdev)
        delta_z = (delta - flex.mean(delta)) / (delta_stdev)
    else:
        # One of the two has zero spread, so z-scores are undefined:
        # fall back to a single centroid taken from the other quantity.
        pick_top_solution = True
        if rho_stdev == 0.0:
            centroids = [flex.first_index(delta, flex.max(delta))]
        elif delta_stdev == 0.0:
            centroids = [flex.first_index(rho, flex.max(rho))]
    significant_delta = []
    significant_rho = []
    # Hard-wired switch: True selects the z-score based centre picking,
    # which makes the ``else`` branch further down unreachable.
    debug_fix_clustering = True
    if debug_fix_clustering:
        if not pick_top_solution:
            # The cutoff is 1.0 or the maximum z-score, whichever is
            # smaller, so at least one item passes each test.
            delta_z_cutoff = min(1.0, max(delta_z))
            rho_z_cutoff = min(1.0, max(rho_z))
            for ic in range(NN):
                # test the density & rho
                if delta_z[ic] >= delta_z_cutoff:
                    significant_delta.append(ic)
                if rho_z[ic] >= rho_z_cutoff:
                    significant_rho.append(ic)
            # Candidates must pass both the delta and the rho test.
            centroid_candidates = list(
                set(significant_delta).intersection(set(significant_rho)))
            # Now compare the relative orders of the max delta_z and max rho_z to make sure they are within 1 stdev
            centroids = []
            max_delta_z_candidates = -999.9
            max_rho_z_candidates = -999.9
            for ic in centroid_candidates:
                if delta_z[ic] > max_delta_z_candidates:
                    max_delta_z_candidates = delta_z[ic]
                if rho_z[ic] > max_rho_z_candidates:
                    max_rho_z_candidates = rho_z[ic]
            for ic in centroid_candidates:
                if max_delta_z_candidates - delta_z[
                        ic] < 1.0 and max_rho_z_candidates - rho_z[
                            ic] < 1.0:
                    centroids.append(ic)
        # NOTE(review): the centres are taken as the first len(centroids)
        # entries of delta_order; the values stored in ``centroids`` are
        # not used as indices here -- confirm this is intended.
        item_idxs = [
            delta_order[ic] for ic, centroid in enumerate(centroids)
        ]
        for item_idx in item_idxs:
            cluster_id[item_idx] = n_cluster
            print('CLUSTERING_STATS', item_idx, cluster_id[item_idx])
            n_cluster += 1
            ####
    else:
        # Alternative selection (unreachable while debug_fix_clustering is
        # True): walk the items in order of decreasing delta and accept
        # those whose rho rank is within the top MAX_PERCENTILE_RHO.
        for ic in range(NN):
            item_idx = delta_order[ic]
            if ic != 0:
                if delta[item_idx] <= 0.25 * delta[
                        delta_order[0]]:  # too low to be a medoid
                    continue
            item_rho_order = rho_order_list.index(item_idx)
            if (item_rho_order) / NN < MAX_PERCENTILE_RHO:
                cluster_id[item_idx] = n_cluster
                print('CLUSTERING_STATS', ic, item_idx, item_rho_order,
                      cluster_id[item_idx])
                n_cluster += 1
    ###
    #
    #
    print('Found %d clusters' % n_cluster)
    for x in range(NN):
        if cluster_id[x] >= 0:
            print("XC", x, cluster_id[x], rho[x], delta[x])
    # Snapshot with only the centres labelled.
    self.cluster_id_maxima = cluster_id.deep_copy()
    # presumably fills in cluster_id for the remaining items in place
    # (a fresh copy is taken right after) -- TODO confirm
    R.cluster_assignment(rho_order, cluster_id)
    self.cluster_id_full = cluster_id.deep_copy()
    #halo = flex.bool(NN,False)
    #border = R.get_border( cluster_id = cluster_id )
    #for ic in range(n_cluster): #loop thru all border regions; find highest density
    #  this_border = (cluster_id == ic) & (border==True)
    #  if this_border.count(True)>0:
    #    highest_density = flex.max(rho.select(this_border))
    #    halo_selection = (rho < highest_density) & (this_border==True)
    #    if halo_selection.count(True)>0:
    #      cluster_id.set_selected(halo_selection,-1)
    #    core_selection = (cluster_id == ic) & ~halo_selection
    #    highest_density = flex.max(rho.select(core_selection))
    #    too_sparse = core_selection & (rho.as_double() < highest_density/10.) # another heuristic
    #    if too_sparse.count(True)>0:
    #      cluster_id.set_selected(too_sparse,-1)
    self.cluster_id_final = cluster_id.deep_copy()
class crystal_structure_builder(crystal_symmetry_builder):
    """Build an ``xray.structure`` from the ``_atom_site_*`` items of a CIF block.

    After construction the result is available as ``self.structure``; the
    crystal symmetry comes from the ``crystal_symmetry_builder`` base class.
    """

    def __init__(self, cif_block):
        """Read coordinates, labels, ADPs and occupancies from ``cif_block``.

        Fractional coordinates (``_atom_site_fract_*``) are preferred; if
        none of the three columns is usable the Cartesian columns
        (``_atom_site_Cartn_*``) are read and fractionalized with the unit
        cell.

        Raises:
          CifBuilderError: if no complete coordinate set is found, if only
            part of the six anisotropic ADP columns is present, or if the
            ADP values cannot be converted to numbers.
        """
        # XXX To do: interpret _atom_site_refinement_flags
        crystal_symmetry_builder.__init__(self, cif_block, strict=True)
        # A single comprehension over ``axis`` keeps column_name in step
        # with the column actually read.  The previous nested form relied
        # on the inner loop variable leaking out of its comprehension
        # (always 'z' on Python 2, NameError on Python 3).
        atom_sites_frac = [
            as_double_or_none_if_all_question_marks(
                cif_block.get('_atom_site_fract_%s' % axis),
                column_name='_atom_site_fract_%s' % axis)
            for axis in ('x', 'y', 'z')
        ]
        if atom_sites_frac.count(None) == 3:
            atom_sites_cart = [
                as_double_or_none_if_all_question_marks(
                    cif_block.get('_atom_site_Cartn_%s' % axis),
                    column_name='_atom_site_Cartn_%s' % axis)
                for axis in ('x', 'y', 'z')
            ]
            if atom_sites_cart.count(None) != 0:
                raise CifBuilderError("No atomic coordinates could be found")
            atom_sites_cart = flex.vec3_double(*atom_sites_cart)
            # XXX do we need to take account of _atom_sites_Cartn_tran_matrix_ ?
            atom_sites_frac = self.crystal_symmetry.unit_cell().fractionalize(
                atom_sites_cart)
        else:
            if atom_sites_frac.count(None) != 0:
                raise CifBuilderError("No atomic coordinates could be found")
            atom_sites_frac = flex.vec3_double(*atom_sites_frac)

        labels = cif_block.get('_atom_site_label')
        type_symbol = cif_block.get('_atom_site_type_symbol')
        U_iso_or_equiv = flex_double_else_none(
            cif_block.get('_atom_site_U_iso_or_equiv',
                          cif_block.get('_atom_site_U_equiv_geom_mean')))
        # B values are only looked up when no U column is available.
        B_iso_or_equiv = None
        if U_iso_or_equiv is None:
            B_iso_or_equiv = flex_double_else_none(
                cif_block.get('_atom_site_B_iso_or_equiv',
                              cif_block.get('_atom_site_B_equiv_geom_mean')))
        adp_type = cif_block.get('_atom_site_adp_type')  # currently unused
        occupancy = flex_double_else_none(
            cif_block.get('_atom_site_occupancy'))
        scatterers = flex.xray_scatterer()

        atom_site_aniso_label = flex_std_string_else_none(
            cif_block.get('_atom_site_aniso_label'))
        adps = None
        have_Bs = False
        if atom_site_aniso_label is not None:
            adps = [
                cif_block.get('_atom_site_aniso_U_%i' % i)
                for i in (11, 22, 33, 12, 13, 23)
            ]
            if adps.count(None) > 0:
                # No complete set of U columns: try the B columns instead.
                adps = [
                    cif_block.get('_atom_site_aniso_B_%i' % i)
                    for i in (11, 22, 33, 12, 13, 23)
                ]
                have_Bs = True
            if adps.count(None) == 6:
                adps = None
            elif adps.count(None) > 0:
                # The exception used to be constructed but not raised, so
                # a partial ADP set silently fell through to the loop
                # below with a list that still contained None.
                raise CifBuilderError("Some ADP items are missing")
            else:
                # Drop the rows in which all six components are "?".
                sel = None
                for adp in adps:
                    f = (adp == "?")
                    if sel is None:
                        sel = f
                    else:
                        sel &= f
                sel = ~sel
                atom_site_aniso_label = atom_site_aniso_label.select(sel)
                try:
                    adps = [flex.double(adp.select(sel)) for adp in adps]
                except ValueError as e:
                    raise CifBuilderError(
                        "Error interpreting ADPs: " + str(e))
                adps = flex.sym_mat3_double(*adps)

        for i in range(len(atom_sites_frac)):
            kwds = {}
            if labels is not None:
                kwds.setdefault('label', str(labels[i]))
            if type_symbol is not None:
                kwds.setdefault('scattering_type', str(type_symbol[i]))
            if (atom_site_aniso_label is not None and adps is not None
                    and labels is not None
                    and labels[i] in atom_site_aniso_label):
                # Anisotropic ADPs are matched to atoms by label.
                adp = adps[flex.first_index(atom_site_aniso_label, labels[i])]
                if have_Bs:
                    adp = adptbx.b_as_u(adp)
                kwds.setdefault(
                    'u',
                    adptbx.u_cif_as_u_star(self.crystal_symmetry.unit_cell(),
                                           adp))
            elif U_iso_or_equiv is not None:
                kwds.setdefault('u', float_from_string(U_iso_or_equiv[i]))
            elif B_iso_or_equiv is not None:
                kwds.setdefault('b', float_from_string(B_iso_or_equiv[i]))
            if occupancy is not None:
                kwds.setdefault('occupancy', float_from_string(occupancy[i]))
            scatterers.append(xray.scatterer(**kwds))
        scatterers.set_sites(atom_sites_frac)

        special_position_settings = crystal.special_position_settings(
            crystal_symmetry=self.crystal_symmetry,
            min_distance_sym_equiv=0.0001)
        self.structure = xray.structure(
            special_position_settings=special_position_settings,
            scatterers=scatterers)
def run(args):
    """Tutorial walk-through of reading PDBx/mmCIF files with iotbx.

    Args:
      args: list of mmCIF file paths and/or PDB ids.  An argument that is
        not an existing file is treated as a PDB id and fetched with
        ``iotbx.pdb.fetch.get_pdb``.  An empty list defaults to
        ``["1hbb"]``.

    For every argument a summary is printed to stdout.  All output goes
    through single-argument ``print(...)`` calls, which produce the same
    text on Python 2 and Python 3 (the former ``print`` statements are a
    SyntaxError on Python 3).
    """
    if len(args) == 0:
        args = ["1hbb"]
    for arg in args:
        import iotbx.pdb.fetch
        if os.path.isfile(arg):
            mmcif_file = arg
            pdb_id = os.path.splitext(os.path.basename(mmcif_file))[0]
            iotbx.pdb.fetch.validate_pdb_id(pdb_id)
        else:
            # download pdbx/mmcif file from the PDB
            pdb_id = arg
            mirror = "pdbe"
            mmcif_file = iotbx.pdb.fetch.get_pdb(
                pdb_id, data_type="pdb", mirror=mirror, log=sys.stdout,
                format="cif")

        # read the cif file and get an iotbx.cif object
        import iotbx.cif
        cif_reader = iotbx.cif.reader(file_path=mmcif_file)
        cif_object = cif_reader.model()
        cif_block = cif_object[pdb_id]
        # get single items from cif_block
        print("PDB id: %s" % (cif_block["_entry.id"],))
        # get a looped item from cif_block
        print("Authors:")
        for author in cif_block.get_looped_item("_citation_author.name"):
            print(author)
        print("")
        print("Molecular Entities:")
        for pdbx_entity in cif_block.get_looped_item(
                "_entity.pdbx_description"):
            print(pdbx_entity)
        print("")

        # extract crystal symmetry information
        import iotbx.cif.builders
        builder = iotbx.cif.builders.crystal_symmetry_builder(cif_block)
        builder.crystal_symmetry.show_summary()

        # 1) this works also for .pdb files, but re-reads the file
        import iotbx.pdb
        pdb_input = iotbx.pdb.input(file_name=mmcif_file)
        hierarchy = pdb_input.construct_hierarchy()

        # 2) This only works for mmcif files, but re-uses the cif_object from above:
        import iotbx.pdb.mmcif
        pdb_input = iotbx.pdb.mmcif.cif_input(cif_object=cif_object)
        hierarchy = pdb_input.construct_hierarchy()

        # some convenience methods of pdb_input object
        print("Software: %s" % (pdb_input.get_program_name(),))
        print("Experiment type: %s" % (pdb_input.get_experiment_type(),))
        print("Solvent content: %s" % (pdb_input.get_solvent_content(),))
        print("Deposition date: %s" % (pdb_input.deposition_date(),))
        r_rfree_sigma = pdb_input.get_r_rfree_sigma(mmcif_file)
        print("R-work/R-free: %s/%s" % (r_rfree_sigma.r_work,
                                        r_rfree_sigma.r_free))
        # can also get crystal_symmetry from pdb_input object
        crystal_symmetry = pdb_input.crystal_symmetry()

        print("")
        hierarchy.overall_counts().show()
        # level_id can be "model", "chain", "residue_group", "atom_group" or "atom"
        hierarchy.show(level_id="chain")
        # for a more detailed example of interacting with a pdb.hierarchy object,
        # see iotbx/examples/pdb_hierarchy.py

        # extract atom sites
        atoms = hierarchy.atoms()
        sites_cart = atoms.extract_xyz()
        print("")
        # Show at most the first ten atoms; very small models have fewer.
        for i in range(min(10, atoms.size())):
            print("%s %s" % (atoms[i].id_str(), atoms[i].xyz))
        print("")

        # read some sequence information
        entity_poly_entity_id = cif_block.get_looped_item(
            "_entity_poly.entity_id")
        entity_id = cif_block.get_looped_item("_entity.id")
        entity_pdbx_description = cif_block.get_looped_item(
            "_entity.pdbx_description")
        entity_poly_one_letter_code = cif_block.get_looped_item(
            "_entity_poly.pdbx_seq_one_letter_code")

        from cctbx.array_family import flex
        for i in range(len(entity_poly_one_letter_code)):
            sequence = "".join(entity_poly_one_letter_code[i].split())
            # Row of the _entity loop that this _entity_poly row refers to.
            idx = flex.first_index(entity_id, entity_poly_entity_id[i])
            if idx is None:
                # No matching _entity row: report the sequence without a
                # description rather than failing on entity_id[None].
                print("%s ? %s" % (entity_poly_entity_id[i], sequence))
                continue
            # The description lives in the _entity loop, so it has to be
            # indexed with idx (the _entity row), not with the
            # _entity_poly row i; the two differ whenever the polymer
            # entities are not the leading rows of _entity.
            print("%s %s %s" % (entity_id[idx], entity_pdbx_description[idx],
                                sequence))
def run_detail(show_plot, save_plot):
    """Cluster the unit cells listed in the file named by ``sys.argv[1]``.

    Each line of the input file is split on whitespace and handed to
    ``CellOnlyFrame(args=tokens, path=None)``.  The metrical matrices of
    all cells are flattened, a distance matrix is computed with
    ``NCDist_flatten`` and clustered with ``clustering_manager``.  A
    per-cluster summary is printed to stdout.

    Args:
      show_plot: if true, display the scatter plot of the cells, the
        decision graph and the two cluster plots interactively.
      save_plot: if true, save the initial scatter plot with
        ``plt.savefig``.

    All output goes through single-argument ``print(...)`` calls, which
    produce the same text on Python 2 and Python 3.
    """
    P = Profiler("0. Read data")
    import sys
    file_name = sys.argv[1]
    from xfel.clustering.singleframe import CellOnlyFrame
    cells = []
    # Iterate over the file object directly inside ``with``: the handle is
    # closed deterministically, and file.xreadlines() no longer exists on
    # Python 3.
    with open(file_name, "r") as cell_file:
        for line in cell_file:
            tokens = line.strip().split()
            cells.append(CellOnlyFrame(args=tokens, path=None))
    MM = [c.mm for c in cells]  # get all metrical matrices
    MM_double = flex.double()
    for Tup in MM:
        for j in range(6):
            MM_double.append(Tup[j])

    print("There are %d cells X" % (len(MM)))
    # Indices into c.uc of the two unit-cell parameters used as plot axes.
    CX = 0
    CY = 3
    coord_x = flex.double([c.uc[CX] for c in cells])
    coord_y = flex.double([c.uc[CY] for c in cells])
    if show_plot or save_plot:
        import matplotlib
        if not show_plot:
            # http://matplotlib.org/faq/howto_faq.html#generate-images-without-having-a-window-appear
            matplotlib.use('Agg')  # use a non-interactive backend
        from matplotlib import pyplot as plt
        plt.plot(coord_x, coord_y, "k.", markersize=3.)
        #plt.axes().set_aspect("equal")
        if save_plot:
            # NOTE(review): plot_name is not defined in this function; it
            # has to exist at module scope or this raises NameError.
            plt.savefig(plot_name,
                        size_inches=(10, 10),
                        dpi=300,
                        bbox_inches='tight')
        if show_plot:
            plt.show()

    print("Now constructing a Dij matrix.")
    P = Profiler("1. compute Dij matrix")
    NN = len(MM)
    from cctbx.uctbx.determine_unit_cell import NCDist_matrix, NCDist_flatten
    #Dij = NCDist_matrix(MM_double)
    Dij = NCDist_flatten(MM_double)
    #from cctbx.uctbx.determine_unit_cell import NCDist # can this be refactored with MPI?
    #Dij = flex.double(flex.grid(NN,NN))
    #for i in xrange(NN):
    #  for j in xrange(i+1,NN):
    #    Dij[i,j] = NCDist(MM[i], MM[j])
    del P

    d_c = 10000  # the distance cutoff, such that average item neighbors 1-2% of all items
    CM = clustering_manager(Dij=Dij, d_c=d_c)

    # Summarize the results here
    n_cluster = 1 + flex.max(CM.cluster_id_final)
    print("%d have been analyzed" % len(cells))
    print("# ------------ %d CLUSTERS ----------------" % (n_cluster))
    for i in range(n_cluster):
        # Index of the item chosen as centre of cluster i.
        item = flex.first_index(CM.cluster_id_maxima, i)
        print("Cluster %d. Central unit cell: item %d" % (i, item))
        cells[item].crystal_symmetry.show_summary()
        print("Cluster has %d items, or %d after trimming borders" % (
            (CM.cluster_id_full == i).count(True),
            (CM.cluster_id_final == i).count(True)))
        print("")

    appcolors = [
        'b', 'r', '#ff7f0e', '#2ca02c', '#9467bd', '#8c564b', '#e377c2',
        '#7f7f7f', '#bcbd22', '#17becf'
    ]
    if show_plot:
        #Decision graph
        from matplotlib import pyplot as plt
        plt.plot(CM.rho, CM.delta, "r.", markersize=3.)
        for x in range(NN):
            if CM.cluster_id_maxima[x] >= 0:
                plt.plot([CM.rho[x]], [CM.delta[x]], "ro")
        plt.show()

        #No-halo plot
        from matplotlib import pyplot as plt
        colors = [appcolors[i % 10] for i in CM.cluster_id_full]
        plt.scatter(coord_x, coord_y, marker='o', color=colors,
                    linewidths=0.4, edgecolor='k')
        for i in range(n_cluster):
            item = flex.first_index(CM.cluster_id_maxima, i)
            plt.plot([cells[item].uc[CX]], [cells[item].uc[CY]], 'y.')
        #plt.axes().set_aspect("equal")
        plt.show()

        #Final plot
        halo = (CM.cluster_id_final == -1)  # items left without a cluster
        core = ~halo
        plt.plot(coord_x.select(halo), coord_y.select(halo), "k.")
        colors = [appcolors[i % 10] for i in CM.cluster_id_final.select(core)]
        plt.scatter(coord_x.select(core), coord_y.select(core), marker="o",
                    color=colors, linewidths=0.4, edgecolor='k')
        for i in range(n_cluster):
            item = flex.first_index(CM.cluster_id_maxima, i)
            plt.plot([cells[item].uc[CX]], [cells[item].uc[CY]], 'y.')
        #plt.axes().set_aspect("equal")
        plt.show()
def validate_loop(self, loop, block): list_category = None for key, value in loop.iteritems(): try: definition = self.get_definition(key) except KeyError: continue self.validate_enumeration(key, value, definition) self.validate_dependent(key, block, definition) self.validate_related(key, block, definition) _list = definition.get("_list") if self.DDL_version == 1 and _list in ('no', None): self.report_error(2501, key=key) # not allowed in list definition_category = definition.category if (definition_category is not None and not isinstance(definition_category, basestring)): definition_name = definition.name i = flex.first_index(definition_name, key) definition_category = definition_category[i] if list_category is None: list_category = definition_category elif (isinstance(list_category, basestring) and definition_category is not None and list_category != definition_category): print list_category, list(definition_category) self.report_error(2502, key=key) # multiple categories in loop mandatory = definition.mandatory == 'yes' references = definition.get('_list_reference') if references is not None: if isinstance(references, basestring): references = [references] for reference in references: ref_data = self.get_definition(reference) ref_names = ref_data['_name'] if isinstance(ref_names, basestring): ref_names = [ref_names] for name in ref_names: if name not in loop: self.report_error(2505, key=key, reference=name) # missing _list_reference elif (self.DDL_version == 2 and isinstance(definition.category, basestring)): category_def = self.get_definition(definition.category) if category_def.category_key is not None: category_keys = category_def.category_key if isinstance(category_keys, basestring): category_keys = [category_keys] for cat_key in category_keys: cat_key_def = self.get_definition(cat_key) if (cat_key_def.mandatory == 'yes' and isinstance(cat_key_def.mandatory, basestring) and cat_key_def.name not in block): self.report_error( 2203, key=cat_key_def.name, 
category=definition.category) # link_parent = definition.get( '_list_link_parent', self.child_parent_relations.get(key)) if link_parent is not None: parent_values = loop.get(link_parent, block.get(link_parent)) if parent_values is not None: for v in loop[key]: if v != '.' and v not in parent_values: # missing parent value self.report_error(2503, value=v, child=key, parent=link_parent) else: self.report_error(2504, child=key, parent=link_parent) # missing parent
def get_uc_consensus(experiments_list,
                     show_plot=False,
                     return_only_first_indexed_model=False,
                     finalize_method=None,
                     clustering_params=None):
    '''
    Uses the Rodriguez Laio 2014 method to do a clustering of the unit cells
    and then vote for the highest consensus unit cell. Input needs to be a
    list of experiments object.
    Clustering code taken from github.com/cctbx-xfel/cluster_regression

    Steps:
      1. cluster the unit cells (distance matrix from NCDist_flatten);
      2. inside every unit-cell cluster with at least 5 members, cluster
         the crystal orientations (distances from get_dij_ori);
      3. take the central crystal model of every orientational cluster
         with at least 5 members and drop models that lie within
         min_angle degrees of a later one.

    Args:
      experiments_list: sequence of experiments; each must provide
        crystals().
      show_plot: if true, show the diagnostic matplotlib plots.
      return_only_first_indexed_model: if true, skip all clustering and
        return the first crystal of the first experiment.
      finalize_method, clustering_params: accepted but not used here.

    Returns:
      (crystal_models, None). crystal_models falls back to the first
      crystal of the first experiment when there are fewer than 5 cells or
      when no orientational cluster qualifies.
    '''
    if return_only_first_indexed_model:
        return [experiments_list[0].crystals()[0]], None
    cells = []
    from xfel.clustering.singleframe import CellOnlyFrame
    save_plot = False
    # Flag for testing Lysozyme data from NKS.Make sure cluster_regression repository is present and configured
    # Program will exit after plots are displayed if this flag is true
    test_nks = False
    if test_nks:
        from cctbx import crystal
        import libtbx.load_env
        cluster_regression = libtbx.env.find_in_repositories(
            relative_path="cluster_regression", test=os.path.isdir)
        file_name = os.path.join(cluster_regression, 'examples',
                                 'lysozyme1341.txt')
        # Iterate the file object directly: file.xreadlines() does not
        # exist on Python 3, and ``with`` closes the handle.
        with open(file_name, "r") as cell_file:
            for line in cell_file:
                tokens = line.strip().split()
                unit_cell = tuple(float(x) for x in tokens[0:6])
                space_group_symbol = tokens[6]
                crystal_symmetry = crystal.symmetry(
                    unit_cell=unit_cell,
                    space_group_symbol=space_group_symbol)
                cells.append(CellOnlyFrame(crystal_symmetry))
    else:
        for experiment in experiments_list:
            if len(experiment.crystals()) > 1:
                print('IOTA:Should have only one crystal model')
            crystal_symmetry = experiment.crystals()[0].get_crystal_symmetry()
            cells.append(CellOnlyFrame(crystal_symmetry))
    MM = [c.mm for c in cells]  # metrical matrices
    MM_double = flex.double()
    for Tup in MM:
        for j in range(6):
            MM_double.append(Tup[j])
    print('There are %d cells' % len(MM))
    coord_x = flex.double([c.uc[0] for c in cells])
    coord_y = flex.double([c.uc[1] for c in cells])
    if show_plot or save_plot:
        import matplotlib
        if not show_plot:
            matplotlib.use('Agg')
        import matplotlib.pyplot as plt
        #from IPython import embed; embed(); exit()
        plt.plot([c.uc[0] for c in cells], [c.uc[1] for c in cells], "k.",
                 markersize=3.)
        plt.axes().set_aspect("equal")
    if save_plot:
        plot_name = 'uc_cluster.png'
        plt.savefig(plot_name,
                    size_inches=(10, 10),
                    dpi=300,
                    bbox_inches='tight')
    if show_plot:
        plt.show()
    print('Now constructing a Dij matrix: Starting Unit Cell clustering')
    NN = len(MM)
    from cctbx.uctbx.determine_unit_cell import NCDist_flatten
    Dij = NCDist_flatten(MM_double)
    # The distance cutoff is the standard deviation of all distances.
    d_c = flex.mean_and_variance(
        Dij.as_1d()).unweighted_sample_standard_deviation()  #6.13
    #FIXME should be a PHIL param
    if len(cells) < 5:
        return [experiments_list[0].crystals()[0]], None
    CM = clustering_manager(Dij=Dij, d_c=d_c, max_percentile_rho=0.95)
    n_cluster = 1 + flex.max(CM.cluster_id_final)
    print(len(cells), ' datapoints have been analyzed')
    print('%d CLUSTERS' % n_cluster)
    for i in range(n_cluster):
        item = flex.first_index(CM.cluster_id_maxima, i)
        print('Cluster %d central Unit cell = %d' % (i, item))
        cells[item].crystal_symmetry.show_summary()

    # More plots for debugging
    appcolors = [
        'b', 'r', '#ff7f0e', '#2ca02c', '#9467bd', '#8c564b', '#e377c2',
        '#7f7f7f', '#bcbd22', '#17becf'
    ]
    if show_plot:
        # Decision graph
        import matplotlib.pyplot as plt
        plt.plot(CM.rho, CM.delta, "r.", markersize=3.)
        for x in range(NN):
            if CM.cluster_id_maxima[x] >= 0:
                plt.plot([CM.rho[x]], [CM.delta[x]], "ro")
        plt.show()

    if show_plot:
        import matplotlib.pyplot as plt
        colors = [appcolors[i % 10] for i in CM.cluster_id_full]
        plt.scatter(coord_x, coord_y, marker='o', color=colors,
                    linewidth=0.4, edgecolor='k')
        for i in range(n_cluster):
            item = flex.first_index(CM.cluster_id_maxima, i)
            plt.plot([cells[item].uc[0]], [cells[item].uc[1]], 'y.')
        plt.axes().set_aspect("equal")
        plt.show()
    if test_nks:
        exit()

    # Now look at each unit cell cluster for orientational clustering
    # idea is to cluster the orientational component in each of the unit cell clusters
    #
    do_orientational_clustering = not return_only_first_indexed_model  # temporary.
    dxtbx_crystal_models = []
    if do_orientational_clustering:
        print('IOTA: Starting orientational clustering')
        Dij_ori = {}  # dictionary to store Dij for each cluster
        uc_experiments_list = {}  # dictionary to store experiments_lists for each cluster
        from collections import Counter
        uc_cluster_count = Counter(list(CM.cluster_id_final))
        from scitbx.matrix import sqr
        from cctbx_orientation_ext import crystal_orientation
        # Put all experiments list from same uc cluster together
        for i in range(len(experiments_list)):
            if CM.cluster_id_full[i] not in uc_experiments_list:
                uc_experiments_list[CM.cluster_id_full[i]] = []
            uc_experiments_list[CM.cluster_id_full[i]].append(
                experiments_list[i])
        for cluster in uc_cluster_count:
            # Make sure there are atleast a minimum number of samples in the cluster
            if uc_cluster_count[cluster] < 5:
                continue
            # instantiate the Dij_ori array; it is addressed below with a
            # flat row-major index
            Dij_ori[cluster] = flex.double(
                [[0.0] * uc_cluster_count[cluster]] *
                uc_cluster_count[cluster])
            # Now populate the Dij_ori array
            N_samples_in_cluster = len(uc_experiments_list[cluster])
            for i in range(N_samples_in_cluster - 1):
                for j in range(i + 1, N_samples_in_cluster):
                    dij_ori = get_dij_ori(
                        uc_experiments_list[cluster][i].crystals()[0],
                        uc_experiments_list[cluster][j].crystals()[0])
                    Dij_ori[cluster][N_samples_in_cluster * i + j] = dij_ori
                    Dij_ori[cluster][N_samples_in_cluster * j + i] = dij_ori

        # Now do the orientational cluster analysis
        #from IPython import embed; embed(); exit()
        # NOTE(review): this import is only needed by the commented-out
        # call below, yet it makes exafel_project a hard requirement of
        # this code path.
        from exafel_project.ADSE13_25.clustering.plot_with_dimensional_embedding import plot_with_dimensional_embedding
        #plot_with_dimensional_embedding(1-Dij_ori[1]/flex.max(Dij_ori[1]), show_plot=True)
        for cluster in Dij_ori:
            d_c_ori = flex.mean_and_variance(
                Dij_ori[cluster].as_1d()
            ).unweighted_sample_standard_deviation()
            CM_ori = clustering_manager(Dij=Dij_ori[cluster],
                                        d_c=d_c_ori,
                                        max_percentile_rho=0.85)
            n_cluster_ori = 1 + flex.max(CM_ori.cluster_id_final)
            #from IPython import embed; embed()
            #FIXME should be a PHIL param
            for i in range(n_cluster_ori):
                if len([zz for zz in CM_ori.cluster_id_final if zz == i]) < 5:
                    continue
                item = flex.first_index(CM_ori.cluster_id_maxima, i)
                dxtbx_crystal_model = uc_experiments_list[cluster][
                    item].crystals()[0]
                dxtbx_crystal_models.append(dxtbx_crystal_model)
                # Use a separate name for the instance so the imported
                # crystal_orientation class is not shadowed.
                orientation = crystal_orientation(
                    dxtbx_crystal_model.get_A(), True)
                A_direct = sqr(
                    orientation.reciprocal_matrix()).transpose().inverse()
                print(
                    "IOTA: Direct A matrix 1st element of orientational cluster %d = %12.6f"
                    % (i, A_direct[0]))
            if show_plot:
                # Decision graph
                stretch_plot_factor = 1.05  # (1+fraction of limits by which xlim,ylim should be set)
                import matplotlib.pyplot as plt
                plt.plot(CM_ori.rho, CM_ori.delta, "r.", markersize=3.)
                for x in range(len(list(CM_ori.cluster_id_final))):
                    if CM_ori.cluster_id_maxima[x] >= 0:
                        plt.plot([CM_ori.rho[x]], [CM_ori.delta[x]], "ro")
                #from IPython import embed; embed(); exit()
                plt.xlim([-10, stretch_plot_factor * flex.max(CM_ori.rho)])
                plt.ylim([-10, stretch_plot_factor * flex.max(CM_ori.delta)])
                plt.show()

    # Make sure the crystal models are not too close to each other
    # FIXME should be a PHIL
    min_angle = 5.0  # taken from indexer.py
    close_models_list = []
    if len(dxtbx_crystal_models) > 1:
        from dials.algorithms.indexing.compare_orientation_matrices import difference_rotation_matrix_axis_angle
        for i_a in range(0, len(dxtbx_crystal_models) - 1):
            # Start at i_a + 1: comparing a model with itself gives an
            # angle of 0, which flagged every model as "too close" and
            # pruned all but the last one.
            for i_b in range(i_a + 1, len(dxtbx_crystal_models)):
                cryst_a = dxtbx_crystal_models[i_a]
                cryst_b = dxtbx_crystal_models[i_b]
                R_ab, axis, angle, cb_op_ab = difference_rotation_matrix_axis_angle(
                    cryst_a, cryst_b)
                # FIXME
                if abs(angle) < min_angle:  # degrees
                    close_models_list.append((i_a, i_b))

    # Now prune the dxtbx_crystal_models list: of two close models that
    # are both still present, the earlier one is dropped.
    for i_a, i_b in close_models_list:
        if (dxtbx_crystal_models[i_a] is not None
                and dxtbx_crystal_models[i_b] is not None):
            dxtbx_crystal_models[i_a] = None

    dxtbx_crystal_models = [x for x in dxtbx_crystal_models if x is not None]
    if len(dxtbx_crystal_models) > 0:
        return dxtbx_crystal_models, None
    else:
        # If nothing works, atleast return the 1st crystal model that was found
        return [experiments_list[0].crystals()[0]], None