def __drawOrigCoords(origCoords):
    """Draw the original pre-rotamerize coordinates as a generic object

    ARGUMENTS:
        origCoords - a chain object containing the atoms to be drawn
    RETURNS:
        drawNum    - the generic object number
    EFFECTS:
        creates and displays a new Coot generic object
    """

    #make the generic display object and switch it on
    drawNum = new_generic_object_number("Pre-rotamerize coordinates")
    set_display_generic_object(drawNum, 1)

    #one entry per bond, built as (first atom coords) + (second atom coords)
    #NOTE(review): assumes the coordinates are plain lists so that + concatenates
    #them into the six numbers unpacked below - confirm against the chain class
    segments = []

    #O3' coordinates of the preceding nucleotide, or None if it had no O3'
    lastO3 = None

    for nuc in origCoords.nucs:

        #bonds inside this nucleotide, skipping any bond with a missing atom
        bondPairs = BOND_LIST_FULL["backbone"] + BOND_LIST_FULL[nuc.type]
        for (first, second) in bondPairs:
            if not (nuc.hasAtom(first) and nuc.hasAtom(second)):
                continue
            segments.append(nuc.atoms[first] + nuc.atoms[second])

        #link the previous nucleotide's O3' to this nucleotide's P
        if lastO3 is not None and nuc.hasAtom("P"):
            segments.append(lastO3 + nuc.atoms["P"])

        #remember this O3' for the next nucleotide
        lastO3 = nuc.atoms["O3'"] if nuc.hasAtom("O3'") else None

    #actually draw the bonds
    for segment in segments:
        to_generic_object_add_line(drawNum, ORIG_COORDS_COLOR, 6, *segment)
    graphics_draw()

    return drawNum
def cluster_and_display_waters(site_number, w_positions_np):
    """Cluster water positions with a Gaussian mixture and draw them in Coot.

    The number of mixture components is chosen by a penalised log-likelihood
    score (see optimize_n).  Two Coot generic objects are created: one with
    every water drawn as a point coloured by cluster, one with a star at each
    cluster mean.  Only the second object is switched on for display.

    Args:
        site_number: identifier used in the generic-object names.
        w_positions_np: indexable collection of 3-coordinate positions
            (NOTE(review): presumably an (n, 3) numpy array - confirm).

    Returns:
        (model, assignments): the fitted mixture model and a list of plain
        ints giving the cluster index of each input position.

    Raises:
        ValueError: if fewer than two positions are supplied, because no
            candidate component count can then be scored.
    """

    def emit(*parts):
        # A single-argument print() call behaves the same on Python 2 and 3
        # and yields the same text as a comma-separated print statement.
        print(" ".join(str(part) for part in parts))

    def optimize_n(positions_np, n_data):
        # Score 1..20 components, but only counts below the number of points.
        bic = {}
        for n in range(1, 21):
            if n < len(positions_np):
                # NOTE(review): mixture.GMM looks like the legacy scikit-learn
                # estimator; score() is summed as per-sample values - confirm.
                gmm = mixture.GMM(n_components=n, covariance_type="spherical", n_iter=20)
                gmm.fit(positions_np)
                score = sum(gmm.score(positions_np))
                lambda_c = 15  # 3 too few
                bic[n] = score - lambda_c * 0.5 * math.log(n_data) * n
        if not bic:
            # Previously surfaced as an opaque "max() arg is an empty sequence".
            raise ValueError(
                "cluster_and_display_waters needs at least 2 water positions "
                "to choose a number of clusters"
            )
        for key in bic:
            emit(" water bic", key, bic[key])
        # items() works on both Python 2 and 3; the highest score wins.
        return max(bic.items(), key=lambda x: x[1])[0]

    n_components = optimize_n(w_positions_np, len(w_positions_np))
    emit("optimize_n for water:::::::::::::", n_components)
    dpgmm = mixture.GMM(n_components, covariance_type="spherical", n_iter=40)
    dpgmm.fit(w_positions_np)
    cluster_assignments = dpgmm.predict(w_positions_np)

    # Colours repeat with period 13; indexing modulo the palette length gives
    # the same colour the old repeatedly-doubled list did, without building it.
    palette = (
        "green",
        "greentint",
        "sea",
        "yellow",
        "yellowtint",
        "aquamarine",
        "forestgreen",
        "goldenrod",
        "orangered",
        "orange",
        "cyan",
        "red",
        "blue",
    )

    means = dpgmm.means_
    cvs = dpgmm._get_covars()

    obj = coot.new_generic_object_number("CFC Site " + str(site_number) + " selected waters")
    for i, pos in enumerate(w_positions_np):
        cluster = cluster_assignments[i]
        mean = means[cluster]
        # reject spheres at the origin - (from DPGMM strangeness)
        d = mean[0] * mean[0] + mean[1] * mean[1] + mean[2] * mean[2]
        if d > 1.0:
            col = palette[cluster % len(palette)]
            coot.to_generic_object_add_point(obj, col, 10, pos[0], pos[1], pos[2])
        else:
            emit("reject prediction", i, "for cluster", cluster)
    # This first object is deliberately left undisplayed.

    obj = coot.new_generic_object_number("CFC Site " + str(site_number) + " water cluster means")
    for i, cv in enumerate(cvs):
        mean = means[i]
        d = mean[0] * mean[0] + mean[1] * mean[1] + mean[2] * mean[2]
        v, _ = linalg.eigh(cv)
        if d > 1.0:
            # The first eigenvalue of the covariance is passed as the star's
            # last argument (NOTE(review): presumably its size - confirm).
            cluster_star_obj(obj, mean, 2, v[0])
        else:
            emit("reject", mean, v)

    coot.set_display_generic_object(obj, 1)

    # Plain ints rather than numpy integers for the caller.
    cluster_assignments_as_list = [int(x) for x in cluster_assignments]

    return (dpgmm, cluster_assignments_as_list)
def find_the_sites(self, file_name_comp_id_list):
    """Read the given molecules, cluster their ligand atoms into sites.

    Every atom of every residue matching the requested comp-id is collected,
    the positions are clustered with a full-covariance Gaussian mixture,
    mergeable clusters are merged, and the per-atom site assignments are
    handed to Coot.  The result of that Coot call is stored in self.sites.

    Args:
        file_name_comp_id_list: iterable of entries whose element 0 is a
            coordinates file name and element 1 is the comp-id to look for.

    Returns:
        False if fewer than three atom positions were found (nothing is
        clustered and self.sites is not touched); otherwise None.
    """

    def emit(*parts):
        # A single-argument print() call behaves the same on Python 2 and 3
        # and yields the same text as a comma-separated print statement.
        print(" ".join(str(part) for part in parts))

    coords_with_spec = []

    for fn_comp_id in file_name_comp_id_list:
        fn = fn_comp_id[0]
        comp_id = fn_comp_id[1]
        imol = coot.handle_read_draw_molecule_with_recentre(fn, 0)

        # what are the residue specs for the given comp_ids?
        residue_specs = coot.get_residue_specs_in_mol_py(imol, comp_id)
        emit(fn, residue_specs)

        for spec in residue_specs:
            chain_id = rsu.residue_spec_to_chain_id(spec)
            res_no = rsu.residue_spec_to_res_no(spec)
            # NOTE(review): the insertion code is always passed as empty, not
            # taken from the spec - confirm this is intended.
            ins_code = ""
            res_info = coot.residue_info_py(imol, chain_id, res_no, ins_code)
            for atom in res_info:
                coords_with_spec.append([rsu.residue_atom_to_position(atom), imol, spec])

    # now cluster coords. There will be 1 (usually), maybe 2 possibly 3 sites
    if len(coords_with_spec) < 3:
        return False

    coords = [x[0] for x in coords_with_spec]
    positions_np = np.array(coords)
    n_components = self.optimize_n(positions_np, len(positions_np))
    emit("optimize_n for sites::::::::::::", n_components)
    dpgmm = mixture.GMM(n_components, covariance_type="full", n_iter=40)
    dpgmm.fit(positions_np)

    cluster_assignments = dpgmm.predict(positions_np)
    means = dpgmm.means_
    weights = dpgmm.weights_

    print(cluster_assignments)
    print(means)
    print(weights)
    emit("cluster_assignments", cluster_assignments)

    # which key (i.e. cluster index) has the most number of other clusters
    # that can be merged in?
    merge_map = self.find_mergeable_clusters(means, weights)

    # convert to a list of ints (not numpy.int64) because, on decoding the
    # Python->C++ object, a PyInt_Check is done on the site index and a
    # numpy.int64 fails that test
    new_cluster_assignments = [int(x) for x in self.merge_clusters(cluster_assignments, merge_map)]
    emit("new cluster_assignments", new_cluster_assignments)

    # each entry pairs a site index with [imol, spec]
    specs = [x[1:] for x in coords_with_spec]
    # Materialise the pairs: on Python 3 zip() is a lazy iterator, and the
    # binding presumably expects a real list (NOTE(review): confirm).
    cluster_assignments_with_specs = list(zip(new_cluster_assignments, specs))

    sites = coot.chemical_feature_clusters_accept_site_clusters_info_py(cluster_assignments_with_specs)

    # debug aid: a star per (pre-merge) cluster mean; the object is created
    # but not switched on for display
    o = coot.new_generic_object_number("site clusters")
    for mean in means:
        cluster_star_obj(o, mean, 2, 2)

    self.sites = sites
def cluster_and_display_chemical_features(site_number, type, chemical_features_list):
    """Cluster one type of chemical feature and draw the cluster means in Coot.

    Each feature position is padded with six extra points offset by 0.25
    along +/- each axis before fitting a spherical Gaussian mixture; the
    component count is chosen by a penalised log-likelihood score.

    Args:
        site_number: identifier used in the generic-object name.
        type: feature type name; "Donor", "Acceptor", "Hydrophobe" and
            "Aromatic" get their own colour, anything else is drawn grey.
        chemical_features_list: sequence of features whose element 0 is
            indexable as [0], [1], [2] (NOTE(review): presumably x, y, z).

    Returns:
        [type, features, means_as_list], where features pairs each input
        feature with its int cluster index and means_as_list holds the
        cluster means as 3-element lists; or False when the chosen number of
        components exceeds the number of features (e.g. an empty input).
    """

    def emit(*parts):
        # A single-argument print() call behaves the same on Python 2 and 3
        # and yields the same text as a comma-separated print statement.
        print(" ".join(str(part) for part in parts))

    def optimize_n(type, positions_np, n_data):
        emit("cluster_and_display_chemical_features.optimize_n called " "with n_data = ", n_data)
        # Aromatic features get a stiffer penalty per extra component.
        lambda_c = 20 if type == "Aromatic" else 15
        bic = {}
        for n in range(1, 11):
            if n < n_data:
                # NOTE(review): mixture.GMM looks like the legacy scikit-learn
                # estimator; score() is summed as per-sample values - confirm.
                gmm = mixture.GMM(n_components=n, covariance_type="spherical", n_iter=20)
                gmm.fit(positions_np)
                score = sum(gmm.score(positions_np))
                bic[n] = score - lambda_c * 0.5 * math.log(n_data) * n
        if len(bic) > 1:
            # items() works on both Python 2 and 3; the highest score wins.
            return max(bic.items(), key=lambda x: x[1])[0]
        return 1

    def analyse_bic(type, positions_np, n_data):
        # Diagnostic helper, not called from the main line.
        for n in range(1, 15):
            gmm = mixture.GMM(n_components=n, covariance_type="spherical", n_iter=20)
            gmm.fit(positions_np)
            score = sum(gmm.score(positions_np))
            lambda_c = 3000 if type == "Aromatic" else 3
            bic = score - lambda_c * 0.5 * n_data * n
            emit(type, len(positions_np), n, "converged?", gmm.converged_, "score:", score, "bic", bic)

    def get_cfc_col(type):
        colours = {
            "Donor": "blue",
            "Acceptor": "red",
            "Hydrophobe": "yellow",
            "Aromatic": "orange",
        }
        return colours.get(type, "grey")

    # --- main line ----

    # Real positions first, then six padding points around each of them.
    delta = 0.25
    ext_chemical_features_list = [item[0] for item in chemical_features_list]
    for item_b in chemical_features_list:
        item = item_b[0]
        ext_chemical_features_list.extend(
            [
                [item[0], item[1], item[2] + delta],
                [item[0], item[1], item[2] - delta],
                [item[0], item[1] + delta, item[2]],
                [item[0], item[1] - delta, item[2]],
                [item[0] + delta, item[1], item[2]],
                [item[0] - delta, item[1], item[2]],
            ]
        )

    positions_np = np.array(ext_chemical_features_list)

    n_data = len(chemical_features_list)
    n = 1
    if n_data > 1:
        n = optimize_n(type, positions_np, n_data)

    if n > n_data:
        # oops too many parameters for the model
        return False

    gmm = mixture.GMM(n_components=n, covariance_type="spherical", n_iter=20)
    gmm.fit(positions_np)
    emit(type, len(positions_np), n, "converged? ", gmm.converged_, "score:", sum(gmm.score(positions_np)))

    cluster_assignments = gmm.predict(positions_np)

    # The real features occupy the first n_data rows of positions_np, so
    # their assignments are the first n_data predictions.
    features = [[cf, int(cluster_assignments[i])] for i, cf in enumerate(chemical_features_list)]

    means_as_list = [[x[0], x[1], x[2]] for x in gmm.means_]

    obj_name = "CFC Site " + str(site_number) + " " + type + " pharmacophore-clusters"
    cfc_obj = coot.new_generic_object_number(obj_name)
    cfc_col = get_cfc_col(type)
    for mean in means_as_list:
        coot.to_generic_object_add_pentakis_dodecahedron(cfc_obj, cfc_col, 2.3, 0.1, mean[0], mean[1], mean[2])
    coot.set_display_generic_object(cfc_obj, 1)

    return [type, features, means_as_list]
def cluster_and_display_waters(site_number, w_positions_np):
    """Cluster water positions with a Gaussian mixture and draw them in Coot.

    The number of mixture components is chosen by a penalised log-likelihood
    score (see optimize_n).  Two Coot generic objects are created: one with
    every water drawn as a point coloured by cluster, one with a star at each
    cluster mean.  Only the second object is switched on for display.

    Args:
        site_number: identifier used in the generic-object names.
        w_positions_np: indexable collection of 3-coordinate positions
            (NOTE(review): presumably an (n, 3) numpy array - confirm).

    Returns:
        (model, assignments): the fitted mixture model and a list of plain
        ints giving the cluster index of each input position.

    Raises:
        ValueError: if fewer than two positions are supplied, because no
            candidate component count can then be scored.
    """

    def optimize_n(positions_np, n_data):
        # Score 1..20 components, but only counts below the number of points.
        bic = {}
        for n in range(1, 21):
            if n < len(positions_np):
                # NOTE(review): mixture.GMM looks like the legacy scikit-learn
                # estimator; score() is summed as per-sample values - confirm.
                gmm = mixture.GMM(n_components=n, covariance_type='spherical', n_iter=20)
                gmm.fit(positions_np)
                score = sum(gmm.score(positions_np))
                lambda_c = 15  # 3 too few
                bic[n] = score - lambda_c * 0.5 * math.log(n_data) * n
        if not bic:
            # Previously surfaced as an opaque "max() arg is an empty sequence".
            raise ValueError(
                "cluster_and_display_waters needs at least 2 water positions "
                "to choose a number of clusters"
            )
        for key in bic:
            print(" water bic", key, bic[key])
        # The highest penalised score wins.
        return max(bic.items(), key=lambda x: x[1])[0]

    n_components = optimize_n(w_positions_np, len(w_positions_np))
    print("optimize_n for water:::::::::::::", n_components)
    dpgmm = mixture.GMM(n_components, covariance_type='spherical', n_iter=40)
    dpgmm.fit(w_positions_np)
    cluster_assignments = dpgmm.predict(w_positions_np)

    # Colours repeat with period 13; indexing modulo the palette length gives
    # the same colour the old repeatedly-doubled list did, without building it.
    palette = (
        'green',
        'greentint',
        "sea",
        'yellow',
        "yellowtint",
        "aquamarine",
        "forestgreen",
        "goldenrod",
        "orangered",
        "orange",
        "cyan",
        'red',
        "blue",
    )

    means = dpgmm.means_
    cvs = dpgmm._get_covars()

    obj = coot.new_generic_object_number("CFC Site " + str(site_number) + " selected waters")
    for i, pos in enumerate(w_positions_np):
        cluster = cluster_assignments[i]
        mean = means[cluster]
        # reject spheres at the origin - (from DPGMM strangeness)
        d = mean[0] * mean[0] + mean[1] * mean[1] + mean[2] * mean[2]
        if d > 1.0:
            col = palette[cluster % len(palette)]
            coot.to_generic_object_add_point(obj, col, 10, pos[0], pos[1], pos[2])
        else:
            print("reject prediction", i, "for cluster", cluster)
    # This first object is deliberately left undisplayed.

    obj = coot.new_generic_object_number("CFC Site " + str(site_number) + " water cluster means")
    for i, cv in enumerate(cvs):
        mean = means[i]
        d = mean[0] * mean[0] + mean[1] * mean[1] + mean[2] * mean[2]
        v, _ = linalg.eigh(cv)
        if d > 1.0:
            # The first eigenvalue of the covariance is passed as the star's
            # last argument (NOTE(review): presumably its size - confirm).
            cluster_star_obj(obj, mean, 2, v[0])
        else:
            print("reject", mean, v)

    coot.set_display_generic_object(obj, 1)

    # Plain ints rather than numpy integers for the caller.
    cluster_assignments_as_list = [int(x) for x in cluster_assignments]

    return (dpgmm, cluster_assignments_as_list)
def find_the_sites(self, file_name_comp_id_list):
    """Read the given molecules, cluster their ligand atoms into sites.

    Every atom of every residue matching the requested comp-id is collected,
    the positions are clustered with a full-covariance Gaussian mixture,
    mergeable clusters are merged, and the per-atom site assignments are
    handed to Coot.  The result of that Coot call is stored in self.sites.

    Args:
        file_name_comp_id_list: iterable of entries whose element 0 is a
            coordinates file name and element 1 is the comp-id to look for.

    Returns:
        False if fewer than three atom positions were found (nothing is
        clustered and self.sites is not touched); otherwise None.
    """

    coords_with_spec = []

    for fn_comp_id in file_name_comp_id_list:
        fn = fn_comp_id[0]
        comp_id = fn_comp_id[1]
        imol = coot.handle_read_draw_molecule_with_recentre(fn, 0)

        # what are the residue specs for the given comp_ids?
        residue_specs = coot.get_residue_specs_in_mol_py(imol, comp_id)
        print(fn, residue_specs)

        for spec in residue_specs:
            chain_id = rsu.residue_spec_to_chain_id(spec)
            res_no = rsu.residue_spec_to_res_no(spec)
            # NOTE(review): the insertion code is always passed as empty, not
            # taken from the spec - confirm this is intended.
            ins_code = ''
            res_info = coot.residue_info_py(imol, chain_id, res_no, ins_code)
            for atom in res_info:
                coords_with_spec.append([rsu.residue_atom_to_position(atom), imol, spec])

    # now cluster coords. There will be 1 (usually), maybe 2 possibly 3 sites
    if len(coords_with_spec) < 3:
        return False

    coords = [x[0] for x in coords_with_spec]
    positions_np = np.array(coords)
    n_components = self.optimize_n(positions_np, len(positions_np))
    print("optimize_n for sites::::::::::::", n_components)
    dpgmm = mixture.GMM(n_components, covariance_type='full', n_iter=40)
    dpgmm.fit(positions_np)

    cluster_assignments = dpgmm.predict(positions_np)
    means = dpgmm.means_
    weights = dpgmm.weights_

    print(cluster_assignments)
    print(means)
    print(weights)
    print("cluster_assignments", cluster_assignments)

    # which key (i.e. cluster index) has the most number of other clusters
    # that can be merged in?
    merge_map = self.find_mergeable_clusters(means, weights)

    # convert to a list of ints (not numpy.int64) because, on decoding the
    # Python->C++ object, a PyInt_Check is done on the site index and a
    # numpy.int64 fails that test
    new_cluster_assignments = [int(x) for x in self.merge_clusters(cluster_assignments, merge_map)]
    print("new cluster_assignments", new_cluster_assignments)

    # each entry pairs a site index with [imol, spec]
    specs = [x[1:] for x in coords_with_spec]
    # Materialise the pairs: on Python 3 zip() is a lazy iterator, and the
    # binding presumably expects a real list (NOTE(review): confirm).
    cluster_assignments_with_specs = list(zip(new_cluster_assignments, specs))

    sites = coot.chemical_feature_clusters_accept_site_clusters_info_py(cluster_assignments_with_specs)

    # debug aid: a star per (pre-merge) cluster mean; the object is created
    # but not switched on for display
    o = coot.new_generic_object_number("site clusters")
    for mean in means:
        cluster_star_obj(o, mean, 2, 2)

    self.sites = sites
def cluster_and_display_chemical_features(site_number, type, chemical_features_list):
    """Cluster one type of chemical feature and draw the cluster means in Coot.

    Each feature position is padded with six extra points offset by 0.25
    along +/- each axis before fitting a spherical Gaussian mixture; the
    component count is chosen by a penalised log-likelihood score.

    Args:
        site_number: identifier used in the generic-object name.
        type: feature type name; "Donor", "Acceptor", "Hydrophobe" and
            "Aromatic" get their own colour, anything else is drawn grey.
        chemical_features_list: sequence of features whose element 0 is
            indexable as [0], [1], [2] (NOTE(review): presumably x, y, z).

    Returns:
        [type, features, means_as_list], where features pairs each input
        feature with its int cluster index and means_as_list holds the
        cluster means as 3-element lists; or False when the chosen number of
        components exceeds the number of features (e.g. an empty input).
    """

    def optimize_n(type, positions_np, n_data):
        print("cluster_and_display_chemical_features.optimize_n called "
              "with n_data = ", n_data)
        # Aromatic features get a stiffer penalty per extra component.
        penalty = 20 if type == 'Aromatic' else 15
        scores = {}
        for n_comp in range(1, 11):
            if n_comp >= n_data:
                continue
            trial = mixture.GMM(n_components=n_comp, covariance_type='spherical', n_iter=20)
            trial.fit(positions_np)
            log_likelihood = sum(trial.score(positions_np))
            scores[n_comp] = log_likelihood - penalty * 0.5 * math.log(n_data) * n_comp
        # With zero or one candidate there is nothing to choose between.
        if len(scores) <= 1:
            return 1
        return max(scores, key=scores.get)

    def analyse_bic(type, positions_np, n_data):
        # Diagnostic helper, not called from the main line.
        penalty = 3000 if type == 'Aromatic' else 3
        for n_comp in range(1, 15):
            trial = mixture.GMM(n_components=n_comp, covariance_type='spherical', n_iter=20)
            trial.fit(positions_np)
            log_likelihood = sum(trial.score(positions_np))
            bic = log_likelihood - penalty * 0.5 * n_data * n_comp
            print(type, len(positions_np), n_comp, "converged?", trial.converged_,
                  "score:", log_likelihood, "bic", bic)

    def get_cfc_col(type):
        palette = {
            "Donor": "blue",
            "Acceptor": "red",
            "Hydrophobe": "yellow",
            "Aromatic": "orange",
        }
        return palette.get(type, "grey")

    # --- main line ----

    # Real positions first, then six padding points around each of them.
    jitter = 0.25
    cloud = [feature[0] for feature in chemical_features_list]
    for feature in chemical_features_list:
        centre = feature[0]
        cloud.extend([
            [centre[0], centre[1], centre[2] + jitter],
            [centre[0], centre[1], centre[2] - jitter],
            [centre[0], centre[1] + jitter, centre[2]],
            [centre[0], centre[1] - jitter, centre[2]],
            [centre[0] + jitter, centre[1], centre[2]],
            [centre[0] - jitter, centre[1], centre[2]],
        ])
    positions_np = np.array(cloud)

    n_data = len(chemical_features_list)
    n = optimize_n(type, positions_np, n_data) if n_data > 1 else 1

    if n > len(chemical_features_list):
        # oops too many parameters for the model
        return False

    gmm = mixture.GMM(n_components=n, covariance_type='spherical', n_iter=20)
    gmm.fit(positions_np)
    print(type, len(positions_np), n, "converged? ", gmm.converged_, "score:",
          sum(gmm.score(positions_np)))

    cluster_assignments = gmm.predict(positions_np)

    # The real features occupy the first n_data rows of positions_np, so
    # their assignments are the first n_data predictions.
    features = [
        [cf, int(cluster_assignments[idx])]
        for idx, cf in enumerate(chemical_features_list)
    ]

    means_as_list = [[m[0], m[1], m[2]] for m in gmm.means_]

    obj_name = "CFC Site " + str(site_number) + " " + type + " pharmacophore-clusters"
    cfc_obj = coot.new_generic_object_number(obj_name)
    cfc_col = get_cfc_col(type)
    for centre in means_as_list:
        coot.to_generic_object_add_pentakis_dodecahedron(
            cfc_obj, cfc_col, 2.3, 0.1, centre[0], centre[1], centre[2])
    coot.set_display_generic_object(cfc_obj, 1)

    return [type, features, means_as_list]