def _run(self, simulation, groups, crystals="all", catt=None, suffix=""):
    """
    Assign each selected crystal to its group and store the assignment as a
    one-hot classification CV (``crystal._cvs[self._name + suffix]``), then
    write a ``Groups_<cv>_<sim>.dat`` summary file mapping every group name
    to its crystal IDs.

    :param simulation: Simulation object providing ``_crystals``, ``_path_output`` and ``_name``
    :param groups: dict mapping a group label to the list of crystal names in that group
    :param crystals: "all", "centers" or an explicit list of crystal names to select
    :param catt: optional crystal-attribute filter (dict) passed to get_list_crystals
    :param suffix: string appended to the CV name used as the storage key
    :return: None
    """
    # Widen pandas display limits so later __str__ dumps are not truncated.
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pd.set_option('display.expand_frame_repr', False)
    pd.set_option('display.max_colwidth', None)
    pd.set_option('display.max_seq_items', None)

    list_crystals = get_list_crystals(simulation._crystals, crystals, catt)

    # Template one-hot vector: one zero-count per group.
    cvg = {group: 0 for group in groups}
    for crystal in list_crystals:
        # deepcopy so each crystal gets its own independent counter dict.
        crystal._cvs[self._name + suffix] = copy.deepcopy(cvg)
        for group in groups:
            if crystal._name in groups[group]:
                crystal._cvs[self._name + suffix][group] += 1
                break  # a crystal belongs to at most one group

    # Context manager guarantees the file is closed even if a write fails
    # (the original left the handle open on error).
    with open("{}/Groups_{}_{}.dat".format(simulation._path_output, self._name,
                                           simulation._name), "w") as file_hd:
        file_hd.write("# Group_name Crystal_IDs\n")
        for group in groups:
            file_hd.write("{:<25}: {}\n".format(str(group), groups[group]))
def generate_input(self, simulation: Union[MolecularDynamics, Metadynamics],
                   crystals="all", catt=None):
    """
    Generate the plumed input files. If the catt option is used, only
    crystals with the specified attribute are used. Attributes must be
    specified in the form of a python dict, meaning
    catt={"AttributeLabel": "AttributeValue"}.
    NB: The <simulation>.mdrun_options attribute is modified to include
    "-plumed plumed_<name>.dat"

    :param simulation: Simulation object
    :param crystals: It can be either "all", use all non-melted Crystal objects from the previous
                     simulation or "centers", use only cluster centers from the previous
                     simulation. Alternatively, you can select a specific subset of crystals by
                     listing crystal names.
    :param catt: Use crystal attributes to select the crystal list
    :return: None
    """
    list_crystals = get_list_crystals(simulation._crystals, crystals, attributes=catt)

    # If the user already passes a plumed file via mdrun options, offer to
    # merge its contents into the file generated here.
    add_plumed_file = False
    file_plumed = None
    if "-plumed" in simulation._mdrun_options:
        add_plumed_file = input("A plumed file has been found in the mdrun options. \n"
                                "Do you want to add it the plumed input (NB: if not, it will be ignored for this "
                                "simulation)? [y/n] ")
        if add_plumed_file.lower() in ("yes", "y", "true"):
            add_plumed_file = True
            # Scan the options pairwise to find the filename after "-plumed".
            it = iter(simulation._mdrun_options.split())
            for i in it:
                if i == "-plumed":
                    file_plumed = next(it)
        else:
            add_plumed_file = False

    # Point mdrun at the file written below.
    simulation._mdrun_options = " -plumed plumed_{}.dat ".format(self._name)
    for crystal in list_crystals:
        txt = self._metad()
        # BUG FIX: the original wrote to the literal path "plumed_{}" —
        # the template was never formatted, so mdrun (configured above to
        # read plumed_<name>.dat) could not find the file.
        with open(crystal._path + "plumed_{}.dat".format(self._name), "w") as f:
            f.write(txt)
            if add_plumed_file:
                # Append the user's pre-existing plumed file, if present
                # in this crystal's directory.
                if os.path.exists(crystal._path + file_plumed):
                    with open(crystal._path + file_plumed, "r") as f2:
                        f.write("".join(f2.readlines()))
def run(self, simulation: Union[EnergyMinimization, CellRelaxation, MolecularDynamics, Metadynamics],
        crystals="all", catt=None, suffix=""):
    """
    Creates groups from the crystal attributes in the simulation object.

    :param simulation: Simulation Object (EnergyMinimization, CellRelaxation, MolecularDynamics,
                       Metadynamics)
    :param crystals: It can be either "all", use all non-melted Crystal objects from the previous
                     simulation or "centers", use only cluster centers from the previous
                     simulation. Alternatively, you can select a specific subset of crystals by
                     listing crystal names.
    :param catt: Use crystal attributes to select the crystal list
    :param suffix: TODO
    :return: None
    """
    # Relax pandas display limits so downstream dumps are complete.
    for option in ('display.max_columns', 'display.max_rows',
                   'display.max_colwidth', 'display.max_seq_items'):
        pd.set_option(option, None)
    pd.set_option('display.expand_frame_repr', False)

    list_crystals = get_list_crystals(simulation._crystals, crystals, catt)

    # Every crystal must carry the grouping attribute; abort otherwise.
    if any(self._attribute not in crystal._attributes for crystal in list_crystals):
        print(f"Error: some of the Crystals do not have attribute '{self._attribute}'")
        exit()

    # Bucket crystal names by the value of the grouping attribute.
    groups = {}
    for crystal in list_crystals:
        groups.setdefault(crystal._attributes[self._attribute], []).append(crystal._name)

    self._run(simulation, groups, crystals, catt, suffix)
def run(self, simulation: Union[EnergyMinimization, CellRelaxation, MolecularDynamics, Metadynamics],
        crystals="all", catt=None, suffix=""):
    """
    Creates groups from the crystal distributions in the simulation object.

    Two grouping methods are supported:
      - "groups": bin each crystal's normalized distribution into the
        precomputed ``self._group_bins`` regions and group crystals that
        populate the same set of regions above ``self._group_threshold``.
      - "similarity": group crystals whose pairwise Hellinger distance is
        below ``self._group_threshold`` by connected components (BFS over
        the thresholded adjacency matrix).

    :param simulation: Simulation Object (EnergyMinimization, CellRelaxation, MolecularDynamics,
                       Metadynamics)
    :param crystals: It can be either "all", use all non-melted Crystal objects from the previous
                     simulation or "centers", use only cluster centers from the previous
                     simulation. Alternatively, you can select a specific subset of crystals by
                     listing crystal names.
    :param catt: Use crystal attributes to select the crystal list
    :param suffix: TODO
    :return: None
    """
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pd.set_option('display.expand_frame_repr', False)
    pd.set_option('display.max_colwidth', None)
    pd.set_option('display.max_seq_items', None)

    groups = {}
    list_crystals = get_list_crystals(simulation._crystals, crystals, catt)

    if self._grouping_method == "groups":
        # One list of bin keys per dimension; keys are (dim, label) tuples.
        combinations: list = []
        for i in range(self._D):
            combinations.append([c for c in self._group_bins.keys() if c[0] == i])
        # One column per region combination plus a trailing "Others" column
        # (left NaN, hence 0 after the threshold comparison below).
        # noinspection PyTypeChecker
        dataset = np.full((len(list_crystals), len(list(its.product(*combinations))) + 1), np.nan)
        index = []
        for cidx in range(len(list_crystals)):
            crystal = list_crystals[cidx]
            index.append(crystal._name)
            # Normalize the distribution so bin sums are comparable.
            dist = crystal._cvs[self._dist_cv._name + suffix] / np.sum(
                crystal._cvs[self._dist_cv._name + suffix])
            c = 0
            for i in its.product(*combinations):
                dataset[cidx, c] = np.sum(dist[np.ix_(self._group_bins[i[0]],
                                                      self._group_bins[i[1]])])
                c += 1
        # Binarize: 1 where the region holds more than the threshold fraction.
        # noinspection PyTypeChecker
        dataset = pd.DataFrame(np.where(dataset > self._group_threshold, 1, 0), index=index,
                               columns=[(i[0][1], i[1][1]) for i in its.product(*combinations)]
                               + ["Others"])
        groups = dataset.groupby(dataset.columns.to_list()).groups
        # noinspection PyUnresolvedReferences
        groups = {k: groups[k].to_list()
                  for k in sorted(groups.keys(), key=lambda x: np.sum(x))}

    elif self._grouping_method == "similarity":
        from scipy.sparse import csr_matrix
        from scipy.sparse.csgraph import breadth_first_order
        index = [crystal._name for crystal in list_crystals]
        dmat = pd.DataFrame(np.zeros((len(index), len(index))), columns=index, index=index)
        # BUG FIX: the original sized the progress bar and the inner loop with
        # len(crystals) — the length of the *selector* argument (e.g. the
        # 3-character string "all"), not the number of selected crystals.
        n_crystals = len(list_crystals)
        bar = progressbar.ProgressBar(maxval=int(n_crystals * (n_crystals - 1) / 2)).start()
        nbar = 1
        for i in range(n_crystals - 1):
            # deepcopy: RDF distributions may be truncated below.
            di = copy.deepcopy(list_crystals[i]._cvs[self._dist_cv._name + suffix])
            ci = list_crystals[i]._name
            for j in range(i + 1, n_crystals):
                dj = copy.deepcopy(list_crystals[j]._cvs[self._dist_cv._name + suffix])
                cj = list_crystals[j]._name
                bar.update(nbar)
                nbar += 1
                if self._dist_cv._type == "Radial Distribution Function":
                    # RDFs can differ in length: compare on the common prefix.
                    if len(di) > len(dj):
                        hd = hellinger(di.copy()[:len(dj)], dj.copy(), self._int_type)
                    else:
                        hd = hellinger(di.copy(), dj.copy()[:len(di)], self._int_type)
                else:
                    hd = hellinger(di.copy(), dj.copy(), self._int_type)
                dmat.at[ci, cj] = dmat.at[cj, ci] = hd
        bar.finish()

        # Adjacency matrix: 1 where two crystals are "similar enough".
        dmat = pd.DataFrame(np.where(dmat.values < self._group_threshold, 1., 0.),
                            columns=index, index=index)
        graph = csr_matrix(dmat)
        removed = []
        for c in range(len(dmat.index)):
            if dmat.index.to_list()[c] in removed:
                continue
            # Connected component reachable from c becomes one group.
            bfs = breadth_first_order(graph, c, False, False)
            group_index = [index[i] for i in range(len(index)) if i in bfs]
            removed = removed + group_index
            groups[group_index[0]] = group_index

    self._run(simulation, groups, crystals, catt, suffix)
def run(self, simulation, crystals="all", group_threshold: float = 0.8, catt=None, suffix="",
        path_output="", _check=True):
    """
    Cluster the crystals of a simulation.

    Pipeline (as implemented below):
      1. Sort crystals into groups according to the classification CVs in
         ``self._cvp`` (or a single "all" group when there are none).
      2. For every distribution CV, build a per-group Hellinger distance
         matrix, normalize all matrices by the largest observed distance,
         and combine them into one "Distance Matrix" per group.
      3. Pick the cutoff distance ``d_c`` as the ``self._d_c_fraction``
         quantile of all pairwise distances.
      4. Cluster each group (FSFDP for >2 structures, trivial handling for
         0/1/2), optionally moving each cluster center to the lowest-energy
         member, tag every crystal with its cluster center (``_state``),
         and write the report files.

    :param simulation: Simulation object (EnergyMinimization, CellRelaxation,
                       MolecularDynamics or Metadynamics)
    :param crystals: "all", "centers" or an explicit list of crystal names
    :param group_threshold: threshold passed to self._sort_crystal for group assignment
    :param catt: optional crystal-attribute filter (dict)
    :param suffix: suffix appended to CV names when reading crystal._cvs
    :param path_output: output directory; defaults to simulation._path_output
    :param _check: skip-completion-check flag — see NOTE below
    :return: None
    """
    from PyPol.gromacs import EnergyMinimization, MolecularDynamics, CellRelaxation, Metadynamics
    # Only warns; execution continues even with an unsupported object.
    if type(simulation) not in [EnergyMinimization, MolecularDynamics, CellRelaxation,
                                Metadynamics]:
        print("Error: simulation object not suitable for clustering analysis")
    # TODO Simplify script. Rewrite crystal sorting in groups and clustering
    #  ---> too complicated and probably inefficient to use pandas Dataframe in this context
    #  Use crystal attributes or dict?
    # Widen pandas display limits so __str__ dumps written to files are complete.
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pd.set_option('display.expand_frame_repr', False)
    pd.set_option('display.max_colwidth', None)
    pd.set_option('display.max_seq_items', None)
    simulation._clusters = {}
    simulation._cluster_data = {}
    list_crystals = get_list_crystals(simulation._crystals, crystals, catt)
    # NOTE(review): with the default _check=True this warning can never fire
    # ("not _check" is False) — presumably intended as "if not completed and
    # checking is enabled"; confirm against callers.
    if not simulation._completed and not _check:
        print("Simulation {} is not completed yet. Run simulation.get_results() to check termination and import "
              "results.".format(simulation._name))

    if not path_output:
        path_output = simulation._path_output
        path_output_data = simulation._path_output + str(self._name) + f"{suffix}_data/"
    else:
        path_output_data = path_output + str(self._name) + f"{suffix}_data/"
    # NOTE(review): the data subdirectory is created before its parent; this
    # fails if a custom path_output does not already exist — confirm intended.
    if not os.path.exists(path_output_data):
        os.mkdir(path_output_data)
    if not os.path.exists(path_output):
        os.mkdir(path_output)
    # Accumulator for all pairwise distances, used later to pick d_c.
    d_c = np.array([])

    # ---- 1. Sort crystals into groups (classification CVs) -------------------
    group_options = []
    group_names = []
    for cv in self._cvp:
        if cv.clustering_type == "classification":
            # All crystals share the same label set; read it from the first one.
            crystal = list_crystals[0]
            group_options.append(list(crystal._cvs[cv._name + suffix].keys()))
            group_names.append(cv._name)
    if group_options:
        if len(group_names) == 1:
            # Single classification CV: one row per label plus "Others".
            combinations = group_options[0] + [None]
            index = [str(i) for i in range(len(combinations) - 1)] + ["Others"]
            combinations = pd.concat((pd.Series(combinations, name=group_names[0], index=index),
                                      pd.Series([0 for _ in range(len(combinations))],
                                                name="Number of structures", dtype=int,
                                                index=index),
                                      pd.Series([[] for _ in range(len(combinations))],
                                                name="Structures", index=index)),
                                     axis=1)
        else:
            # Multiple classification CVs: one row per label combination plus "Others".
            combinations = list(its.product(*group_options)) + \
                           [tuple([None for _ in range(len(group_names))])]
            index = [str(i) for i in range(len(combinations) - 1)] + ["Others"]
            combinations = pd.concat(
                (pd.DataFrame(combinations, columns=group_names, index=index),
                 pd.Series([0 for _ in range(len(combinations))],
                           name="Number of structures", dtype=int, index=index),
                 pd.Series([[] for _ in range(len(combinations))],
                           name="Structures", index=index)),
                axis=1)
        combinations.index.name = "Combinations"
        for crystal in list_crystals:
            combinations = self._sort_crystal(crystal, combinations, group_threshold, suffix)
    else:
        # No classification CVs: every crystal goes into one "all" group.
        combinations = pd.DataFrame([[len(list_crystals), list_crystals]],
                                    columns=["Number of structures", "Structures"],
                                    dtype=None, index=["all"])
        combinations.index.name = "Combinations"
        # for crystal in list_crystals:
        #     combinations.loc["all", "Structures"].append(crystal)
        #     combinations.loc["all", "Number of structures"] += 1

    # One square zero matrix per group, sized by its structure count.
    slist = [np.full((combinations.loc[i, "Number of structures"],
                      combinations.loc[i, "Number of structures"]), 0.0)
             for i in combinations.index]
    combinations = pd.concat((combinations,
                              pd.Series(slist, name="Distance Matrix",
                                        index=combinations.index)),
                             axis=1)

    # ---- 2. Distance matrix per distribution CV ------------------------------
    distributions = [cv for cv in self._cvp if cv.clustering_type != "classification"]
    n_factors = {}  # largest Hellinger distance seen per CV (normalization factor)
    for cv in distributions:
        # One zero matrix per group, dedicated to this CV.
        combinations[cv._name + suffix] = pd.Series(
            copy.deepcopy(combinations["Distance Matrix"].to_dict()),
            index=combinations.index)
        n_factors[cv._name + suffix] = 0.

        print("\nCV: {}".format(cv._name))
        bar = progressbar.ProgressBar(maxval=len(list_crystals)).start()
        nbar = 1

        for index in combinations.index:
            if combinations.at[index, "Number of structures"] > 1:
                # NOTE(review): shadows the `crystals` parameter (and `index`
                # shadows the list built earlier) — works, but fragile.
                crystals = combinations.at[index, "Structures"]
                for i in range(len(crystals) - 1):
                    di = crystals[i]._cvs[cv._name + suffix]
                    bar.update(nbar)
                    nbar += 1
                    for j in range(i + 1, len(crystals)):
                        dj = crystals[j]._cvs[cv._name + suffix]
                        # Mismatched shapes: truncate both to the common
                        # leading region before comparing.
                        if di.shape != dj.shape:
                            di = di.copy()[tuple(map(slice, dj.shape))]
                            dj = dj.copy()[tuple(map(slice, di.shape))]
                        hd = hellinger(di.copy(), dj.copy(), self._int_type)
                        combinations.loc[index, cv._name + suffix][i, j] = \
                            combinations.loc[index, cv._name + suffix][j, i] = hd
                        # Track the global maximum for normalization.
                        if hd > n_factors[cv._name + suffix]:
                            n_factors[cv._name + suffix] = hd
        bar.finish()

    # ---- Normalize each CV's distances to [0, 1] -----------------------------
    print("Normalization...", end="")
    normalization = []
    for cv in distributions:
        normalization.append(1. / n_factors[cv._name + suffix])
        for index in combinations.index:
            if combinations.at[index, "Number of structures"] > 1:
                combinations.at[index, cv._name + suffix] /= n_factors[cv._name + suffix]

    print("done\nGenerating Distance Matrix...", end="")
    # Combine per-CV matrices into one Euclidean distance matrix per group,
    # rescaled by the norm of the normalization vector.
    normalization = np.linalg.norm(np.array(normalization))
    for index in combinations.index:
        if combinations.at[index, "Number of structures"] > 1:
            if len(distributions) > 1:
                combinations.at[index, "Distance Matrix"][:, :] = np.linalg.norm(
                    np.dstack(tuple([k for k in combinations.loc[
                        index, [cv._name + suffix for cv in distributions]]])),
                    axis=2) / normalization
            else:
                combinations.at[index, "Distance Matrix"][:, :] = combinations.loc[
                    index, distributions[0]._name + suffix] / \
                    normalization
            # Collect the upper-triangle distances for the d_c quantile.
            d_c = np.append(d_c, combinations.at[index, "Distance Matrix"][np.triu_indices(
                combinations.at[index, "Distance Matrix"].shape[0], 1)])

            # combinations.at[index, "Distance Matrix"][:, :] = np.linalg.norm(
            #     np.dstack(set([k for k in combinations.loc[index, [cv._name for cv in distributions]]])),
            #     axis=2) / normalization
            # for i in range(combinations.at[index, "Number of structures"] - 1):
            #     for j in range(i + 1, combinations.at[index, "Number of structures"]):
            #         dist_ij = np.linalg.norm([k[i, j] for k in
            #                                   combinations.loc[index, [cv._name for cv in distributions]]])
            #         combinations.at[index, "Distance Matrix"][i, j] = \
            #             combinations.at[index, "Distance Matrix"][j, i] = dist_ij / normalization
            #         d_c.append(dist_ij)

    # Convert every matrix (from "Distance Matrix" onward) to a labelled
    # DataFrame and dump it to the data directory.
    for index in combinations.index:
        if combinations.at[index, "Number of structures"] > 1:
            idx = [i._name for i in combinations.at[index, "Structures"]]
            for mat in combinations.loc[index, "Distance Matrix":].index:
                combinations.at[index, mat] = pd.DataFrame(combinations.at[index, mat],
                                                           index=idx, columns=idx)
                combinations.at[index, mat].to_csv(path_output_data + mat.replace(" ", "") +
                                                   "_" + index + ".dat")
                # with open(path_output + mat.replace(" ", "") + "_" + index + ".dat", 'w') as fo:
                #     fo.write(combinations.loc[index, mat].__str__())

    print("done\nSaving Distance Matrix...", end="")
    # Concatenate the per-group DataFrames (ndarray entries belong to groups
    # with <=1 structure and are skipped) and save csv + heat-map png per CV.
    for i in combinations.loc[:, "Distance Matrix":].columns:
        total = pd.concat([m for m in combinations.loc[:, i] if not isinstance(m, np.ndarray)])
        total.to_csv(path_output + str(self._name) + "_" + i.replace(" ", "") + ".dat")
        plt.imshow(total, interpolation="nearest", cmap="viridis")
        plt.colorbar()
        plt.tight_layout()
        plt.savefig(path_output + str(self._name) + "_" + i.replace(" ", "") + ".png", dpi=300)
        plt.close('all')

    # Summary of group composition + normalization factors.
    list_crys = [[i._name for i in row["Structures"]] for index, row in combinations.iterrows()]
    file_output = pd.concat((combinations.loc[:, :"Number of structures"],
                             pd.Series(list_crys, name="IDs", index=combinations.index)), axis=1)
    with open(path_output + str(self._name) + "_Groups.dat", 'w') as fo:
        fo.write("Normalization Factors:\n")
        for n in n_factors.keys():
            fo.write("{:15}: {:<1.3f}\n".format(n, n_factors[n]))
        fo.write(file_output.__str__())

    # ---- 3. Cutoff distance: quantile of all pairwise distances --------------
    d_c = np.sort(np.array(d_c))[int(float(len(d_c)) * self._d_c_fraction)]

    # ---- 4. Clustering per group ---------------------------------------------
    print("done\nClustering...", end="")
    changes_string = ""
    with open(path_output + str(self._name) + "_FSFDP.dat", 'w') as fo:
        fo.write("# FSFDP parameters for every group:\n")
    for index in combinations.index:
        if int(combinations.at[index, "Number of structures"]) == 0:
            continue
        elif int(combinations.at[index, "Number of structures"]) == 1:
            # Singleton group: the lone structure is its own cluster center.
            nc = combinations.at[index, "Structures"][0]._name
            columns = ["rho", "sigma", "NN", "cluster", "distance"]
            simulation._cluster_data[index] = pd.DataFrame([[0, 0, pd.NA, nc, 0]], index=[nc],
                                                           columns=columns)
            simulation._clusters[index] = {nc: [nc]}
        elif int(combinations.at[index, "Number of structures"]) == 2:
            # Pair: one or two clusters depending on whether the pair
            # distance exceeds d_c.
            nc1 = combinations.at[index, "Structures"][0]._name
            nc2 = combinations.at[index, "Structures"][1]._name
            columns = ["rho", "sigma", "NN", "cluster", "distance"]
            d_12 = combinations.at[index, "Distance Matrix"].values[0, 1]
            if d_12 > d_c:
                simulation._cluster_data[index] = pd.DataFrame(
                    [[0, 0, nc2, nc1, 0], [0, 0, nc1, nc2, 0]],
                    index=[nc1, nc2], columns=columns)
                simulation._clusters[index] = {nc1: [nc1], nc2: [nc2]}
            else:
                simulation._cluster_data[index] = pd.DataFrame(
                    [[0, 0, nc2, nc1, 0], [0, 0, nc1, nc1, d_12]],
                    index=[nc1, nc2], columns=columns)
                simulation._clusters[index] = {nc1: [nc1, nc2]}
        elif int(combinations.at[index, "Number of structures"]) > 2:
            if self._algorithm == "fsfdp":
                # Fast Search and Find of Density Peaks clustering.
                simulation._cluster_data[index], sc = FSFDP(
                    combinations.at[index, "Distance Matrix"],
                    kernel=self._kernel,
                    d_c=d_c,
                    d_c_neighbors_fraction=self._d_c_fraction,
                    sigma_cutoff=self._sigma_cutoff)
                _save_decision_graph(
                    simulation._cluster_data[index].loc[:, "rho"].values,
                    simulation._cluster_data[index].loc[:, "sigma"].values,
                    sigma_cutoff=sc,
                    path=path_output_data + "Decision_graph_" + str(index) + ".png")
                with open(path_output_data + "FSFDP_" + str(index) + ".dat", 'w') as fo:
                    fo.write(simulation._cluster_data[index].__str__())
                with open(path_output + str(self._name) + "_FSFDP.dat", 'a') as fo:
                    fo.write("\n# Group {}\n".format(str(index)))
                    fo.write(simulation._cluster_data[index].__str__())
            # Map each cluster label to the list of member structure names.
            simulation._clusters[index] = {
                k: simulation._cluster_data[index].index[
                    simulation._cluster_data[index]["cluster"] == k].tolist()
                for k in list(simulation._cluster_data[index]["cluster"].unique())}
        if self._centers.lower() == "energy":
            # Re-center each cluster on its lowest-energy member.
            new_clusters = copy.deepcopy(simulation._clusters[index])
            energies = {k._name: k._energy for k in combinations.at[index, "Structures"]}
            for center in simulation._clusters[index].keys():
                changes = [center, None]
                emin = energies[center]
                for crystal in simulation._clusters[index][center]:
                    if energies[crystal] < emin:
                        changes[1] = crystal
                        emin = energies[crystal]
                if changes[1]:
                    new_clusters[changes[1]] = new_clusters.pop(changes[0])
                    changes_string += "{:>25} ---> {:25}\n".format(changes[0], changes[1])
            simulation._clusters[index] = new_clusters
        # Tag each crystal with the name of its cluster center.
        for crystal in combinations.at[index, "Structures"]:
            for cc in simulation._clusters[index].keys():
                if crystal._name in simulation._clusters[index][cc]:
                    crystal._state = cc
                    break

    # Flatten the per-group cluster dicts into one summary table.
    cluster_groups = [g for g in simulation._clusters.keys()
                      for _ in simulation._clusters[g].keys()]
    simulation._clusters = {k: v for g in simulation._clusters.keys()
                            for k, v in simulation._clusters[g].items()}
    simulation._clusters = pd.concat((
        pd.Series(data=[len(simulation._clusters[x]) for x in simulation._clusters.keys()],
                  index=simulation._clusters.keys(), name="Number of Structures"),
        pd.Series(data=cluster_groups, index=simulation._clusters.keys(), name="Group"),
        pd.Series(data=[", ".join(str(y) for y in simulation._clusters[x])
                        for x in simulation._clusters.keys()],
                  index=simulation._clusters.keys(), name="Structures")),
        axis=1).sort_values(by="Number of Structures", ascending=False)

    with open(path_output + str(self._name) + "_Clusters.dat", 'w') as fo:
        if changes_string:
            fo.write("Cluster centers changed according to potential energy:\n")
            fo.write(changes_string)
        fo.write(simulation._clusters.__str__())

    # Distance-from-center report: every crystal vs its cluster center.
    with open(path_output + str(self._name) + "_DFC.dat", 'w') as fo:
        if changes_string:
            fo.write("Cluster centers changed according to potential energy:\n")
            fo.write(changes_string)
        total = pd.concat([m for m in combinations.loc[:, "Distance Matrix"]
                           if not isinstance(m, np.ndarray)])
        index = []
        centers = []
        distances = []
        for crystal in get_list_crystals(simulation._crystals, crystals=total.index.to_list()):
            index.append(crystal._name)
            centers.append(crystal._state)
            distances.append(total.at[crystal._name, crystal._state])
        dfc = pd.DataFrame({"Center": centers, "Distance": distances},
                           index=index).sort_values(by="Distance")
        fo.write(dfc.__str__())
    print("done")