Code Example #1
    def _run(self, simulation, groups, crystals="all", catt=None, suffix=""):

        pd.set_option('display.max_columns', None)
        pd.set_option('display.max_rows', None)
        pd.set_option('display.expand_frame_repr', False)
        pd.set_option('display.max_colwidth', None)
        pd.set_option('display.max_seq_items', None)

        list_crystals = get_list_crystals(simulation._crystals, crystals, catt)

        cvg = {}
        for i in groups.keys():
            cvg[i] = 0

        for crystal in list_crystals:
            crystal._cvs[self._name + suffix] = copy.deepcopy(cvg)
            for group in groups.keys():
                if crystal._name in groups[group]:
                    crystal._cvs[self._name + suffix][group] += 1
                    break

        file_hd = open(
            "{}/Groups_{}_{}.dat".format(simulation._path_output, self._name,
                                         simulation._name), "w")
        file_hd.write("# Group_name             Crystal_IDs\n")
        for group in groups.keys():
            file_hd.write("{:<25}: {}\n".format(str(group), groups[group]))
        file_hd.close()
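
A minimal, self-contained sketch of the bookkeeping _run performs above, using hypothetical group and crystal names: each crystal receives a deep copy of the zeroed group dictionary, and the group it belongs to is flagged with 1 before the membership table is written to the Groups_*.dat file.

import copy

# Hypothetical groups and crystal names, purely for illustration.
groups = {"form_I": ["crystal_01", "crystal_02"], "form_II": ["crystal_03"]}
crystal_names = ["crystal_01", "crystal_02", "crystal_03"]

cvg = {group: 0 for group in groups}            # template: every group starts at 0
membership = {}
for name in crystal_names:
    membership[name] = copy.deepcopy(cvg)       # one counter dict per crystal
    for group, members in groups.items():
        if name in members:
            membership[name][group] += 1        # flag the matching group
            break

print(membership["crystal_01"])                 # {'form_I': 1, 'form_II': 0}
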
Code Example #2
    def generate_input(self,
                       simulation: Union[MolecularDynamics, Metadynamics],
                       crystals="all",
                       catt=None):
        """
        Generate the plumed input files. This is particularly useful for crystals with tilted boxes.
        If the catt option is used, only crystals with the specified attribute are used.
        Attributes must be specified in the form of a python dict, meaning catt={"AttributeLabel": "AttributeValue"}.
        NB: The <simulation>.mdrun_options attribute is modified to include "-plumed plumed_<name>.dat"
        :param catt: Use crystal attributes to select the crystal list
        :param simulation: Simulation object
        :param crystals: It can be either "all", to use all non-melted Crystal objects from the previous simulation,
                         or "centers", to use only cluster centers from the previous simulation. Alternatively, you
                         can select a specific subset of crystals by listing their names.
        :return:
        """
        list_crystals = get_list_crystals(simulation._crystals,
                                          crystals,
                                          attributes=catt)
        add_plumed_file = False
        file_plumed = None
        if "-plumed" in simulation._mdrun_options:
            add_plumed_file = input(
                "A plumed file has been found in the mdrun options. \n"
                "Do you want to add it to the plumed input (NB: if not, it will be ignored for this "
                "simulation)? [y/n] ")
            if add_plumed_file.lower() in ("yes", "y", "true"):
                add_plumed_file = True
                it = iter(simulation._mdrun_options.split())
                for i in it:
                    if i == "-plumed":
                        file_plumed = next(it)
            else:
                add_plumed_file = False
        simulation._mdrun_options = " -plumed plumed_{}.dat ".format(
            self._name)
        for crystal in list_crystals:
            txt = self._metad()
            f = open(crystal._path + "plumed_{}.dat".format(self._name), "w")
            f.write(txt)
            if add_plumed_file:
                if os.path.exists(crystal._path + file_plumed):
                    f2 = open(crystal._path + file_plumed, "r")
                    f.write("".join(f2.readlines()))
                    f2.close()
            f.close()
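
The catt argument described in the docstring is a plain dict of attribute/value pairs. As a rough sketch of how such a filter can behave (this is not the actual get_list_crystals implementation; ToyCrystal and filter_by_attributes are hypothetical names), a crystal is kept only when every requested attribute matches:

from dataclasses import dataclass, field

@dataclass
class ToyCrystal:
    """Stand-in for a PyPol Crystal, reduced to the fields used here."""
    _name: str
    _attributes: dict = field(default_factory=dict)

def filter_by_attributes(crystal_list, catt=None):
    """Keep the crystals whose attributes match every key/value pair in catt."""
    if not catt:
        return list(crystal_list)
    return [c for c in crystal_list
            if all(c._attributes.get(k) == v for k, v in catt.items())]

crystals = [ToyCrystal("c1", {"polymorph": "form_I"}),
            ToyCrystal("c2", {"polymorph": "form_II"})]
print([c._name for c in filter_by_attributes(crystals, {"polymorph": "form_I"})])  # ['c1']
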
Code Example #3
    def run(self,
            simulation: Union[EnergyMinimization, CellRelaxation,
                              MolecularDynamics, Metadynamics],
            crystals="all",
            catt=None,
            suffix=""):
        """
        Creates groups from the crystal attributes in the simulation object.
        :param simulation: Simulation Object (EnergyMinimization, CellRelaxation, MolecularDynamics, Metadynamics)
        :param crystals: It can be either "all", to use all non-melted Crystal objects from the previous simulation,
               or "centers", to use only cluster centers from the previous simulation. Alternatively, you can select
               a specific subset of crystals by listing their names.
        :param catt: Use crystal attributes to select the crystal list
        :param suffix: Suffix appended to the CV name when storing/reading results in the Crystal objects
        :return:
        """

        pd.set_option('display.max_columns', None)
        pd.set_option('display.max_rows', None)
        pd.set_option('display.expand_frame_repr', False)
        pd.set_option('display.max_colwidth', None)
        pd.set_option('display.max_seq_items', None)

        groups = {}
        list_crystals = get_list_crystals(simulation._crystals, crystals, catt)

        if not all(self._attribute in crystal._attributes
                   for crystal in list_crystals):
            print(
                f"Error: some of the Crystals do not have attribute '{self._attribute}'"
            )
            exit()

        for crystal in list_crystals:
            gatt = crystal._attributes[self._attribute]
            if gatt in groups.keys():
                groups[gatt].append(crystal._name)
            else:
                groups[gatt] = [crystal._name]

        self._run(simulation, groups, crystals, catt, suffix)
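
The grouping loop above is the usual "group names by attribute value" pattern; a self-contained sketch with hypothetical attribute values:

from collections import defaultdict

# Hypothetical (crystal name, attribute value) pairs, e.g. for a "space_group" attribute.
labelled = [("crystal_01", "P21/c"), ("crystal_02", "P-1"), ("crystal_03", "P21/c")]

groups = defaultdict(list)
for name, attribute_value in labelled:
    groups[attribute_value].append(name)   # one group per distinct attribute value

print(dict(groups))
# {'P21/c': ['crystal_01', 'crystal_03'], 'P-1': ['crystal_02']}
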
Code Example #4
    def run(self,
            simulation: Union[EnergyMinimization, CellRelaxation,
                              MolecularDynamics, Metadynamics],
            crystals="all",
            catt=None,
            suffix=""):
        """
        Creates groups from the crystal distributions in the simulation object.
        :param simulation: Simulation Object (EnergyMinimization, CellRelaxation, MolecularDynamics, Metadynamics)
        :param crystals: It can be either "all", to use all non-melted Crystal objects from the previous simulation,
               or "centers", to use only cluster centers from the previous simulation. Alternatively, you can select
               a specific subset of crystals by listing their names.
        :param catt: Use crystal attributes to select the crystal list
        :param suffix: Suffix appended to the CV name when storing/reading results in the Crystal objects
        :return:
        """

        pd.set_option('display.max_columns', None)
        pd.set_option('display.max_rows', None)
        pd.set_option('display.expand_frame_repr', False)
        pd.set_option('display.max_colwidth', None)
        pd.set_option('display.max_seq_items', None)

        groups = {}
        list_crystals = get_list_crystals(simulation._crystals, crystals, catt)

        if self._grouping_method == "groups":
            combinations: list = []
            for i in range(self._D):
                combinations.append(
                    [c for c in self._group_bins.keys() if c[0] == i])

            # noinspection PyTypeChecker
            dataset = np.full(
                (len(list_crystals), len(
                    (list(its.product(*combinations)))) + 1), np.nan)
            index = []
            for cidx in range(len(list_crystals)):
                crystal = list_crystals[cidx]

                index.append(crystal._name)
                dist = crystal._cvs[self._dist_cv._name + suffix] / np.sum(
                    crystal._cvs[self._dist_cv._name + suffix])
                c = 0
                for i in its.product(*combinations):
                    dataset[cidx,
                            c] = np.sum(dist[np.ix_(self._group_bins[i[0]],
                                                    self._group_bins[i[1]])])
                    c += 1

            # noinspection PyTypeChecker
            dataset = pd.DataFrame(
                np.where(dataset > self._group_threshold, 1, 0),
                index=index,
                columns=[(i[0][1], i[1][1])
                         for i in its.product(*combinations)] + ["Others"])

            groups = dataset.groupby(dataset.columns.to_list()).groups
            # noinspection PyUnresolvedReferences
            groups = {
                k: groups[k].to_list()
                for k in sorted(groups.keys(), key=lambda x: np.sum(x))
            }

        elif self._grouping_method == "similarity":
            from scipy.sparse import csr_matrix
            from scipy.sparse.csgraph import breadth_first_order
            index = []
            for crystal in list_crystals:
                index.append(crystal._name)

            dmat = pd.DataFrame(np.zeros((len(index), len(index))),
                                columns=index,
                                index=index)
            bar = progressbar.ProgressBar(
                maxval=int(len(list_crystals) * (len(list_crystals) - 1) / 2)).start()
            nbar = 1

            from copy import deepcopy
            for i in range(len(list_crystals) - 1):
                di = deepcopy(list_crystals[i]._cvs[self._dist_cv._name +
                                                    suffix])
                ci = list_crystals[i]._name
                for j in range(i + 1, len(list_crystals)):
                    dj = deepcopy(list_crystals[j]._cvs[self._dist_cv._name +
                                                        suffix])
                    cj = list_crystals[j]._name
                    bar.update(nbar)
                    nbar += 1
                    if self._dist_cv._type == "Radial Distribution Function":
                        if len(di) > len(dj):
                            hd = hellinger(di.copy()[:len(dj)], dj.copy(),
                                           self._int_type)
                        else:
                            hd = hellinger(di.copy(),
                                           dj.copy()[:len(di)], self._int_type)
                    else:
                        hd = hellinger(di.copy(), dj.copy(), self._int_type)
                    dmat.at[ci, cj] = dmat.at[cj, ci] = hd
            bar.finish()

            dmat = pd.DataFrame(np.where(dmat.values < self._group_threshold,
                                         1., 0.),
                                columns=index,
                                index=index)

            graph = csr_matrix(dmat)
            removed = []
            for c in range(len(dmat.index)):
                if dmat.index.to_list()[c] in removed:
                    continue
                bfs = breadth_first_order(graph, c, False, False)
                group_index = [index[i] for i in range(len(index)) if i in bfs]
                removed = removed + group_index
                groups[group_index[0]] = group_index

        self._run(simulation, groups, crystals, catt, suffix)
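
In the "similarity" branch above, crystals are grouped by computing Hellinger distances between their CV distributions, thresholding the distance matrix into an adjacency matrix, and collecting connected structures with scipy's breadth_first_order. A self-contained sketch of that idea, using a plain discrete Hellinger distance (the hellinger helper used here also takes an integration-type argument and may be defined differently):

import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import breadth_first_order

def hellinger_discrete(p, q):
    """Discrete Hellinger distance between two normalized distributions."""
    return np.sqrt(0.5 * np.sum((np.sqrt(p) - np.sqrt(q)) ** 2))

# Toy normalized distributions standing in for the crystals' CV histograms.
names = ["c1", "c2", "c3"]
dists = [np.array([0.70, 0.20, 0.10]),
         np.array([0.68, 0.22, 0.10]),
         np.array([0.10, 0.20, 0.70])]

n = len(names)
dmat = np.zeros((n, n))
for i in range(n - 1):
    for j in range(i + 1, n):
        dmat[i, j] = dmat[j, i] = hellinger_discrete(dists[i], dists[j])

threshold = 0.3                                   # hypothetical group threshold
adjacency = csr_matrix(np.where(dmat < threshold, 1.0, 0.0))

groups, assigned = {}, set()
for c in range(n):
    if names[c] in assigned:
        continue
    reachable = breadth_first_order(adjacency, c, directed=False,
                                    return_predecessors=False)
    members = [names[i] for i in reachable]
    assigned.update(members)
    groups[members[0]] = members                  # the first member labels the group

print(groups)                                     # {'c1': ['c1', 'c2'], 'c3': ['c3']}
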
Code Example #5
    def run(self,
            simulation,
            crystals="all",
            group_threshold: float = 0.8,
            catt=None,
            suffix="",
            path_output="",
            _check=True):

        from PyPol.gromacs import EnergyMinimization, MolecularDynamics, CellRelaxation, Metadynamics

        if type(simulation) not in [
                EnergyMinimization, MolecularDynamics, CellRelaxation,
                Metadynamics
        ]:
            print(
                "Error: simulation object not suitable for clustering analysis"
            )
            exit()

        # TODO Simplify script. Rewrite crystal sorting in groups and clustering
        #      ---> too complicated and probably inefficient to use pandas Dataframe in this context
        #      Use crystal attributes or dict?
        pd.set_option('display.max_columns', None)
        pd.set_option('display.max_rows', None)
        pd.set_option('display.expand_frame_repr', False)
        pd.set_option('display.max_colwidth', None)
        pd.set_option('display.max_seq_items', None)

        simulation._clusters = {}
        simulation._cluster_data = {}

        list_crystals = get_list_crystals(simulation._crystals, crystals, catt)

        if not simulation._completed and not _check:
            print(
                "Simulation {} is not completed yet. Run simulation.get_results() to check termination and import "
                "results.".format(simulation._name))

        if not path_output:
            path_output = simulation._path_output
            path_output_data = simulation._path_output + str(
                self._name) + f"{suffix}_data/"
        else:
            path_output_data = path_output + str(
                self._name) + f"{suffix}_data/"

        if not os.path.exists(path_output):
            os.mkdir(path_output)
        if not os.path.exists(path_output_data):
            os.mkdir(path_output_data)

        d_c = np.array([])

        # Sort crystals into groups
        group_options = []
        group_names = []
        for cv in self._cvp:
            if cv.clustering_type == "classification":
                crystal = list_crystals[0]
                group_options.append(
                    list(crystal._cvs[cv._name + suffix].keys()))
                group_names.append(cv._name)
        if group_options:
            if len(group_names) == 1:
                combinations = group_options[0] + [None]
                index = [str(i)
                         for i in range(len(combinations) - 1)] + ["Others"]
                combinations = pd.concat(
                    (pd.Series(combinations, name=group_names[0], index=index),
                     pd.Series([0 for _ in range(len(combinations))],
                               name="Number of structures",
                               dtype=int,
                               index=index),
                     pd.Series([[] for _ in range(len(combinations))],
                               name="Structures",
                               index=index)),
                    axis=1)
            else:
                combinations = list(its.product(*group_options)) + \
                               [tuple([None for _ in range(len(group_names))])]
                index = [str(i)
                         for i in range(len(combinations) - 1)] + ["Others"]
                combinations = pd.concat(
                    (pd.DataFrame(
                        combinations, columns=group_names, index=index),
                     pd.Series([0 for _ in range(len(combinations))],
                               name="Number of structures",
                               dtype=int,
                               index=index),
                     pd.Series([[] for _ in range(len(combinations))],
                               name="Structures",
                               index=index)),
                    axis=1)
            combinations.index.name = "Combinations"
            for crystal in list_crystals:
                combinations = self._sort_crystal(crystal, combinations,
                                                  group_threshold, suffix)
        else:
            combinations = pd.DataFrame(
                [[len(list_crystals), list_crystals]],
                columns=["Number of structures", "Structures"],
                dtype=None,
                index=["all"])
            combinations.index.name = "Combinations"
            # for crystal in list_crystals:
            #     combinations.loc["all", "Structures"].append(crystal)
            #     combinations.loc["all", "Number of structures"] += 1

        slist = [
            np.full((combinations.loc[i, "Number of structures"],
                     combinations.loc[i, "Number of structures"]), 0.0)
            for i in combinations.index
        ]
        combinations = pd.concat(
            (combinations,
             pd.Series(slist, name="Distance Matrix",
                       index=combinations.index)),
            axis=1)

        # Generate Distance Matrix of each set of distributions
        distributions = [
            cv for cv in self._cvp if cv.clustering_type != "classification"
        ]
        n_factors = {}
        for cv in distributions:
            combinations[cv._name + suffix] = pd.Series(
                copy.deepcopy(combinations["Distance Matrix"].to_dict()),
                index=combinations.index)
            n_factors[cv._name + suffix] = 0.

            print("\nCV: {}".format(cv._name))
            bar = progressbar.ProgressBar(maxval=len(list_crystals)).start()
            nbar = 1

            for index in combinations.index:
                if combinations.at[index, "Number of structures"] > 1:
                    crystals = combinations.at[index, "Structures"]

                    for i in range(len(crystals) - 1):
                        di = crystals[i]._cvs[cv._name + suffix]
                        bar.update(nbar)
                        nbar += 1
                        for j in range(i + 1, len(crystals)):
                            dj = crystals[j]._cvs[cv._name + suffix]
                            if di.shape != dj.shape:
                                di = di.copy()[tuple(map(slice, dj.shape))]
                                dj = dj.copy()[tuple(map(slice, di.shape))]
                            hd = hellinger(di.copy(), dj.copy(),
                                           self._int_type)
                            combinations.loc[index, cv._name + suffix][
                                i, j] = combinations.loc[index, cv._name +
                                                         suffix][j, i] = hd

                            if hd > n_factors[cv._name + suffix]:
                                n_factors[cv._name + suffix] = hd

            bar.finish()

        # Normalize distances
        print("Normalization...", end="")
        normalization = []
        for cv in distributions:
            normalization.append(1. / n_factors[cv._name + suffix])
            for index in combinations.index:
                if combinations.at[index, "Number of structures"] > 1:
                    combinations.at[index, cv._name +
                                    suffix] /= n_factors[cv._name + suffix]
        print("done\nGenerating Distance Matrix...", end="")

        # Generate Distance Matrix
        normalization = np.linalg.norm(np.array(normalization))
        for index in combinations.index:
            if combinations.at[index, "Number of structures"] > 1:
                if len(distributions) > 1:
                    combinations.at[
                        index, "Distance Matrix"][:, :] = np.linalg.norm(
                            np.dstack(
                                tuple([
                                    k for k in combinations.loc[index, [
                                        cv._name + suffix
                                        for cv in distributions
                                    ]]
                                ])),
                            axis=2) / normalization
                else:
                    combinations.at[index, "Distance Matrix"][:, :] = combinations.loc[
                                                                          index, distributions[0]._name + suffix] / \
                                                                      normalization
                d_c = np.append(
                    d_c,
                    combinations.at[index, "Distance Matrix"][np.triu_indices(
                        combinations.at[index, "Distance Matrix"].shape[0],
                        1)])

        # combinations.at[index, "Distance Matrix"][:, :] = np.linalg.norm(
        #             np.dstack(set([k for k in combinations.loc[index, [cv._name for cv in distributions]]])),
        #             axis=2) / normalization
        #     for i in range(combinations.at[index, "Number of structures"] - 1):
        #         for j in range(i + 1, combinations.at[index, "Number of structures"]):
        #             dist_ij = np.linalg.norm([k[i, j] for k in
        #                                       combinations.loc[index, [cv._name for cv in distributions]]])
        #             combinations.at[index, "Distance Matrix"][i, j] = \
        #                 combinations.at[index, "Distance Matrix"][j, i] = dist_ij / normalization
        #             d_c.append(dist_ij)

        for index in combinations.index:
            if combinations.at[index, "Number of structures"] > 1:
                idx = [i._name for i in combinations.at[index, "Structures"]]
                for mat in combinations.loc[index, "Distance Matrix":].index:
                    combinations.at[index,
                                    mat] = pd.DataFrame(combinations.at[index,
                                                                        mat],
                                                        index=idx,
                                                        columns=idx)
                    combinations.at[index, mat].to_csv(path_output_data +
                                                       mat.replace(" ", "") +
                                                       "_" + index + ".dat")
                    # with open(path_output + mat.replace(" ", "") + "_" + index + ".dat", 'w') as fo:
                    #     fo.write(combinations.loc[index, mat].__str__())

        print("done\nSaving Distance Matrix...", end="")
        for i in combinations.loc[:, "Distance Matrix":].columns:
            total = pd.concat([
                m for m in combinations.loc[:, i]
                if not isinstance(m, np.ndarray)
            ])
            total.to_csv(path_output + str(self._name) + "_" +
                         i.replace(" ", "") + ".dat")
            plt.imshow(total, interpolation="nearest", cmap="viridis")
            plt.colorbar()
            plt.tight_layout()
            plt.savefig(path_output + str(self._name) + "_" +
                        i.replace(" ", "") + ".png",
                        dpi=300)
            plt.close('all')

        list_crys = [[i._name for i in row["Structures"]]
                     for index, row in combinations.iterrows()]
        file_output = pd.concat(
            (combinations.loc[:, :"Number of structures"],
             pd.Series(list_crys, name="IDs", index=combinations.index)),
            axis=1)

        with open(path_output + str(self._name) + "_Groups.dat", 'w') as fo:
            fo.write("Normalization Factors:\n")
            for n in n_factors.keys():
                fo.write("{:15}: {:<1.3f}\n".format(n, n_factors[n]))
            fo.write(file_output.__str__())
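        # d_c is the cutoff distance: the d_c_fraction quantile of all pairwise distances pooled above.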
        d_c = np.sort(np.array(d_c))[int(float(len(d_c)) * self._d_c_fraction)]
        print("done\nClustering...", end="")

        # Remove structures that are not cluster centers
        changes_string = ""
        with open(path_output + str(self._name) + "_FSFDP.dat", 'w') as fo:
            fo.write("# FSFDP parameters for every group:\n")

        for index in combinations.index:
            if int(combinations.at[index, "Number of structures"]) == 0:
                continue
            elif int(combinations.at[index, "Number of structures"]) == 1:
                nc = combinations.at[index, "Structures"][0]._name
                columns = ["rho", "sigma", "NN", "cluster", "distance"]
                simulation._cluster_data[index] = pd.DataFrame(
                    [[0, 0, pd.NA, nc, 0]], index=[nc], columns=columns)
                simulation._clusters[index] = {nc: [nc]}
            elif int(combinations.at[index, "Number of structures"]) == 2:
                nc1 = combinations.at[index, "Structures"][0]._name
                nc2 = combinations.at[index, "Structures"][1]._name
                columns = ["rho", "sigma", "NN", "cluster", "distance"]
                d_12 = combinations.at[index, "Distance Matrix"].values[0, 1]
                if d_12 > d_c:
                    simulation._cluster_data[index] = pd.DataFrame(
                        [[0, 0, nc2, nc1, 0], [0, 0, nc1, nc2, 0]],
                        index=[nc1, nc2],
                        columns=columns)
                    simulation._clusters[index] = {nc1: [nc1], nc2: [nc2]}
                else:
                    simulation._cluster_data[index] = pd.DataFrame(
                        [[0, 0, nc2, nc1, 0], [0, 0, nc1, nc1, d_12]],
                        index=[nc1, nc2],
                        columns=columns)
                    simulation._clusters[index] = {nc1: [nc1, nc2]}
            elif int(combinations.at[index, "Number of structures"]) > 2:
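                # Groups with more than two structures are clustered with FSFDP
                # (fast search and find of density peaks): each structure gets a local
                # density "rho" and a distance "sigma" to the nearest denser structure,
                # and cluster centers are identified from the resulting decision graph.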
                if self._algorithm == "fsfdp":
                    simulation._cluster_data[index], sc = FSFDP(
                        combinations.at[index, "Distance Matrix"],
                        kernel=self._kernel,
                        d_c=d_c,
                        d_c_neighbors_fraction=self._d_c_fraction,
                        sigma_cutoff=self._sigma_cutoff)
                    _save_decision_graph(
                        simulation._cluster_data[index].loc[:, "rho"].values,
                        simulation._cluster_data[index].loc[:, "sigma"].values,
                        sigma_cutoff=sc,
                        path=path_output_data + "Decision_graph_" +
                        str(index) + ".png")

                    with open(
                            path_output_data + "FSFDP_" + str(index) + ".dat",
                            'w') as fo:
                        fo.write(simulation._cluster_data[index].__str__())

                    with open(path_output + str(self._name) + "_FSFDP.dat",
                              'a') as fo:
                        fo.write("\n# Group {}\n".format(str(index)))
                        fo.write(simulation._cluster_data[index].__str__())

                simulation._clusters[index] = {
                    k: simulation._cluster_data[index].index[
                        simulation._cluster_data[index]["cluster"] ==
                        k].tolist()
                    for k in list(simulation._cluster_data[index]
                                  ["cluster"].unique())
                }

            if self._centers.lower() == "energy":
                new_clusters = copy.deepcopy(simulation._clusters[index])
                energies = {
                    k._name: k._energy
                    for k in combinations.at[index, "Structures"]
                }
                for center in simulation._clusters[index].keys():
                    changes = [center, None]
                    emin = energies[center]
                    for crystal in simulation._clusters[index][center]:
                        if energies[crystal] < emin:
                            changes[1] = crystal
                            emin = energies[crystal]
                    if changes[1]:
                        new_clusters[changes[1]] = new_clusters.pop(changes[0])
                        changes_string += "{:>25} ---> {:25}\n".format(
                            changes[0], changes[1])
                simulation._clusters[index] = new_clusters

            for crystal in combinations.at[index, "Structures"]:
                for cc in simulation._clusters[index].keys():
                    if crystal._name in simulation._clusters[index][cc]:
                        crystal._state = cc
                        break
        cluster_groups = [
            g for g in simulation._clusters.keys()
            for _ in simulation._clusters[g].keys()
        ]
        simulation._clusters = {
            k: v
            for g in simulation._clusters.keys()
            for k, v in simulation._clusters[g].items()
        }
        simulation._clusters = pd.concat(
            (pd.Series(data=[
                len(simulation._clusters[x])
                for x in simulation._clusters.keys()
            ],
                       index=simulation._clusters.keys(),
                       name="Number of Structures"),
             pd.Series(data=cluster_groups,
                       index=simulation._clusters.keys(),
                       name="Group"),
             pd.Series(data=[
                 ", ".join(str(y) for y in simulation._clusters[x])
                 for x in simulation._clusters.keys()
             ],
                       index=simulation._clusters.keys(),
                       name="Structures")),
            axis=1).sort_values(by="Number of Structures", ascending=False)

        with open(path_output + str(self._name) + "_Clusters.dat", 'w') as fo:
            if changes_string:
                fo.write(
                    "Cluster centers changed according to potential energy:\n")
                fo.write(changes_string)
            fo.write(simulation._clusters.__str__())

        with open(path_output + str(self._name) + "_DFC.dat", 'w') as fo:
            if changes_string:
                fo.write(
                    "Cluster centers changed according to potential energy:\n")
                fo.write(changes_string)
            total = pd.concat([
                m for m in combinations.loc[:, "Distance Matrix"]
                if not isinstance(m, np.ndarray)
            ])
            index = []
            centers = []
            distances = []
            for crystal in get_list_crystals(simulation._crystals,
                                             crystals=total.index.to_list()):
                index.append(crystal._name)
                centers.append(crystal._state)
                distances.append(total.at[crystal._name, crystal._state])
            dfc = pd.DataFrame({
                "Center": centers,
                "Distance": distances
            },
                               index=index).sort_values(by="Distance")
            fo.write(dfc.__str__())
        print("done")