Beispiel #1
0
    def generate_next_inpcrds(self, msm, clusters):
        """
        Writes the input coordinate files for the next generation.

        Does nothing if the next generation's directory under "systems"
        already holds one "*.inpcrd" file per replica. Otherwise creates that
        directory, computes hub scores for the MSM, saves them to
        "mmsm_scores.pkl" and runs an InpcrdGenerator.

        Args:
            msm: Model handed to hub_scores and to the InpcrdGenerator
            clusters: Cluster data handed to the InpcrdGenerator
        """
        # Directory that contains topologies and inpcrds of the next generation
        gendir = os.path.join(self.dir, "systems", str(self.generation + 1))

        # Check if inpcrds are already made
        if len(glob(os.path.join(gendir, "*.inpcrd"))) == self.nreps:
            print("   Already have samplers... skipping inpcrd_generation")
            return

        # Creates the intermediate "systems" directory when needed and does
        # not fail if either directory already exists
        os.makedirs(gendir, exist_ok=True)

        scores = hub_scores(msm)
        utils.dump(scores, "mmsm_scores.pkl")

        # Selection criterion defaults to hub scores when not configured
        criteria = self.config.get("model", "criteria", fallback="hub_scores")
        gen = InpcrdGenerator(prodfiles=self.prodfiles,
                              clusters=clusters,
                              msm=msm,
                              scores=scores,
                              config=self.config,
                              criteria=criteria)
        gen.run()
Beispiel #2
0
    def update_features(self):
        """
        Uses the current trajectories to update the features.

        Features of this generation are cached on disk under the name given
        by the "featurized" format string: as "<name>.h5", as "<name>.pkl",
        or as "<name>" itself for older pickle runs. If one of these files
        exists it is loaded; otherwise the new trajectories are featurized
        and the result is saved. An empty cache file is deleted and the
        features are regenerated.

        Returns: featurized all trajectories ready for tica

        Raises:
            SystemExit: If the featurized option lacks a "%d" placeholder
            RuntimeError: If featurizing the new trajectories gives nothing
        """

        # Check feature string has correct format (space for generation)
        if "%d" not in self.featurized:
            print("ERROR: Need format string %d in featurized option")
            # quit() is the interactive helper from the site module
            sys.exit(1)

        base = self.featurized % self.generation
        h5_file = "%s.h5" % base
        pkl_file = "%s.pkl" % base

        # First cache file that exists, preferring h5 over the pickle forms
        cached = next((path for path in (h5_file, pkl_file, base)
                       if os.path.isfile(path)), None)

        if cached is not None:
            print("Already have features for generation %d" % self.generation)
            if cached.endswith(".h5"):
                feated = utils.load_features_h5(cached)
            else:
                feated = utils.load(cached)

            if len(feated) > 0:
                return feated

            # Check feature file isn't empty. If so, delete the file that
            # was actually read and recurse to regenerate
            print("Empty features generation %d... Regenerating" %
                  self.generation)
            os.remove(cached)
            return self.update_features()

        # Featurize this generation
        featr = MultiligandContactFeaturizer(ligands=self.ligands,
                                             scheme="closest-heavy",
                                             protein=None,
                                             scaling_function=None,
                                             log=True)
        feated = []
        for traj in self.new_prodfiles:
            topo = utils.get_topology(traj, self.dir)
            if not os.path.isfile(topo):
                # Fall back to the topology named in the configuration
                topo = os.path.abspath(
                    self.config.get("system", "topologypsf"))
            featme = md.load(traj, top=topo, stride=1)
            # Hilariously this requires a list to be the right output shape
            feated.extend(featr.transform([featme]))

        # Regenerating cannot help here: the same inputs give the same
        # empty result, so fail instead of recursing without end
        if len(feated) == 0:
            raise RuntimeError("No features produced for generation %d" %
                               self.generation)

        # Save this feature set, with backwards compatibility for pickle runs
        if ".pkl" in self.featurized:
            utils.dump(feated, base)
        else:
            utils.save_features_h5(feated, h5_file)

        # We only need to update tica with new features this generation
        return feated
Beispiel #3
0
    def generate_clusters(self, ticad):
        """
        Re-clusters the given data with a freshly fit MiniBatchKMeans.

        This is repeated every iteration, since clusters found from earlier
        trajectories may change as more data comes in. The number of
        clusters is read from the "num_clusters" option of the "model"
        config section. When save_extras is set, the fitted clusterer is
        dumped to "microstater.pkl".

        Args:
            ticad: Data to fit the clusterer on and transform

        Returns: clustered dataset
        """
        n_states = self.config.getint("model", "num_clusters")
        kmeans = MiniBatchKMeans(n_clusters=n_states)
        assignments = kmeans.fit_transform(ticad)

        # Keep the fitted clusterer only when extra output was requested
        if self.save_extras:
            utils.dump(kmeans, "microstater.pkl")

        return assignments
Beispiel #4
0
    def generate_msm(self, clustered):
        """
        Generates a MSM from the current cluster data

        A microstate MSM is fit on the clustered data and saved as
        "msm_G<generation>.pkl". It is then lumped into macrostates with
        PCCAPlus, and a second MSM is fit on the macrostate assignments and
        saved as "mmsm_G<generation>.pkl". When save_extras is set, the
        PCCAPlus object is dumped to "macrostater.pkl".

        Args:
            clustered: Clustered dataset to fit the microstate MSM on

        Returns: (macrostate Msm, macrostate-assigned dataset)
        """
        # Generate microstate MSM
        self.currtime = time.time()
        # Shared by the microstate and macrostate models so they stay in sync
        msm_settings = dict(lag_time=self.config.getint("model", "msm_lag"),
                            reversible_type="transpose",
                            ergodic_cutoff="off",
                            prior_counts=0.000001)
        msm = MarkovStateModel(**msm_settings)
        msm.fit(clustered)
        print("TIME\tmicromsm:\t%f" % (time.time() - self.currtime))
        utils.dump(msm, "msm_G%d.pkl" % self.generation)

        # Lump into macrostates
        self.currtime = time.time()
        n_macrostates = self.config.getint("model", "macrostates")
        pcca = PCCAPlus.from_msm(msm, n_macrostates=n_macrostates)
        mclustered = pcca.transform(clustered, mode="fill")
        # One vectorized NaN test per sequence instead of one call per element
        if any(np.isnan(m).any() for m in mclustered):  #pylint: disable=no-member
            print(
                "WARNING: Unassignable clusters in PCCA with %d macrostates!" %
                n_macrostates)
        print("TIME\tpccaplus:\t%f" % (time.time() - self.currtime))
        if self.save_extras:
            utils.dump(pcca, "macrostater.pkl")

        # Generate macrostate MSM
        self.currtime = time.time()
        mmsm = MarkovStateModel(**msm_settings)
        mmsm.fit(mclustered)
        print("TIME\tmacromsm\t%f" % (time.time() - self.currtime))
        utils.dump(mmsm, "mmsm_G%d.pkl" % self.generation)

        return mmsm, mclustered
Beispiel #5
0
    def run(self):
        """
        Builds the MSM for this generation and prepares the next one.

        A saved macrostate MSM and saved cluster assignments are reused when
        both are on disk. Otherwise tics are loaded from disk or recomputed
        from fresh features, the data is clustered, and the MSM is built.
        Input coordinates for the next generation are then written unless
        the maximum generation is reached, in which case the run is
        finished. Last, the config file is rewritten with JobID "0" and the
        cluster means are saved.
        """
        msm_file = "mmsm_G%d.pkl" % self.generation
        micro_file = "testing.cluster.pkl"
        macro_file = "testing.mcluster.pkl"

        # Check if a pickled msm already exists. If so, use it to save time
        if os.path.isfile(msm_file) and os.path.isfile(micro_file):
            print("Loading MSM for generation %d:" % self.generation)
            msm = utils.load(msm_file)
            # Use the macrostate assignments if saved, else the microstates
            assignment_file = macro_file if os.path.isfile(macro_file) \
                else micro_file
            mclusters = utils.load(assignment_file)

        else:
            print("GENERATION %d:" % self.generation)
            tica_pickle = "testing.tica.pkl"
            tica_h5 = "ticad_%d.h5" % self.generation

            # Check if tica exists already
            if os.path.isfile(tica_pickle):
                print("Loading tics...")
                tics = utils.load(tica_pickle)
            elif os.path.isfile(tica_h5):
                tics = utils.load_features_h5(tica_h5)
            else:
                # Featurize new trajectories
                print("  Featurizing...")
                sys.stdout.flush()
                self.currtime = time.time()
                features = self.update_features()
                elapsed = time.time() - self.currtime
                print("TIME:\tfeaturize\t%f" % elapsed)
                counts = (len(self.new_prodfiles), len(features))
                print("Files: %d Features: %d" % counts)

                # Regenerate tics
                print("  tICing...")
                sys.stdout.flush()
                self.currtime = time.time()
                tics = self.generate_tics(features)
                elapsed = time.time() - self.currtime
                print("TIME:\tticaing\t%f" % elapsed)

            print("  Clustering...")
            sys.stdout.flush()
            self.currtime = time.time()
            clusters = self.generate_clusters(tics)
            elapsed = time.time() - self.currtime
            print("TIME:\tcluster\t%f" % elapsed)
            utils.dump(clusters, micro_file)

            print("  MSMing...")
            sys.stdout.flush()
            msm, mclusters = self.generate_msm(clusters)
            utils.dump(mclusters, macro_file)  # DEBUG

        # Resample, if we haven't reached max generation
        max_generation = self.config.getint("production",
                                            "max_generation",
                                            fallback=1000000)
        if self.generation >= max_generation:
            self.finish_run()
        else:
            print("  Sampling and starting...")
            sys.stdout.flush()
            self.currtime = time.time()
            self.generate_next_inpcrds(msm, mclusters)
            elapsed = time.time() - self.currtime
            print("TIME:\tinpcrd:\t%f" % elapsed)

        # The generation is incremented by the last ligand_adder.

        # Indicate that the model has completed successfully
        # This isn't really necessary but whatever
        self.config["model"]["JobID"] = "0"
        with open(self.configfile, 'w') as handle:
            self.config.write(handle)

        # Save representative clusters last
        self.currtime = time.time()
        self.save_cluster_means(mclusters)
        elapsed = time.time() - self.currtime
        print("TIME:\taggregate:\t%f" % elapsed)