Exemple #1
0
    def __init__(self, cfg: ExperimentConfig):
        """Store the configuration, build the API and pipeline objects.

        Finishes by calling ``_init_experiment_dir()``.

        Parameters
        ----------
        cfg : ExperimentConfig
            Experiment configuration; its ``experiment_directory`` is
            passed to ``DeepDriveMD_API``.
        """
        self.cfg, self.stage_idx = cfg, 0
        self.api = DeepDriveMD_API(cfg.experiment_directory)

        # Create the pipeline, label it with the class-level name, then
        # publish it on the instance.
        pipeline = Pipeline()
        pipeline.name = self.PIPELINE_NAME
        self.pipeline = pipeline

        self._init_experiment_dir()
Exemple #2
0
    def __init__(self, cfg: OpenMMConfig):
        """Resolve the working directory for this task and initialise it.

        Finishes by calling ``_init_workdir()``.

        Parameters
        ----------
        cfg : OpenMMConfig
            Task configuration providing ``experiment_directory``,
            ``output_path`` and the optional ``node_local_path``.
        """
        self.cfg = cfg
        self.api = DeepDriveMD_API(cfg.experiment_directory)

        md_stage = self.api.molecular_dynamics_stage
        self._prefix = md_stage.unique_name(cfg.output_path)

        # Prefer node local storage when it is configured; otherwise work
        # directly in the output directory.
        node_local = cfg.node_local_path
        self.workdir = (cfg.output_path if node_local is None else
                        node_local.joinpath(self._prefix))

        self._init_workdir()
def latest_model_checkpoint(cfg: LatestCheckpointConfig):
    r"""Record which model checkpoint and configuration should be used.

    When ``cfg.stage_idx`` is a multiple of ``cfg.retrain_freq`` the latest
    checkpoint is looked up with ``latest_checkpoint`` and paired with the
    machine learning stage configuration path of this stage. Otherwise the
    selection recorded for the previous stage is reused. Either way the
    result is written as task JSON through the model selection stage API.

    Parameters
    ----------
    cfg : LatestCheckpointConfig
        pydantic YAML configuration for model selection task.
    """
    api = DeepDriveMD_API(cfg.experiment_directory)
    stage_idx, task_idx = cfg.stage_idx, cfg.task_idx

    retrained = stage_idx % cfg.retrain_freq == 0
    if not retrained:
        # No new model this iteration: carry the previous selection over.
        previous = get_model_path(stage_idx - 1, task_idx, api)
        assert previous is not None, f"{stage_idx - 1}, {task_idx}"
        model_config, model_checkpoint = previous
    else:
        # A new model was trained: pick its newest checkpoint and config.
        model_checkpoint = latest_checkpoint(api, cfg.checkpoint_dir,
                                             cfg.checkpoint_suffix)
        model_config = api.machine_learning_stage.config_path(
            stage_idx, task_idx)

    # JSON serializable list holding a single record.
    record = {
        "model_checkpoint": str(model_checkpoint),
        "model_config": str(model_config)
    }
    api.model_selection_stage.write_task_json([record], stage_idx, task_idx)
Exemple #4
0
def get_h5_training_file(cfg: KerasCVAEModelConfig) -> Tuple[Path, List[str]]:
    """Build the virtual HDF5 training file from the collected MD data.

    Parameters
    ----------
    cfg : KerasCVAEModelConfig
        Model configuration; supplies the experiment directory and the
        arguments forwarded to ``get_virtual_h5_file``.

    Returns
    -------
    Tuple[Path, List[str]]
        The two values returned by ``get_virtual_h5_file``, in order.
    """
    api = DeepDriveMD_API(cfg.experiment_directory)

    # Every keyword below is taken from the configuration except the list
    # of candidate data files, which comes from the API.
    virtual_file_kwargs = dict(
        output_path=cfg.output_path,
        all_h5_files=api.get_last_n_md_runs()["data_files"],
        last_n=cfg.last_n_h5_files,
        k_random_old=cfg.k_random_old_h5_files,
        virtual_name=f"virtual_{cfg.model_tag}",
        node_local_path=cfg.node_local_path,
    )
    virtual_path, selected_files = get_virtual_h5_file(**virtual_file_kwargs)

    return virtual_path, selected_files
class DeepDriveMD_Analysis:
    """Read-side helpers for inspecting the outputs of an experiment.

    All data access goes through a ``DeepDriveMD_API`` created for the
    experiment directory.
    """

    def __init__(self, experiment_directory: PathLike):
        """Create the ``DeepDriveMD_API`` handle for ``experiment_directory``."""
        self.api = DeepDriveMD_API(experiment_directory)

    def _resolve_iterations(self, iterations: int) -> int:
        """Translate the ``-1`` sentinel into the API's total iteration count."""
        if iterations == -1:
            return self.api.get_total_iterations()
        return iterations

    def get_agent_json(self,
                       iterations: int = -1) -> List[List[Dict[str, Any]]]:
        """Read the agent stage task JSON of the first ``iterations`` stages.

        Parameters
        ----------
        iterations : int, optional
            Number of stages to read, starting at stage 0. ``-1`` (default)
            means ``self.api.get_total_iterations()``.

        Returns
        -------
        List[List[Dict[str, Any]]]
            One entry per stage, as returned by ``read_task_json``.

        Raises
        ------
        AssertionError
            If ``read_task_json`` returned ``None`` for any stage.
        """
        agent_json_data = [
            self.api.agent_stage.read_task_json(stage_idx)
            for stage_idx in range(self._resolve_iterations(iterations))
        ]
        assert None not in agent_json_data
        return agent_json_data

    def get_agent_h5(
        self,
        iterations: int = -1,
        fields: Optional[List[str]] = None,
    ) -> List[Dict[str, np.ndarray]]:
        """Parse one HDF5 file from each agent stage directory.

        Parameters
        ----------
        iterations : int, optional
            Number of stages to read, starting at stage 0. ``-1`` (default)
            means ``self.api.get_total_iterations()``.
        fields : List[str], optional
            Forwarded to ``parse_h5``. ``None`` (default) is treated as an
            empty list; a fresh list is created per call so no state is
            shared between calls.

        Returns
        -------
        List[Dict[str, np.ndarray]]
            One ``parse_h5`` result per stage.

        Raises
        ------
        StopIteration
            If a stage directory contains no file matching ``**/*.h5``.
        """
        if fields is None:
            fields = []
        h5_data = [
            parse_h5(
                # Only the first match of the recursive glob is used.
                next(
                    self.api.agent_stage.stage_dir(stage_idx).glob("**/*.h5")),
                fields)
            for stage_idx in range(self._resolve_iterations(iterations))
        ]
        return h5_data

    def apply_analysis_fn(
        self,
        fn: Callable,
        num_workers: Optional[int] = None,
        n: Optional[int] = None,
        data_file_suffix: str = ".h5",
        traj_file_suffix: str = ".dcd",
        structure_file_suffix: str = ".pdb",
    ) -> List[Any]:
        """Map ``fn`` over the MD run data in a process pool.

        Parameters
        ----------
        fn : Callable
            Function submitted to the ``ProcessPoolExecutor``; it receives
            one item of ``zip(md_data.values())`` per call.
        num_workers : int, optional
            ``max_workers`` of the process pool.
        n : int, optional
            Forwarded to ``get_last_n_md_runs``.
        data_file_suffix, traj_file_suffix, structure_file_suffix : str
            Forwarded to ``get_last_n_md_runs``.

        Returns
        -------
        List[Any]
            The results of ``fn`` in submission order.
        """
        md_data = self.api.get_last_n_md_runs(n, data_file_suffix,
                                              traj_file_suffix,
                                              structure_file_suffix)
        # NOTE(review): zip() over a single iterable yields 1-tuples, so
        # ``fn`` is called once per *value* of ``md_data``. If one call per
        # MD run was intended this would need ``zip(*md_data.values())``;
        # confirm against callers before changing.
        with ProcessPoolExecutor(max_workers=num_workers) as executor:
            return list(tqdm(executor.map(fn, zip(md_data.values()))))
def get_model_path(
    stage_idx: int = -1,
    task_idx: int = 0,
    api: Optional[DeepDriveMD_API] = None,
    experiment_dir: Optional[PathLike] = None,
) -> Optional[Tuple[Path, Path]]:
    r"""Look up the model recorded by the model selection stage.

    Should be imported by other stages to retrieve the best model path.

    Parameters
    ----------
    stage_idx : int, optional
        Stage index passed to the model selection ``read_task_json``.
    task_idx : int, optional
        Task index passed to the model selection ``read_task_json``.
    api : DeepDriveMD_API, optional
        API to DeepDriveMD to access the machine learning model path.
    experiment_dir : Union[str, Path], optional
        Experiment directory used to build a ``DeepDriveMD_API`` when
        `api` is not given.

    Returns
    -------
    None
        If ``read_task_json`` has no data for the requested stage/task.
    model_config : Path, optional
        Path stored under ``"model_config"`` in the first JSON record.
    model_checkpoint : Path, optional
        Path stored under ``"model_checkpoint"`` in the first JSON record.

    Raises
    ------
    ValueError
        If both `api` and `experiment_dir` are None.
    """
    if api is None:
        # An API handle can only be built from an experiment directory.
        if experiment_dir is None:
            raise ValueError("Both `api` and `experiment_dir` are None")
        api = DeepDriveMD_API(experiment_dir)

    selection = api.model_selection_stage.read_task_json(stage_idx, task_idx)
    if selection is None:
        return None

    # Only the first record of the task JSON is consulted.
    record = selection[0]
    model_config = Path(record["model_config"])
    model_checkpoint = Path(record["model_checkpoint"])
    return model_config, model_checkpoint
 def __init__(self, experiment_directory: PathLike):
     """Create the ``DeepDriveMD_API`` handle for ``experiment_directory``."""
     self.api = DeepDriveMD_API(experiment_directory)
Exemple #8
0
class SimulationContext:
    """File and directory bookkeeping for a single simulation task.

    On construction the working directory is chosen and created, the input
    PDB is copied or written into it, and a topology file is copied when
    the solvent type is ``"explicit"``. Path-like results are exposed as
    POSIX-style strings through properties.
    """

    def __init__(self, cfg: OpenMMConfig):
        """Resolve the working directory for this task and initialise it.

        Parameters
        ----------
        cfg : OpenMMConfig
            Task configuration providing ``experiment_directory``,
            ``output_path`` and the optional ``node_local_path``.
        """
        self.cfg = cfg
        self.api = DeepDriveMD_API(cfg.experiment_directory)

        md_stage = self.api.molecular_dynamics_stage
        self._prefix = md_stage.unique_name(cfg.output_path)

        # Prefer node local storage when it is configured; otherwise work
        # directly in the output directory.
        node_local = cfg.node_local_path
        self.workdir = (cfg.output_path if node_local is None else
                        node_local.joinpath(self._prefix))

        self._init_workdir()

    @property
    def _sim_prefix(self) -> Path:
        """Working directory joined with the task prefix."""
        return self.workdir / self._prefix

    @property
    def pdb_file(self) -> str:
        """POSIX-style path of the PDB file selected by ``_init_workdir``."""
        return self._pdb_file.as_posix()

    @property
    def traj_file(self) -> str:
        """POSIX-style path of the simulation prefix with suffix ``.dcd``."""
        dcd_path = self._sim_prefix.with_suffix(".dcd")
        return dcd_path.as_posix()

    @property
    def h5_prefix(self) -> str:
        """POSIX-style simulation prefix without any suffix change."""
        return self._sim_prefix.as_posix()

    @property
    def log_file(self) -> str:
        """POSIX-style path of the simulation prefix with suffix ``.log``."""
        log_path = self._sim_prefix.with_suffix(".log")
        return log_path.as_posix()

    @property
    def top_file(self) -> Optional[str]:
        """POSIX-style path of the copied topology file, or ``None``."""
        top = self._top_file
        return None if top is None else top.as_posix()

    @property
    def reference_pdb_file(self) -> Optional[str]:
        """POSIX-style path of ``cfg.reference_pdb_file``, or ``None``."""
        reference = self.cfg.reference_pdb_file
        return None if reference is None else reference.as_posix()

    def _init_workdir(self):
        """Setup workdir and copy PDB/TOP files."""
        self.workdir.mkdir(exist_ok=True)

        self._pdb_file = self._get_pdb_file()

        # A topology file is only copied for explicit solvent.
        needs_topology = self.cfg.solvent_type == "explicit"
        self._top_file = self._copy_top_file() if needs_topology else None

    def _get_pdb_file(self) -> Path:
        """Return the PDB file to start from, placed inside the workdir.

        When ``cfg.pdb_file`` is set it is copied. Otherwise the restart
        structure reported by the API for the previous stage is written
        out as a new PDB file.
        """
        if self.cfg.pdb_file is None:
            # Iterations after outlier detection
            outlier = self.api.get_restart_pdb(self.cfg.task_idx,
                                               self.cfg.stage_idx - 1)
            structure_file = outlier["structure_file"]
            system_name = self.api.get_system_name(structure_file)
            pdb_file = self.workdir.joinpath(
                f"{system_name}__{self._prefix}.pdb")
            self.api.write_pdb(
                pdb_file,
                structure_file,
                outlier["traj_file"],
                outlier["frame"],
                self.cfg.in_memory,
            )
            return pdb_file

        # Initial iteration
        return self._copy_pdb_file()

    def _copy_pdb_file(self) -> Path:
        """Copy ``cfg.pdb_file`` into the workdir and return the new path."""
        source = self.cfg.pdb_file
        assert source is not None
        target_name = self.api.get_system_pdb_name(source)
        copied = shutil.copy(source, self.workdir.joinpath(target_name))
        return Path(copied)

    def _copy_top_file(self) -> Path:
        """Copy the topology file found by the API into the workdir."""
        suffix = self.cfg.top_suffix
        assert suffix is not None
        top_file = self.api.get_topology(self.cfg.initial_pdb_dir,
                                         Path(self.pdb_file), suffix)
        assert top_file is not None
        copied = shutil.copy(top_file, self.workdir.joinpath(top_file.name))
        return Path(copied)

    def move_results(self):
        """Move every workdir entry to ``cfg.output_path`` if they differ."""
        destination = self.cfg.output_path
        if self.workdir == destination:
            # Results were written in place; nothing to move.
            return
        for entry in self.workdir.iterdir():
            shutil.move(str(entry), str(destination.joinpath(entry.name)))
Exemple #9
0
def concatenate_last_n_h5(cfg: BasicAggegation):
    """Concatenate selected datasets of the latest MD HDF5 files into one file.

    The datasets enabled on ``cfg`` (``rmsd``, ``fnc``, ``contact_map``,
    ``point_cloud``) are read from every data file returned by
    ``get_last_n_md_runs`` and concatenated with ``np.concatenate``. Rows
    0:3 of the point cloud are re-centered, and the result is written to
    ``cfg.output_path``.

    Parameters
    ----------
    cfg : BasicAggegation
        Aggregation configuration; provides the field flags, ``verbose``,
        ``experiment_directory``, ``last_n_h5_files`` and ``output_path``.

    Raises
    ------
    ValueError
        If a non-object dataset contains NaN (or from ``np.concatenate``
        when there is nothing to concatenate for a requested field).
    """
    fields = _requested_fields(cfg)

    # Get list of input h5 files
    api = DeepDriveMD_API(cfg.experiment_directory)
    md_data = api.get_last_n_md_runs(n=cfg.last_n_h5_files)
    files = md_data["data_files"]

    if cfg.verbose:
        print(f"Collected {len(files)} h5 files.")

    # The context manager closes the output file even when reading,
    # validation or writing raises.
    with h5py.File(cfg.output_path, "w", libver="latest") as fout:
        data = _read_and_concatenate(files, fields, cfg.verbose)

        # Center of mass (CMS) subtraction
        if "point_cloud" in data:
            if cfg.verbose:
                print("Subtract center of mass (CMS) from point cloud")
            _subtract_center_of_mass(data["point_cloud"])

        _write_datasets(fout, data)


def _requested_fields(cfg):
    """Return the dataset names whose flag on ``cfg`` is truthy, in fixed order."""
    candidates = ("rmsd", "fnc", "contact_map", "point_cloud")
    return [name for name in candidates if getattr(cfg, name)]


def _read_and_concatenate(files, fields, verbose):
    """Read ``fields`` from every file and concatenate each one across files."""
    buffers = {field: [] for field in fields}

    for in_file in files:
        if verbose:
            print("Reading", in_file)

        with h5py.File(in_file, "r") as fin:
            for field in fields:
                buffers[field].append(fin[field][...])

    return {
        field: np.concatenate(chunks)
        for field, chunks in buffers.items()
    }


def _subtract_center_of_mass(point_cloud):
    """Subtract, in place, the mean over axis 2 from rows 0:3 of axis 1."""
    # The mean is accumulated in extended precision and cast back to
    # float32. ``np.longdouble`` is used because ``np.float128`` is not
    # defined on every platform.
    cms = np.mean(point_cloud[:, 0:3, :].astype(np.longdouble),
                  axis=2,
                  keepdims=True).astype(np.float32)
    point_cloud[:, 0:3, :] -= cms


def _write_datasets(fout, data):
    """Create one chunked dataset per entry of ``data`` in the open file."""
    for field, concat_dset in data.items():
        if field == "traj_file":
            utf8_type = h5py.string_dtype("utf-8")
            fout.create_dataset("traj_file", data=concat_dset, dtype=utf8_type)
            continue

        shape = concat_dset.shape
        # One chunk per entry along the first axis.
        chunkshape = (1, ) + shape[1:]

        # ``np.object`` was removed from NumPy; the builtin ``object``
        # compares equal to the same dtype.
        if concat_dset.dtype != object:
            if np.any(np.isnan(concat_dset)):
                raise ValueError("NaN detected in concat_dset.")
            dtype = concat_dset.dtype
        else:
            # Object arrays are stored as variable-length int16 data.
            dtype = h5py.vlen_dtype(np.int16)

        dset = fout.create_dataset(field,
                                   shape,
                                   chunks=chunkshape,
                                   dtype=dtype)
        # write data
        dset[...] = concat_dset[...]
Exemple #10
0
class PipelineManager:
    """Assemble the DeepDriveMD pipeline one iteration at a time.

    Each iteration appends a molecular dynamics stage, an aggregation stage
    (unless skipped), a machine learning stage (on retraining iterations),
    a model selection stage and an agent stage to a single ``Pipeline``.
    ``func_condition`` is attached to the agent stage as ``post_exec`` and
    appends another iteration while ``stage_idx`` is below
    ``cfg.max_iteration``.
    """

    PIPELINE_NAME = "DeepDriveMD"
    MOLECULAR_DYNAMICS_STAGE_NAME = "MolecularDynamics"
    AGGREGATION_STAGE_NAME = "Aggregating"
    MACHINE_LEARNING_STAGE_NAME = "MachineLearning"
    MODEL_SELECTION_STAGE_NAME = "ModelSelection"
    AGENT_STAGE_NAME = "Agent"

    def __init__(self, cfg: ExperimentConfig):
        """Store the configuration, build the pipeline and create directories.

        Parameters
        ----------
        cfg : ExperimentConfig
            Experiment configuration holding the per-stage settings.
        """
        self.cfg = cfg
        self.stage_idx = 0

        self.api = DeepDriveMD_API(cfg.experiment_directory)
        self.pipeline = Pipeline()
        self.pipeline.name = self.PIPELINE_NAME

        self._init_experiment_dir()

    def _init_experiment_dir(self):
        """Create the experiment directory and one runs directory per stage."""
        # ``mkdir()`` is called without ``exist_ok``; assuming these are
        # ``pathlib.Path`` objects, an already existing directory raises
        # instead of being reused -- TODO confirm this is intentional.
        self.cfg.experiment_directory.mkdir()
        self.api.molecular_dynamics_stage.runs_dir.mkdir()
        self.api.aggregation_stage.runs_dir.mkdir()
        self.api.machine_learning_stage.runs_dir.mkdir()
        self.api.model_selection_stage.runs_dir.mkdir()
        self.api.agent_stage.runs_dir.mkdir()

    def func_condition(self):
        """Append another iteration while ``stage_idx < cfg.max_iteration``."""
        if self.stage_idx < self.cfg.max_iteration:
            self.func_on_true()
        else:
            self.func_on_false()

    def func_on_true(self):
        """Report progress and append the stages of the next iteration."""
        print(f"Finishing stage {self.stage_idx} of {self.cfg.max_iteration}")
        self._generate_pipeline_iteration()

    def func_on_false(self):
        """Report that no further iteration will be appended."""
        print("Done")

    def _generate_pipeline_iteration(self):
        """Append the stages of one iteration and advance ``stage_idx``."""
        self.pipeline.add_stages(self.generate_molecular_dynamics_stage())

        # Stage settings are read from ``self.cfg``; a bare ``cfg`` is not
        # defined inside this method.
        if not self.cfg.aggregation_stage.skip_aggregation:
            self.pipeline.add_stages(self.generate_aggregating_stage())

        # Only retrain on every ``retrain_freq``-th iteration.
        if self.stage_idx % self.cfg.machine_learning_stage.retrain_freq == 0:
            self.pipeline.add_stages(self.generate_machine_learning_stage())
        self.pipeline.add_stages(self.generate_model_selection_stage())

        agent_stage = self.generate_agent_stage()
        # Hook that decides whether another iteration gets appended.
        agent_stage.post_exec = self.func_condition
        self.pipeline.add_stages(agent_stage)

        self.stage_idx += 1

    def generate_pipelines(self) -> List[Pipeline]:
        """Append the first iteration and return the pipeline in a list."""
        self._generate_pipeline_iteration()
        return [self.pipeline]

    def _prepare_task(self, cfg, stage_api, task_idx):
        """Create the task directory and fill in the base task parameters.

        Returns the task output path created through ``stage_api``.
        """
        output_path = stage_api.task_dir(self.stage_idx, task_idx, mkdir=True)
        assert output_path is not None

        # Update base parameters
        cfg.task_config.experiment_directory = self.cfg.experiment_directory
        cfg.task_config.stage_idx = self.stage_idx
        cfg.task_config.task_idx = task_idx
        cfg.task_config.node_local_path = self.cfg.node_local_path
        cfg.task_config.output_path = output_path
        return output_path

    def _add_task(self, stage, cfg, stage_api, task_idx) -> None:
        """Dump the task YAML and add a task pointing at it to ``stage``."""
        # Write yaml configuration
        cfg_path = stage_api.config_path(self.stage_idx, task_idx)
        cfg.task_config.dump_yaml(cfg_path)
        task = generate_task(cfg)
        task.arguments += ["-c", cfg_path.as_posix()]
        stage.add_tasks(task)

    def _generate_single_task_stage(self, name, cfg, stage_api) -> Stage:
        """Build a stage named ``name`` holding exactly one task (index 0)."""
        stage = Stage()
        stage.name = name

        task_idx = 0
        self._prepare_task(cfg, stage_api, task_idx)
        self._add_task(stage, cfg, stage_api, task_idx)
        return stage

    def generate_molecular_dynamics_stage(self) -> Stage:
        """Build the MD stage with ``num_tasks`` tasks.

        On the first iteration each task is given an initial PDB file,
        cycling through the files returned by ``get_initial_pdbs``; on later
        iterations ``pdb_file`` is set to ``None``.
        """
        stage = Stage()
        stage.name = self.MOLECULAR_DYNAMICS_STAGE_NAME
        cfg = self.cfg.molecular_dynamics_stage
        stage_api = self.api.molecular_dynamics_stage

        filenames = None
        if self.stage_idx == 0:
            # Cycle so that more tasks than PDB files can be served.
            filenames = itertools.cycle(
                self.api.get_initial_pdbs(cfg.task_config.initial_pdb_dir))

        for task_idx in range(cfg.num_tasks):
            self._prepare_task(cfg, stage_api, task_idx)

            if self.stage_idx == 0:
                assert filenames is not None
                cfg.task_config.pdb_file = next(filenames)
            else:
                cfg.task_config.pdb_file = None

            self._add_task(stage, cfg, stage_api, task_idx)

        return stage

    def generate_aggregating_stage(self) -> Stage:
        """Build the single-task aggregation stage."""
        return self._generate_single_task_stage(
            self.AGGREGATION_STAGE_NAME,
            self.cfg.aggregation_stage,
            self.api.aggregation_stage,
        )

    def generate_machine_learning_stage(self) -> Stage:
        """Build the single-task machine learning stage.

        In addition to the base parameters, ``model_tag`` is set from the
        task output path and, after the first iteration,
        ``init_weights_path`` is cleared.
        """
        stage = Stage()
        stage.name = self.MACHINE_LEARNING_STAGE_NAME
        cfg = self.cfg.machine_learning_stage
        stage_api = self.api.machine_learning_stage

        task_idx = 0
        output_path = self._prepare_task(cfg, stage_api, task_idx)
        cfg.task_config.model_tag = stage_api.unique_name(output_path)
        if self.stage_idx > 0:
            # Machine learning should use model selection API
            cfg.task_config.init_weights_path = None

        self._add_task(stage, cfg, stage_api, task_idx)
        return stage

    def generate_model_selection_stage(self) -> Stage:
        """Build the single-task model selection stage."""
        return self._generate_single_task_stage(
            self.MODEL_SELECTION_STAGE_NAME,
            self.cfg.model_selection_stage,
            self.api.model_selection_stage,
        )

    def generate_agent_stage(self) -> Stage:
        """Build the single-task agent stage."""
        return self._generate_single_task_stage(
            self.AGENT_STAGE_NAME,
            self.cfg.agent_stage,
            self.api.agent_stage,
        )