Example #1
    def store(self, *records: MoleculeRecord):
        """Store the molecules and their computed properties in the data store.

        Parameters
        ----------
        records
            The records to store.
        """

        with capture_toolkit_warnings():

            records_by_inchi_key: Dict[
                str, List[MoleculeRecord]] = defaultdict(list)

            for record in tqdm(records,
                               desc="grouping records to store by InChI key"):
                records_by_inchi_key[smiles_to_inchi_key(
                    record.smiles)].append(record)

            with self._get_session() as db:

                for inchi_key, inchi_records in tqdm(
                        records_by_inchi_key.items(),
                        desc="storing grouped records"):
                    self._store_records_with_inchi_key(db, inchi_key,
                                                       inchi_records)
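A hedged usage sketch (not part of the original example), assuming this method belongs to the ``MoleculeStore`` class shown in Example #7 and that the records come from ``label_molecules`` in Example #2; the ``nagl.labelling`` import path is an assumption.

from nagl.labelling import label_molecules  # assumed module path
from nagl.storage import MoleculeStore

# Label a single molecule and store the successful record in an SQLite-backed store.
storage = MoleculeStore("labelled.sqlite")

for record, error in label_molecules(
    ["CCO"],
    guess_stereochemistry=True,
    partial_charge_methods=["am1"],
    bond_order_methods=["am1"],
):
    if error is None:
        storage.store(record)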
Example #2
def label_molecules(
    molecules: List[Union[str, "Molecule"]],
    guess_stereochemistry: bool,
    partial_charge_methods: Optional[List[ChargeMethod]],
    bond_order_methods: Optional[List[WBOMethod]],
    n_conformers: int = 500,
    rms_cutoff: float = 0.05,
) -> List[Tuple[Optional[MoleculeRecord], Optional[str]]]:
    """Labels a batch of molecules using ``label_molecule``.

    Args:
        molecules: A list of the molecules (or SMILES representations of the
            molecules) to label.
        guess_stereochemistry: Whether to guess the stereochemistry of the SMILES
            representation of the molecule if provided and if the stereochemistry of
            some atoms / bonds is not fully defined.
        partial_charge_methods: The methods to use when computing partial charges. By
            default, all available methods will be used.
        bond_order_methods: The methods to use when computing bond orders. By default,
            all available methods will be used.
        n_conformers: The *maximum* number of conformers to use when computing partial
            charges and bond orders.
        rms_cutoff: The RMS cutoff [Å] to use when generating the conformers.

    Returns:
        A list of tuples of the form ``(labelled_record, error_message)``.
    """

    partial_charge_methods = (partial_charge_methods if partial_charge_methods
                              is not None else get_args(ChargeMethod))
    bond_order_methods = (bond_order_methods if bond_order_methods is not None
                          else get_args(WBOMethod))

    molecule_records = []

    with capture_toolkit_warnings():

        for molecule in tqdm(molecules, ncols=80, desc="labelling batch"):

            molecule_record = None
            error = None

            try:
                molecule_record = label_molecule(
                    molecule,
                    guess_stereochemistry,
                    partial_charge_methods,
                    bond_order_methods,
                    n_conformers,
                    rms_cutoff,
                )
            except BaseException as e:

                formatted_traceback = "".join(traceback.format_exception(
                    type(e), e, e.__traceback__))
                error = f"Failed to process {str(molecule)}: {formatted_traceback}"

            molecule_records.append((molecule_record, error))

    return molecule_records
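A hedged usage sketch for ``label_molecules``; the SMILES are illustrative and the "am1" / "am1bcc" method names are taken from Example #7.

results = label_molecules(
    ["CCO", "c1ccccc1"],
    guess_stereochemistry=True,
    partial_charge_methods=["am1", "am1bcc"],
    bond_order_methods=["am1"],
    n_conformers=500,
    rms_cutoff=0.05,
)

# Each entry is either (record, None) on success or (None, error_message) on failure.
for record, error in results:
    if error is not None:
        print(error)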
Example #3
def apply_filter(molecule: "Molecule",
                 retain_largest: bool) -> Tuple["Molecule", bool]:

    with capture_toolkit_warnings():

        try:
            from openff.toolkit.topology import Molecule
            from simtk import unit as simtk_unit

            split_smiles = molecule.to_smiles().split(".")
            n_sub_molecules = len(split_smiles)

            if retain_largest and n_sub_molecules > 1:

                largest_smiles = max(split_smiles, key=len)
                molecule = Molecule.from_smiles(largest_smiles,
                                                allow_undefined_stereo=True)

            # Retain H, C, N, O, F, P, S, Cl, Br, I
            allowed_elements = [1, 6, 7, 8, 9, 15, 16, 17, 35, 53]

            mass = sum(
                atom.mass.value_in_unit(simtk_unit.gram / simtk_unit.mole)
                for atom in molecule.atoms)

            return (
                molecule,
                (all(atom.atomic_number in allowed_elements
                     for atom in molecule.atoms) and (250.0 < mass < 350.0)
                 and (len(molecule.find_rotatable_bonds()) <= 7)),
            )

        except BaseException:
            _logger.exception("failed to apply filter")
            return molecule, False
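A hedged usage sketch for ``apply_filter``, assuming the OpenFF toolkit is installed; ``Molecule.from_smiles`` is used exactly as in the other examples.

from openff.toolkit.topology import Molecule

molecule = Molecule.from_smiles("CCO", allow_undefined_stereo=True)

# The second return value indicates whether the molecule passed the element,
# mass and rotatable-bond criteria.
filtered_molecule, should_keep = apply_filter(molecule, retain_largest=True)

if should_keep:
    print(filtered_molecule.to_smiles())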
Example #4
def filter_cli(
    input_path: str,
    output_path: str,
    n_processes: int,
    strip_ions: bool,
):

    print(" - Filtering molecules")

    with capture_toolkit_warnings():
        with stream_to_file(output_path) as writer:

            with Pool(processes=n_processes) as pool:

                for molecule, should_include in tqdm(
                        pool.imap(
                            functools.partial(apply_filter,
                                              retain_largest=strip_ions),
                            stream_from_file(input_path),
                        ), ):

                    if not should_include:
                        continue

                    writer(molecule)
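A hedged usage sketch; the file paths are hypothetical, the file format expected by ``stream_from_file`` / ``stream_to_file`` is not shown in this excerpt, and in the actual package ``filter_cli`` may be wrapped as a command-line entry point rather than called directly.

filter_cli(
    input_path="molecules.sdf",   # hypothetical input file
    output_path="filtered.sdf",   # hypothetical output file
    n_processes=4,
    strip_ions=True,
)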
Example #5
def _enumerate_tautomers(
    smiles: str,
    enumerate_tautomers: bool,
    max_tautomers: int,
    enumerate_protomers: bool,
    max_protomers: int,
) -> Set[str]:

    found_forms = {smiles}

    with capture_toolkit_warnings():

        from openff.toolkit.topology import Molecule
        from openff.toolkit.utils import (
            OpenEyeToolkitWrapper,
            RDKitToolkitWrapper,
            ToolkitRegistry,
        )

        molecule: Molecule = Molecule.from_smiles(smiles,
                                                  allow_undefined_stereo=True)

        if enumerate_tautomers:

            toolkit_registry = ToolkitRegistry(
                toolkit_precedence=[
                    RDKitToolkitWrapper, OpenEyeToolkitWrapper
                ],
                exception_if_unavailable=False,
            )

            found_forms.update(tautomer.to_smiles()
                               for tautomer in molecule.enumerate_tautomers(
                                   max_states=max_tautomers,
                                   toolkit_registry=toolkit_registry))

        if enumerate_protomers:  # pragma: no cover

            from openeye import oechem, oequacpac

            oe_molecule: oechem.OEMol = molecule.to_openeye()

            for i, oe_protomer in enumerate(
                    oequacpac.OEGetReasonableProtomers(oe_molecule)):

                if i >= max_protomers:
                    break

                found_forms.add(oechem.OEMolToSmiles(oe_protomer))

    return found_forms
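A hedged usage sketch for ``_enumerate_tautomers``; protomer enumeration is disabled here because it requires the OpenEye toolkits.

found_forms = _enumerate_tautomers(
    "c1ccc(O)cc1",
    enumerate_tautomers=True,
    max_tautomers=16,
    enumerate_protomers=False,
    max_protomers=0,
)

# The input SMILES is always included in the returned set.
print(found_forms)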
Example #6
def enumerate_cli(
    input_path: str,
    output_path: str,
    enumerate_tautomers: bool,
    max_tautomers: int,
    enumerate_protomers: bool,
    max_protomers: int,
    n_processes: int,
):

    print(f" - Enumerating"
          f"{' tautomers' if enumerate_tautomers else ''}"
          f"{'/' if enumerate_protomers and enumerate_tautomers else ''}"
          f"{' protomers' if enumerate_protomers else ''}")

    unique_molecules = set()

    with capture_toolkit_warnings():
        with stream_to_file(output_path) as writer:

            with Pool(processes=n_processes) as pool:

                for smiles in tqdm(
                        pool.imap(
                            functools.partial(
                                _enumerate_tautomers,
                                enumerate_tautomers=enumerate_tautomers,
                                max_tautomers=max_tautomers,
                                enumerate_protomers=enumerate_protomers,
                                max_protomers=max_protomers,
                            ),
                            stream_from_file(input_path, as_smiles=True),
                        ), ):

                    for pattern in smiles:

                        from openff.toolkit.topology import Molecule

                        molecule: Molecule = Molecule.from_smiles(
                            pattern, allow_undefined_stereo=True)

                        inchi_key = molecule.to_inchikey(fixed_hydrogens=True)

                        if inchi_key in unique_molecules:
                            continue

                        writer(molecule)
                        unique_molecules.add(inchi_key)
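A hedged usage sketch; as in Example #4 the file paths are hypothetical and ``enumerate_cli`` may be wrapped as a command-line entry point in the actual package.

enumerate_cli(
    input_path="filtered.sdf",     # hypothetical input file
    output_path="enumerated.sdf",  # hypothetical output file
    enumerate_tautomers=True,
    max_tautomers=16,
    enumerate_protomers=False,
    max_protomers=0,
    n_processes=4,
)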
Example #7
def label_cli(
    input_path: str,
    output_path: str,
    guess_stereo: bool,
    rms_cutoff: float,
    worker_type: str,
    n_workers: int,
    batch_size: int,
    lsf_memory: int,
    lsf_walltime: str,
    lsf_queue: str,
    lsf_env: str,
):

    from dask import distributed

    root_logger: logging.Logger = logging.getLogger("nagl")
    root_logger.setLevel(logging.INFO)

    root_handler = logging.StreamHandler()
    root_handler.setFormatter(logging.Formatter("%(message)s"))
    root_logger.addHandler(root_handler)

    _logger.info("Labeling molecules")

    with capture_toolkit_warnings():

        all_smiles = [
            smiles
            for smiles in tqdm(
                stream_from_file(input_path, as_smiles=True),
                desc="loading molecules",
                ncols=80,
            )
        ]

    unique_smiles = sorted({*all_smiles})

    if len(unique_smiles) != len(all_smiles):

        _logger.warning(
            f"{len(all_smiles) - len(unique_smiles)} duplicate molecules were ignored"
        )

    n_batches = int(math.ceil(len(unique_smiles) / batch_size))

    if n_workers < 0:
        n_workers = n_batches

    if n_workers > n_batches:

        _logger.warning(
            f"More workers were requested then there are batches to compute. Only "
            f"{n_batches} workers will be requested."
        )

        n_workers = n_batches

    # Set-up dask to distribute the processing.
    if worker_type == "lsf":
        dask_cluster = setup_dask_lsf_cluster(
            n_workers, lsf_queue, lsf_memory, lsf_walltime, lsf_env
        )
    elif worker_type == "local":
        dask_cluster = setup_dask_local_cluster(n_workers)
    else:
        raise NotImplementedError()

    _logger.info(
        f"{len(unique_smiles)} molecules will labelled in {n_batches} batches across "
        f"{n_workers} workers\n"
    )

    dask_client = distributed.Client(dask_cluster)

    # Submit the tasks to be computed in chunked batches.
    def batch(iterable):
        n_iterables = len(iterable)

        for i in range(0, n_iterables, batch_size):
            yield iterable[i : min(i + batch_size, n_iterables)]

    futures = [
        dask_client.submit(
            functools.partial(
                label_molecules,
                guess_stereochemistry=guess_stereo,
                partial_charge_methods=["am1", "am1bcc"],
                bond_order_methods=["am1"],
                rms_cutoff=rms_cutoff,
            ),
            batched_molecules,
        )
        for batched_molecules in batch(unique_smiles)
    ]

    # Create a database to store the labelled molecules in and store general
    # provenance information.
    storage = MoleculeStore(output_path)

    storage.set_provenance(
        general_provenance={
            "date": datetime.now().strftime("%d-%m-%Y"),
        },
        software_provenance=get_labelling_software_provenance(),
    )

    # Save out the molecules as they are ready.
    error_file_path = output_path.replace(".sqlite", "-errors.log")

    with open(error_file_path, "w") as file:

        for future in tqdm(
            distributed.as_completed(futures, raise_errors=False),
            total=n_batches,
            desc="labelling molecules",
            ncols=80,
        ):

            for molecule_record, error in tqdm(
                future.result(),
                desc="storing batch",
                ncols=80,
            ):

                try:

                    with capture_toolkit_warnings():

                        if molecule_record is not None and error is None:
                            storage.store(molecule_record)

                except BaseException as e:

                    formatted_traceback = "".join(
                        traceback.format_exception(type(e), e, e.__traceback__)
                    )
                    error = f"Could not store record: {formatted_traceback}"

                if error is not None:

                    file.write("=".join(["="] * 40) + "\n")
                    file.write(error + "\n")
                    file.flush()

                    continue

            future.release()

    if worker_type == "lsf":
        dask_cluster.scale(n=0)
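A hedged usage sketch using the "local" worker type, so the LSF-specific arguments are only placeholders; as with the other CLI functions, ``label_cli`` may be wrapped as a command-line entry point in the actual package.

label_cli(
    input_path="enumerated.sdf",    # hypothetical input file
    output_path="labelled.sqlite",  # the ``.sqlite`` suffix matches the error-log handling above
    guess_stereo=True,
    rms_cutoff=0.05,
    worker_type="local",
    n_workers=4,
    batch_size=250,
    lsf_memory=4,         # ignored for the "local" worker type
    lsf_walltime="02:00",
    lsf_queue="default",
    lsf_env="nagl",
)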
Example #8
    @classmethod
    def from_molecule_stores(
        cls: Type["DGLMoleculeDataset"],
        molecule_stores: Union["MoleculeStore", Collection["MoleculeStore"]],
        partial_charge_method: Optional["ChargeMethod"],
        bond_order_method: Optional["WBOMethod"],
        atom_features: List[AtomFeature],
        bond_features: List[BondFeature],
        molecule_to_dgl: Optional[MoleculeToDGLFunc] = None,
    ) -> "DGLMoleculeDataset":
        """Creates a data set from a specified set of labelled molecule stores.

        Args:
            molecule_stores: The molecule stores which contain the pre-labelled
                molecules.
            partial_charge_method: The partial charge method to label each atom using.
                If ``None``, atoms won't be labelled with partial charges.
            bond_order_method: The Wiberg bond order method to label each bond using.
                If ``None``, bonds won't be labelled with WBOs.
            atom_features: The atom features to compute for each molecule.
            bond_features: The bond features to compute for each molecule.
            molecule_to_dgl: An (optional) callable to use when converting an OpenFF
                ``Molecule`` object to a ``DGLMolecule`` object. By default, the
                ``DGLMolecule.from_openff`` class method is used.
        """

        from openff.toolkit.topology import Molecule
        from simtk import unit

        from nagl.storage import MoleculeStore

        assert partial_charge_method is not None or bond_order_method is not None, (
            "at least one of ``partial_charge_method`` and ``bond_order_method`` "
            "must not be ``None``.")

        if isinstance(molecule_stores, MoleculeStore):
            molecule_stores = [molecule_stores]

        stored_records = list(
            record for molecule_store in molecule_stores
            for record in molecule_store.retrieve(
                [] if partial_charge_method is None else partial_charge_method,
                [] if bond_order_method is None else bond_order_method,
            ))

        entries = []

        for record in tqdm(stored_records, desc="featurizing molecules"):

            with capture_toolkit_warnings():

                molecule: Molecule = Molecule.from_mapped_smiles(
                    record.smiles, allow_undefined_stereo=True)

            if partial_charge_method is not None:

                molecule.partial_charges = (numpy.array(
                    record.average_partial_charges(partial_charge_method)) *
                                            unit.elementary_charge)

            if bond_order_method is not None:

                bond_order_value_tuples = [
                    value_tuple for conformer in record.conformers
                    for bond_order_set in conformer.bond_orders
                    if bond_order_set.method == bond_order_method
                    for value_tuple in bond_order_set.values
                ]

                bond_orders = defaultdict(list)

                for index_a, index_b, value in bond_order_value_tuples:
                    bond_orders[tuple(sorted([index_a,
                                              index_b]))].append(value)

                for bond in molecule.bonds:

                    bond.fractional_bond_order = numpy.mean(bond_orders[tuple(
                        sorted([bond.atom1_index, bond.atom2_index]))])

            entries.append(
                cls._build_entry(
                    molecule,
                    atom_features,
                    bond_features,
                    functools.partial(
                        cls._labelled_molecule_to_dict,
                        partial_charge_method=partial_charge_method,
                        bond_order_method=bond_order_method,
                    ),
                    molecule_to_dgl,
                ))

        return cls(entries)
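A hedged usage sketch; the feature classes imported from ``nagl.features`` (``AtomicElement``, ``AtomConnectivity``, ``BondIsInRing``) and their constructors are assumptions and may differ in the actual package, as may the import path of ``DGLMoleculeDataset``.

from nagl.datasets import DGLMoleculeDataset  # assumed module path
from nagl.features import AtomicElement, AtomConnectivity, BondIsInRing  # assumed names
from nagl.storage import MoleculeStore

dataset = DGLMoleculeDataset.from_molecule_stores(
    molecule_stores=MoleculeStore("labelled.sqlite"),
    partial_charge_method="am1",
    bond_order_method="am1",
    atom_features=[AtomicElement(["C", "H", "O", "N"]), AtomConnectivity()],  # assumed constructors
    bond_features=[BondIsInRing()],
)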