Example 1
def test_filter_cli(openff_methane: Molecule, runner):

    # Create an SDF file to filter: a pentachlorophenolate sodium salt (the
    # [Na+] counter-ion should be stripped) and a fluorinated alkane that the
    # filter is expected to reject entirely.
    with stream_to_file("molecules.sdf") as writer:

        writer(
            Molecule.from_smiles("C1(=C(C(=C(C(=C1Cl)Cl)Cl)Cl)Cl)[O-].[Na+]")
        )
        writer(Molecule.from_smiles("CCC(C)(C)C(F)(F)CCCCC(F)(F)C(C)(C)CC"))

    arguments = [
        "--input", "molecules.sdf", "--output", "filtered.sdf", "--strip-ions"
    ]

    result = runner.invoke(filter_cli, arguments)

    if result.exit_code != 0:
        raise result.exception

    assert os.path.isfile("filtered.sdf")

    filtered_molecules = [*stream_from_file("filtered.sdf")]
    assert len(filtered_molecules) == 1

    filtered_molecule = filtered_molecules[0]

    assert filtered_molecule.to_smiles(
        toolkit_registry=RDKitToolkitWrapper()
    ) == "[O-][c]1[c]([Cl])[c]([Cl])[c]([Cl])[c]([Cl])[c]1[Cl]"
Example 2
def filter_cli(
    input_path: str,
    output_path: str,
    n_processes: int,
    strip_ions: bool,
):

    print(" - Filtering molecules")

    with capture_toolkit_warnings():
        with stream_to_file(output_path) as writer:
            with Pool(processes=n_processes) as pool:

                # Run the filter on a pool of worker processes; each worker
                # returns the (possibly ion-stripped) molecule together with
                # a flag indicating whether it passed the filter.
                filter_func = functools.partial(
                    apply_filter, retain_largest=strip_ions
                )

                for molecule, should_include in tqdm(
                    pool.imap(filter_func, stream_from_file(input_path))
                ):
                    if not should_include:
                        continue

                    writer(molecule)
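
`apply_filter` is defined elsewhere in the project. Judging by how it is used above, it returns a `(molecule, should_include)` pair and, when `retain_largest` is set, keeps only the largest disconnected component so counter-ions such as `[Na+]` are dropped. A rough sketch under those assumptions (the real filter presumably applies further checks, e.g. whatever rejects the fluorinated alkane in Example 1):

from openff.toolkit.topology import Molecule
from rdkit import Chem


def apply_filter(molecule: Molecule, retain_largest: bool):
    # Hypothetical re-implementation for illustration only.
    if retain_largest:
        fragments = Chem.GetMolFrags(
            molecule.to_rdkit(), asMols=True, sanitizeFrags=False
        )
        largest = max(fragments, key=lambda fragment: fragment.GetNumHeavyAtoms())
        molecule = Molecule.from_rdkit(largest, allow_undefined_stereo=True)

    return molecule, True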
Example 3
def test_enumerate_cli(openff_methane: Molecule, runner):

    # Create an SDF file to enumerate.
    buteneol = Molecule.from_smiles(r"C/C=C(/C)\O")

    with stream_to_file("molecules.sdf") as writer:

        writer(buteneol)
        writer(buteneol)

    arguments = ["--input", "molecules.sdf", "--output", "tautomers.sdf", "--tautomers"]

    result = runner.invoke(enumerate_cli, arguments)

    if result.exit_code != 0:
        raise result.exception

    assert os.path.isfile("tautomers.sdf")

    tautomers = [molecule for molecule in stream_from_file("tautomers.sdf")]
    assert len(tautomers) == 4

    assert {
        tautomer.to_smiles(
            explicit_hydrogens=False, toolkit_registry=RDKitToolkitWrapper()
        )
        for tautomer in tautomers
    } == {"C/C=C(/C)O", "C=C(O)CC", "CCC(C)=O", "CC=C(C)O"}
Example 4
def enumerate_cli(
    input_path: str,
    output_path: str,
    enumerate_tautomers: bool,
    max_tautomers: int,
    enumerate_protomers: bool,
    max_protomers: int,
    n_processes: int,
):

    print(f" - Enumerating"
          f"{' tautomers' if enumerate_tautomers else ''}"
          f"{'/' if enumerate_protomers and enumerate_tautomers else ''}"
          f"{' protomers' if enumerate_protomers else ''}")

    unique_molecules = set()

    with capture_toolkit_warnings():
        with stream_to_file(output_path) as writer:

            with Pool(processes=n_processes) as pool:

                for smiles in tqdm(
                        pool.imap(
                            functools.partial(
                                _enumerate_tautomers,
                                enumerate_tautomers=enumerate_tautomers,
                                max_tautomers=max_tautomers,
                                enumerate_protomers=enumerate_protomers,
                                max_protomers=max_protomers,
                            ),
                            stream_from_file(input_path, as_smiles=True),
                        ), ):

                    for pattern in smiles:

                        from openff.toolkit.topology import Molecule

                        molecule: Molecule = Molecule.from_smiles(
                            pattern, allow_undefined_stereo=True)

                        inchi_key = molecule.to_inchikey(fixed_hydrogens=True)

                        if inchi_key in unique_molecules:
                            continue

                        writer(molecule)
                        unique_molecules.add(inchi_key)
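
`_enumerate_tautomers` is a private helper that receives a SMILES pattern and returns an iterable of SMILES patterns. One plausible implementation builds on RDKit's `TautomerEnumerator`; everything below besides the call signature is an assumption (protomer enumeration in particular usually requires a backend such as OpenEye QUACPAC and is omitted):

from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize


def _enumerate_tautomers(
    smiles: str,
    enumerate_tautomers: bool,
    max_tautomers: int,
    enumerate_protomers: bool,
    max_protomers: int,
) -> set:
    # Hypothetical sketch: always retain the input SMILES, then add any
    # enumerated tautomers up to the requested maximum.
    found = {smiles}

    if enumerate_tautomers:
        enumerator = rdMolStandardize.TautomerEnumerator()
        enumerator.SetMaxTautomers(max_tautomers)

        for tautomer in enumerator.Enumerate(Chem.MolFromSmiles(smiles)):
            found.add(Chem.MolToSmiles(tautomer))

    # Protomer enumeration (enumerate_protomers / max_protomers) is omitted
    # from this sketch.
    return found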
Example 5
def test_read_write_streams():

    molecules = [Molecule.from_smiles("C"), Molecule.from_smiles("CO")]

    with temporary_cd():

        with stream_to_file("molecules.sdf") as writer:

            for molecule in molecules:
                writer(molecule)

        loaded_molecules = [*stream_from_file("molecules.sdf")]

    assert len(molecules) == len(loaded_molecules)
    assert {molecule.to_smiles() for molecule in molecules} == {
        molecule.to_smiles() for molecule in loaded_molecules
    }
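
`stream_to_file` and `stream_from_file` are the project's SDF streaming helpers: a context manager that yields a writer callable, and a generator over the stored molecules (optionally as SMILES, as in Examples 4 and 6). A minimal sketch of how they could be built on RDKit, assumed rather than taken from the real implementation:

import contextlib

from openff.toolkit.topology import Molecule
from rdkit import Chem


@contextlib.contextmanager
def stream_to_file(file_path: str):
    # Yield a callable that appends each OpenFF molecule to the SDF file.
    writer = Chem.SDWriter(file_path)

    try:
        yield lambda molecule: writer.write(molecule.to_rdkit())
    finally:
        writer.close()


def stream_from_file(file_path: str, as_smiles: bool = False):
    # Lazily iterate over the SDF file, skipping entries RDKit cannot parse.
    for rd_molecule in Chem.SDMolSupplier(file_path, removeHs=False):

        if rd_molecule is None:
            continue

        molecule = Molecule.from_rdkit(rd_molecule, allow_undefined_stereo=True)
        yield molecule.to_smiles() if as_smiles else molecule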
Example 6
def label_cli(
    input_path: str,
    output_path: str,
    guess_stereo: bool,
    rms_cutoff: float,
    worker_type: str,
    n_workers: int,
    batch_size: int,
    lsf_memory: int,
    lsf_walltime: str,
    lsf_queue: str,
    lsf_env: str,
):

    from dask import distributed

    root_logger: logging.Logger = logging.getLogger("nagl")
    root_logger.setLevel(logging.INFO)

    root_handler = logging.StreamHandler()
    root_handler.setFormatter(logging.Formatter("%(message)s"))
    root_logger.addHandler(root_handler)

    _logger.info("Labeling molecules")

    with capture_toolkit_warnings():

        all_smiles = [
            smiles
            for smiles in tqdm(
                stream_from_file(input_path, as_smiles=True),
                desc="loading molecules",
                ncols=80,
            )
        ]

    unique_smiles = sorted({*all_smiles})

    if len(unique_smiles) != len(all_smiles):

        _logger.warning(
            f"{len(all_smiles) - len(unique_smiles)} duplicate molecules were ignored"
        )

    # Batches are built from the de-duplicated SMILES below, so size the batch
    # count accordingly.
    n_batches = int(math.ceil(len(unique_smiles) / batch_size))

    if n_workers < 0:
        n_workers = n_batches

    if n_workers > n_batches:

        _logger.warning(
            f"More workers were requested then there are batches to compute. Only "
            f"{n_batches} workers will be requested."
        )

        n_workers = n_batches

    # Set-up dask to distribute the processing.
    if worker_type == "lsf":
        dask_cluster = setup_dask_lsf_cluster(
            n_workers, lsf_queue, lsf_memory, lsf_walltime, lsf_env
        )
    elif worker_type == "local":
        dask_cluster = setup_dask_local_cluster(n_workers)
    else:
        raise NotImplementedError()

    _logger.info(
        f"{len(unique_smiles)} molecules will labelled in {n_batches} batches across "
        f"{n_workers} workers\n"
    )

    dask_client = distributed.Client(dask_cluster)

    # Submit the tasks to be computed in chunked batches.
    def batch(iterable):
        n_iterables = len(iterable)

        for i in range(0, n_iterables, batch_size):
            yield iterable[i : min(i + batch_size, n_iterables)]

    futures = [
        dask_client.submit(
            functools.partial(
                label_molecules,
                guess_stereochemistry=guess_stereo,
                partial_charge_methods=["am1", "am1bcc"],
                bond_order_methods=["am1"],
                rms_cutoff=rms_cutoff,
            ),
            batched_molecules,
        )
        for batched_molecules in batch(unique_smiles)
    ]

    # Create a database to store the labelled molecules in and store general
    # provenance information.
    storage = MoleculeStore(output_path)

    storage.set_provenance(
        general_provenance={
            "date": datetime.now().strftime("%d-%m-%Y"),
        },
        software_provenance=get_labelling_software_provenance(),
    )

    # Save out the molecules as they are ready.
    error_file_path = output_path.replace(".sqlite", "-errors.log")

    with open(error_file_path, "w") as file:

        for future in tqdm(
            distributed.as_completed(futures, raise_errors=False),
            total=n_batches,
            desc="labelling molecules",
            ncols=80,
        ):

            for molecule_record, error in tqdm(
                future.result(),
                desc="storing batch",
                ncols=80,
            ):

                try:

                    with capture_toolkit_warnings():

                        if molecule_record is not None and error is None:
                            storage.store(molecule_record)

                except BaseException as e:

                    # ``format_exception`` returns a list of lines; join them
                    # (the ``etype`` keyword was removed in Python 3.10, so
                    # pass the arguments positionally).
                    formatted_traceback = "".join(
                        traceback.format_exception(type(e), e, e.__traceback__)
                    )
                    error = f"Could not store record: {formatted_traceback}"

                if error is not None:

                    file.write("=" * 80 + "\n")
                    file.write(error + "\n")
                    file.flush()

            future.release()

    if worker_type == "lsf":
        dask_cluster.scale(n=0)
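
`setup_dask_local_cluster` and `setup_dask_lsf_cluster` are project helpers. The local variant plausibly wraps `dask.distributed.LocalCluster`, while the LSF variant would use something like `dask_jobqueue.LSFCluster`; a sketch of the local case, with every detail assumed:

from dask import distributed


def setup_dask_local_cluster(n_workers: int) -> distributed.LocalCluster:
    # Hypothetical sketch: one single-threaded worker process per requested
    # worker, so each batch of molecules is labelled in its own process.
    return distributed.LocalCluster(
        n_workers=n_workers, threads_per_worker=1, processes=True
    )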